# Step05 - Scoring

In [None]:
from pyspark.ml.feature import Bucketizer

### read scoring dataset

In [None]:
# Scoring configuration: label column name, model tag used in file names,
# and the primary-key column(s).
target = 'target'
model = 'xgboost'
pk_lst = ['id']

# Load the scoring parquet and make sure the label column is called 'target'.
oot_sdf = (
    spark.read
         .parquet(path + 'data' + model + '.parquet')
         .withColumnRenamed(target, 'target')
)

print("number of cols:", len(oot_sdf.columns))

# One-row sanity summary: number of positives, number of rows, and their ratio.
tot_pos = F.sum(F.col('target'))
tot_rows = F.count('*')
oot_sdf.agg(
    tot_pos.alias('tot pos'),
    tot_rows.alias('tot rows'),
    (tot_pos / tot_rows).alias('target rate'),
).show()

# Collect to the driver as pandas; the models below are scored on pandas/numpy data.
oot = oot_sdf.toPandas()
print("scoring data converted into pandas")
oot.head()

### Load pre-requisite files (list of features, decile cutoffs, and optimal threshold)

In [None]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising.
# Only load artifacts written by the trusted training step.

# =============================
# A. list of model features
# =============================
filename = (path + model + 'feat_lst')
with open(filename, 'rb') as fh:   # context manager closes the handle after loading
    lst = pickle.load(fh)
print("list of features loaded")
print(lst)

# ===========================================
# B. raw prob cutoffs to assign deciles
# ===========================================
# Stored under its own name: the Bucketizer cell reads `cutoffs`, and `lst`
# must keep holding the feature names for the scoring cell.
filename = (path + model + '_train_decile_cutoffs')
with open(filename, 'rb') as fh:
    cutoffs = pickle.load(fh)
print("cutoffs loaded")

# ===========================================
# C. optimal prob threshold for classification
# ===========================================
# Hard-coded value, compared against the raw probability when assigning pclass.
pr_thresh = 0.54212

### Load models as pickle files

In [None]:
# ------------------------------------------------------------------
# Load the two saved artifacts via load_model:
#   * the uncalibrated classifier, stored under the model name
#   * the calibrated model, stored under the model name + '_platt'
# ------------------------------------------------------------------
print("loading uncalibrated model")
saved_clf = load_model(name = model, path = path)

print("loading calibrated model")
saved_clf_cal = load_model(name = model + '_platt', path = path)

### dataframe for scoring

In [None]:
# Raw positive-class probability from the uncalibrated model. Computed once
# and reused as the input to the calibrator instead of scoring twice.
praw = saved_clf.predict_proba(oot[lst].values)[:, 1]

# Calibrated score: the calibrator is applied to the raw probabilities.
pcal = saved_clf_cal.predict(praw)

# Assemble the scored frame: primary key(s), raw prob, calibrated prob, label.
# Uses pk_lst, the primary-key list defined in the configuration cell.
scrn_data = pd.concat([pd.DataFrame(oot[pk_lst]),
                       pd.DataFrame(praw, columns=['praw']),
                       pd.DataFrame(pcal, columns=['pcal']),
                       pd.DataFrame(oot['target'], columns=['target'])], axis=1)

#predicted class assignment based on optimal threshold (applied to the raw probability)
scrn_data['pclass'] = np.where(scrn_data['praw'] > pr_thresh, 1, 0)
print("out-time predictions complete")
scrn_data.head()

### assign deciles based on training set

In [None]:
# Load the scored parquet as a Spark DataFrame.
# NOTE(review): this rebinds `scrn_data`, replacing the pandas frame built in the
# previous cell; nothing visible in this notebook writes 'scored_file.parquet' — confirm its source.
scrn_data = spark.read.parquet(f"{path}scored_file.parquet")

In [None]:
# Bucketizer emits bucket indices 0..9; convert them to decile labels 10..1
# (bucket 0 -> decile 10, ..., bucket 9 -> decile 1).
decile_map = {bucket: 10 - bucket for bucket in range(10)}

# Bucket the raw probability ('praw') using the cutoffs saved from training.
# The keyword is `outputCol` (the original `outoutCol` is not a Bucketizer parameter).
# assumes `cutoffs` is the ascending list of split points loaded earlier — TODO confirm
bucketizer = Bucketizer(splits = cutoffs, inputCol = 'praw', outputCol = 'buckets')   #raw prob on imbalanced set
# handleInvalid='keep' retains rows with invalid values instead of raising an error.
scrn_data_n = bucketizer.setHandleInvalid('keep').transform(scrn_data)

#map to training decile cutoffs
pdf = scrn_data_n.toPandas()
# Bucket values that are not keys of decile_map become NaN after the map.
pdf['buckets'] = pdf['buckets'].map(decile_map)
pdf.head()

## ===================================
## Stats 
## ===================================

In [None]:
# Per-bucket summary of the scored data: observed event rate, number of events,
# mean / min / max calibrated probability, and row count.
bucket_aggs = [
    F.mean('target').alias('actual'),
    F.sum('target').alias('tot events'),
    F.mean('pcal').alias('cal_prob'),
    F.min('pcal').alias('min_cal_prob'),
    F.max('pcal').alias('max_cal_prob'),
    F.count(F.lit(1)).alias('count'),
]
df = scrn_data_n.groupBy('buckets').agg(*bucket_aggs)

# Sort on the raw bucket index (descending) before relabelling it as a decile.
pdf = df.toPandas().sort_values(by = 'buckets', ascending = False)
pdf['buckets'] = pdf['buckets'].map(decile_map)
pdf.head(11)

In [None]:
# Same summary as the per-bucket table, additionally split by predicted class (pclass).
bucket_pclass_aggs = [
    F.mean('target').alias('actual'),
    F.sum('target').alias('tot events'),
    F.mean('pcal').alias('cal_prob'),
    F.min('pcal').alias('min_cal_prob'),
    F.max('pcal').alias('max_cal_prob'),
    F.count(F.lit(1)).alias('count'),
]
df = scrn_data_n.groupBy('buckets', 'pclass').agg(*bucket_pclass_aggs)

# Sort on the raw bucket index (descending) before relabelling it as a decile.
pdf = df.toPandas().sort_values(by = 'buckets', ascending = False)
pdf['buckets'] = pdf['buckets'].map(decile_map)
pdf.head(11)

In [None]:
# Wide view of the previous table: one row per 'buckets' value, one column per
# 'pclass' value, cells holding the row count.
pdf.pivot(
    index = 'buckets',
    columns = 'pclass',
    values = ['count'],
)