# Step01: Data split

In [None]:
# helper functions from WF MDD (split datasets, VIF, WOE, mono bin)

def mono_bin(temp_df, feature, target, n=10):
    """Bin a numeric feature into quantile buckets that are monotonic in the target.

    Starting from ``n`` quantile buckets, the bucket count is lowered by one
    per attempt until the absolute Spearman correlation between the per-bucket
    mean of ``feature`` and the per-bucket mean of ``target`` reaches
    ``custom_rho``, or until fewer than two buckets would remain.

    Every attempt is fitted on the original ``temp_df``; re-binning an already
    binned frame would fail because the ``'buckets'`` column already exists.

    Args:
        temp_df: Spark DataFrame holding the ``feature`` and ``target`` columns.
        feature: Name of the numeric column to discretize.
        target: Name of the target column.
        n: Number of quantile buckets to try first.

    Returns:
        ``temp_df`` with an added ``'buckets'`` column from the last bucketing
        that could be fitted. If no bucketing could be fitted at all (this
        includes ``n < 2``), ``temp_df`` is returned unchanged, i.e. without
        a ``'buckets'`` column.
    """
    # Imported locally so the correlation check keeps working even when the
    # module-level name `stats` has been rebound (other cells of this notebook
    # assign a Spark DataFrame to `stats`).
    from scipy.stats import spearmanr

    # rho for spearman correlation
    custom_rho = 1
    r = 0
    binned_df = None

    # A NaN correlation also ends the loop, because `nan < custom_rho` is False.
    while np.abs(r) < custom_rho and n > 1:
        try:
            # quantile discretizer cuts data into bins of (roughly) equal size;
            # handleInvalid='keep' puts NaN values into their own extra bucket
            qds = QuantileDiscretizer(numBuckets=n, inputCol=feature, outputCol='buckets',
                                      relativeError=0.01, handleInvalid='keep')
            candidate = qds.fit(temp_df).transform(temp_df)
            binned_df = candidate

            # per-bucket averages are small enough to be handled in pandas
            corr_df = candidate.groupby('buckets').agg({feature: 'avg', target: 'avg'}).toPandas()
            corr_df.columns = ['buckets', feature, target]
            r, _ = spearmanr(corr_df[feature], corr_df[target])
        except Exception as e:
            # best effort: report the failure and retry with one bucket less
            print("mono_bin:", feature, "with", n, "buckets failed:", e)
        n = n - 1

    return temp_df if binned_df is None else binned_df
    

    
# transform in WOE
def fit_woe_on_training(path, df, target, label, max_bin=10):
    """Fit WOE and IV tables for every predictor and pickle them.

    Iterates over the module-level ``final_vars`` (skipping ``target``).
    Features listed in the module-level ``num_vars`` are first bucketed with
    ``mono_bin`` and grouped by bucket; all other features are grouped by
    their own values. The per-group counts and event sums are passed to
    ``calculate_woe`` (defined outside this function), and the resulting
    table plus a one-row IV summary are collected per feature.

    Args:
        path: Prefix prepended to both output file names.
        df: Spark DataFrame containing the predictors and ``target``.
        target: Name of the target column; it is counted and summed per group.
        label: Tag inserted into both output file names.
        max_bin: Initial bucket count handed to ``mono_bin`` for numeric
            features. Defaults to 10, the same default ``mono_bin`` uses;
            values below 2 make ``mono_bin`` skip binning altogether.

    Returns:
        None. Pickles the list of WOE tables to
        ``path + label + '_woe_tbl_FIT'`` and the list of IV tables to
        ``path + label + '_iv_tbl_FIT'`` once all features are processed.
    """
    lst_tbl, lst_iv = [], []

    for feature in final_vars:
        print("feature is:", feature)
        # the target column itself is not a predictor
        if feature == target:
            continue

        is_numeric = feature in num_vars
        temp_df = df.select([feature, target])   # spark

        if is_numeric:
            # perform monotonic binning and group by the resulting bucket
            temp_df = mono_bin(temp_df, feature, target, n=max_bin)
            group_col = 'buckets'
        else:
            # categorical: every distinct value is its own group
            group_col = feature
        grouped = temp_df.groupby(group_col)

        # One aggregation instead of four: a single Spark job, and all
        # per-group figures are guaranteed to be on the same row.
        aggregations = [F.count(target).alias('count'), F.sum(target).alias('event')]
        if is_numeric:
            aggregations += [F.min(feature).alias('min'), F.max(feature).alias('max')]
        summary = grouped.agg(*aggregations).toPandas()

        # count and event value for each group
        count_df = summary[[group_col, 'count']].copy()
        event_df = summary[[group_col, 'event']].copy()

        # store min/max for variables
        if is_numeric:
            min_value = summary['min']
            max_value = summary['max']
        else:
            min_value = count_df[feature]
            max_value = count_df[feature]

        # calculate WOE and IV
        temp_woe_df = calculate_woe(count_df, event_df, min_value, max_value, feature)

        # sort by min value and number the bins in increasing order
        temp_woe_df = temp_woe_df.sort_values(by='min_value').reset_index(drop=True)
        temp_woe_df = temp_woe_df.reset_index().rename(columns={'index': 'bin'})

        # mapping table between bin number and WOE; bins without a min value get -999
        temp_woe_df['bin_adjust'] = np.where(temp_woe_df['min_value'].isna(), -999, temp_woe_df['bin'])

        # separate IV dataset; IV >= 0.02 is flagged as predictive
        iv = pd.DataFrame({'IV': temp_woe_df.groupby('varname').tot_iv.max()})
        iv['predictive_ind'] = np.where(iv['IV'] >= 0.02, 1, 0)
        iv = iv.reset_index()

        lst_tbl.append(temp_woe_df.to_dict())
        lst_iv.append(iv.to_dict())

    # ---------------------------------
    # save the collected tables once, closing each file after writing
    # ---------------------------------
    filename = (path + label + '_woe_tbl_FIT')
    with open(filename, 'wb') as fh:
        pickle.dump(lst_tbl, fh)
    print("WOE table saved")

    filename = (path + label + '_iv_tbl_FIT')
    with open(filename, 'wb') as fh:
        pickle.dump(lst_iv, fh)
    print("IV table saved")

## Read Hive

In [None]:
# Load the source table and keep weeks 18 and later.
phys = spark.sql(""" select * from table """)
phys = phys.filter(F.col('week_n') >= 18)
phys.createOrReplaceTempView('phys')

print("number of obs:", phys.count())
print("number of cols:", len(phys.columns))

# Weekly target summary: positives, row count and target rate per week.
# The chain is wrapped in parentheses because a backslash followed by a
# space is not a valid line continuation.
# NOTE(review): `stats` shadows any module imported under that name (e.g.
# scipy.stats) for code that runs after this cell - consider renaming.
stats = (
    phys.groupBy(['week_n'])
    .agg(F.sum(F.col('target')).alias('tot pos'),
         F.count('*').alias('tot rows'),
         (F.sum(F.col('target')) / F.count('*')).alias('target rate'))
    .orderBy('week_n')
)

df = stats.toPandas()

# fill missing values with 0 (done after the weekly summary above)
phys = phys.na.fill(value=0)

## Split into segments

In [None]:
# Hospital segment: hospital rows that were not auto-adjudicated
# (hospital_ind == 1 and auto_adj == 0).
df_hosp_manual = phys.filter( (F.col('hospital_ind') == 1) & (F.col('auto_adj') == 0) )
print("Hospital manual")

# save parquet - write the frame built above (the undefined name `df_hosp`
# was referenced here before)
# NOTE(review): the output name 'segment1' does not match the
# path + 'seg1' + '.parquet' location read in the train/out-time cell - confirm.
df_hosp_manual.write.partitionBy('week_n').mode('overwrite').parquet(path + 'segment1')

## Split into train/out-time

In [None]:
seg_lab = ['seg1', 'seg2', 'seg3', 'seg4']

# For every segment: load its parquet, carve out the week ranges and
# persist the training cohort partitioned by week.
for s in seg_lab:
    print(s)
    seg = spark.read.parquet(path + s + '.parquet')

    # weeks 18-27 (inclusive) go to `train`, weeks 28-30 (inclusive) to `val`;
    # only `train` is written out below
    train = seg.filter(F.col('week_n').between(18, 27))
    val = seg.filter(F.col('week_n').between(28, 30))

    print("training")
    print("number of obs:", train.count())
    print("number of cols:", len(train.columns), end="\n\n")

    # save parquet
    (
        train.write
        .partitionBy('week_n')
        .mode('overwrite')
        .parquet(path + s + 'train_cohort.parquet')
    )
    print("training has been saved")

## Split into in-time train/val

In [None]:
seg_lab = ['seg1', 'seg2', 'seg3', 'seg4']

# Split each segment's saved training cohort 80/20 via split_datasets
# (helper defined outside this section).
for s in seg_lab:
    print(s)

    # reload the training cohort saved for this segment
    seg = spark.read.parquet(path + s + 'train_cohort.parquet')

    split_datasets(sdf_input=seg, path=path, train_size=0.80, test_size=0.20, s=s)
    print()

## Read datasets

In [None]:
seg_lab = ['seg1', 'seg2', 'seg3', 'seg4']

# Report size and weekly target rate of each segment's in-sample training set.
for s in seg_lab:
    print("***************************")
    print(s)
    # --------------------------
    # training
    # --------------------------
    trn_sdf = spark.read.parquet(path + s + 'in_smpl_trainset.parquet')
    print("number of obs:", trn_sdf.count())
    print("number of cols:", len(trn_sdf.columns))

    # Summarise the training set that was just loaded (the full `phys` table
    # was summarised here before, which printed the same figures for every
    # segment). Parentheses replace the invalid backslash-space continuation.
    # NOTE(review): `stats` shadows any module imported under that name.
    stats = (
        trn_sdf.groupBy(['week_n'])
        .agg(F.sum(F.col('target')).alias('tot pos'),
             F.count('*').alias('tot rows'),
             (F.sum(F.col('target')) / F.count('*')).alias('target rate'))
        .orderBy('week_n')
        .toPandas()
    )
    print(stats)
    print()