# Step 03: Feature selection using IV

In [None]:
# Load the in-sample training set for one segment and print basic stats.

s = 'seg4'

trn_sdf = spark.read.parquet(path + s + 'in_smpl_trainset.parquet')

print("number of obs:", trn_sdf.count())
print("number of cols:", len(trn_sdf.columns))

# Per-week summary: number of positives, number of rows and target rate,
# ordered by week_n and collected to pandas for printing.
# The chain is wrapped in parentheses rather than continued with a
# backslash: a backslash followed by trailing whitespace is a SyntaxError.
stats = (
    trn_sdf.groupBy(['week_n'])
    .agg(F.sum(F.col('target')).alias('tot pos'),
         F.count('*').alias('tot rows'),
         (F.sum(F.col('target')) / F.count('*')).alias('target rate'))
    .orderBy('week_n')
    .toPandas()
)
print(stats)

## map procedures

In [None]:
# Desired column order for the training set. When left empty, the
# DataFrame's existing column order is kept (see the fallback below).
cols_order = []

# --------------------------------
# procedures
# --------------------------------
proc_df = pd.read_csv(filepath + 'proc_map.csv')
lst = proc_df['proc'].values.tolist()
# identity mapping: every procedure listed in proc_map.csv maps to itself
dict_proc = dict(zip(lst, lst))

#-------------------------
# map of training set
# ------------------------
# create_map takes alternating key/value literals, hence the flattened items
mapping_expr = create_map([lit(x) for x in chain(*dict_proc.items())])
trn_sdf = trn_sdf.withColumn('proc_desc_n', mapping_expr[trn_sdf['proc_desc']])
# null results (presumably procedures missing from the map) become 'OTHER'
trn_sdf = trn_sdf.na.fill('OTHER', ['proc_desc_n'])
trn_sdf = trn_sdf.drop('proc_desc')

#reorder cols
# select() with no columns would return a DataFrame without any columns and
# break every later step that needs 'target', so fall back to the current
# column order when no explicit order was provided.
if not cols_order:
    cols_order = trn_sdf.columns
trn2 = trn_sdf.select(*cols_order)

# stats
print("number of obs:", trn_sdf.count())
print("number of cols:", len(trn_sdf.columns))

## Take a sample in order to run IV

In [None]:
# Stratified sample on the label: keep all rows with target == 1 (minority)
# and roughly 5% of rows with target == 0 (majority); fixed seed for
# repeatability.
sdf_smpl = trn2.sampleBy(
    'target',
    fractions={1: 1.0, 0: 0.05},
    seed=542,
)

# class counts after sampling
sdf_smpl.groupBy('target').count().show()

In [None]:
# Collect the sampled Spark DataFrame into a pandas DataFrame and
# preview its first five rows.
df_smpl = sdf_smpl.toPandas()
df_smpl.head(n=5)

## Categorize data types

In [None]:
# Split the columns of the pandas sample into numeric and categorical
# feature lists. The sample lives in `df_smpl`; `df` is not defined at this
# point in the notebook, so every lookup below uses `df_smpl`.

# ---------------------------
# numeric
# ---------------------------
numeric_dtypes = ['int32', 'int64', 'float32', 'float64']
# 'call_ind' is excluded from the numeric features; filtering (rather than
# list.remove) does not raise when the column is absent from the sample.
num_vars = [c for c in df_smpl.select_dtypes(include=numeric_dtypes).columns
            if c != 'call_ind']
print('There are', len(num_vars), 'numeric features in the list')
print(num_vars)
print()

# -------------------
# categorical
# -------------------
cat_vars = list(df_smpl.select_dtypes(include=['object']).columns)

# -----------------
# full list
# ------------------
full_lst = list(df_smpl.columns)
print("there are", len(full_lst), "total feat in the list")

# ---------------------
# prepare all cols
# ----------------------
final_vars = num_vars + cat_vars
print("there are", len(final_vars))

## Run information value (IV)

In [None]:
# Turn the pandas sample back into a Spark DataFrame for the WOE step
sdf = spark.createDataFrame(df_smpl)

# Run the WOE transformation on the sample, with 'target' as the label.
# NOTE(review): fit_woe_on_training is defined outside this notebook section;
# presumably it writes its result tables under `filepath` - confirm.
fit_woe_on_training(path=filepath, df=sdf, target='target', label='train')

## download IV (export the fitted WOE tables to CSV)

In [None]:
# Load the pickled list of WOE tables and write them, one after another and
# separated by a blank line, into a single CSV file.
filename = (filepath + 'train_woe_tbl_FIT')
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this pipeline.
with open(filename, 'rb') as pkl_file:
    woe_lst = pickle.load(pkl_file)

# Opening once in 'w' mode truncates the output of any previous run, which
# replaces the remove-then-append sequence and avoids reopening the file for
# every table.
with open(filepath + 'train_woe_FIT.csv', 'w') as f:
    for woe_tbl in woe_lst:
        tables_b = pd.DataFrame.from_dict(woe_tbl, orient='columns')
        tables_b.to_csv(f)
        f.write("\n")
print("CSV file complete")

## get selected variables from IV

In [None]:
# Load the pickled IV tables, stack them into one frame ranked by IV, save
# the ranking, and keep the variables flagged as predictive.
filename = (filepath + 'train_iv_tbl_FIT')
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this pipeline.
with open(filename, 'rb') as pkl_file:
    iv_lst = pickle.load(pkl_file)

# Build every frame first and concatenate once (concatenating inside the
# loop re-copies the accumulated frame on each iteration). The list is
# reversed because the loop version prepended each new table.
iv_frames = [pd.DataFrame.from_dict(iv_tbl, orient='columns') for iv_tbl in iv_lst]
hold_df = pd.concat(iv_frames[::-1], axis=0)

hold_df = hold_df.sort_values(by=['IV'], ascending=False)
hold_df.to_csv(filepath + 'IV.csv')

#filter on predicted IV only
hold_df = hold_df[hold_df['predictive_ind'] == 1]
cols_keep = hold_df['varname'].values.tolist()
# the label always comes first in the list of columns to keep
cols_keep.insert(0, 'target')
print('there are', len(hold_df), 'predictive variables based on IV>= 0.02')
print(cols_keep)

## variance inflation factor

In [None]:
# Only the numeric features of the pandas sample go into the VIF check
df_to_check_vif = df_smpl[num_vars]
cols = list(df_to_check_vif.columns)
print('there are', len(cols), 'numeric cols to process')

#run VIF
# NOTE(review): calculate_vif is defined outside this notebook section;
# thresh=5 with drop_cols=False presumably reports without dropping - confirm.
df_vif = calculate_vif(
    path=filepath,
    name='eda',
    df=df_to_check_vif,
    drop_cols=False,
    thresh=5,
)

In [None]:
# Read the VIF results file (presumably written by the VIF step) and
# preview its first 30 rows.
vif_file = pd.read_csv(filepath + 'eda_VIF.csv')
vif_file.head(n=30)

## apply selected attributes and save datasets

In [None]:
# add PK to list
pk_lst = ['week_n',
          'prov_tin']

# Append only the keys that are not already in the list: re-running this
# cell, or a key that was itself selected by IV, would otherwise duplicate
# column names in the later select.
missing_pk = [pk for pk in pk_lst if pk not in cols_keep]
cols_keep.extend(missing_pk)
print(cols_keep)
print(len(cols_keep))

In [None]:
# Apply the feature selection to the full training set and save it.

# variables with high VIF
VIF_drop = ['pct_paid', 'clm_allow']

#training
# `trn_sdf` is the mapped training DataFrame (`trn.sdf` was a typo that
# raised NameError). An empty cols_order falls back to the existing column
# order, because select() without columns would return no columns at all.
trn_n = trn_sdf.select(*(cols_order or trn_sdf.columns))
trn_n = trn_n.select(*cols_keep)
trn_n = trn_n.drop(*VIF_drop)

#write parquet
# partitioned by week_n; an existing output at this path is overwritten
trn_n.write.partitionBy('week_n').mode('overwrite').parquet(path + 'train_feat_sel.parquet')

In [None]:
# Row and column counts of the feature-selected training set
n_obs = trn_n.count()
n_cols = len(trn_n.columns)
print("number of obs:", n_obs)
print("number of cols:", n_cols)