# Feature Engineering Enhancement
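This notebook improves data preprocessing in three steps: missing-value imputation (`smart_fillna`), WOE binning with `scorecardpy`, and IV-based variable selection. The WOE-transformed data and the bin definitions are saved at the end.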

In [9]:
# Imports
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scorecardpy as sc
import seaborn as sns

## 1. Load Data

In [10]:
# Read the training CSV (its first column becomes the row index) and turn
# every -999 into NaN in one chained expression.
# NOTE(review): -999 is presumably a missing-value sentinel — confirm with the data source.
df = pd.read_csv('../data/cs-training.csv', index_col=0).replace(-999, np.nan)
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


## 2. Smart Fillna Function

In [11]:
def smart_fillna(df, method='median', exclude=None):
    """Return a copy of ``df`` with missing values imputed column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    method : {'median', 'zero'}, default 'median'
        ``'median'`` fills each column with that column's own median; columns
        for which a median cannot be computed (e.g. text columns) are left
        untouched instead of raising. ``'zero'`` fills every column with 0.
    exclude : str or iterable of str, optional
        Column name(s) that must not be imputed (for example the target).

    Returns
    -------
    pandas.DataFrame
        The imputed copy.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies; previously an
        unknown method silently returned the frame un-imputed.
    """
    if method not in ('median', 'zero'):
        raise ValueError(
            f"Unknown fill method {method!r}; expected 'median' or 'zero'."
        )

    if exclude is None:
        skipped = set()
    elif isinstance(exclude, str):
        skipped = {exclude}
    else:
        skipped = set(exclude)

    filled = df.copy()
    for col in filled.columns:
        if col in skipped or not filled[col].isnull().any():
            continue
        if method == 'zero':
            filled[col] = filled[col].fillna(0)
            continue
        try:
            fill_value = filled[col].median()
        except (TypeError, ValueError):
            # No median exists for this dtype; leave the gaps as they are.
            continue
        filled[col] = filled[col].fillna(fill_value)
    return filled

# Impute the loaded frame with the per-column median strategy, named explicitly.
df_filled = smart_fillna(df, method='median')

## 3. WOE Binning & IV Analysis

In [12]:
# Fit WOE bins against the target, then apply them to the imputed frame.
bins = sc.woebin(df_filled, y='SeriousDlqin2yrs')
woe_df = sc.woebin_ply(df_filled, bins)

# Collect one information value per variable from the first `total_iv` entry
# of its binning table, then rank variables from most to least informative.
# NOTE(review): assumes `total_iv` is constant within a table — verify against scorecardpy.
iv_df = pd.DataFrame(
    [
        {'variable': var, 'info_value': table['total_iv'].iloc[0]}
        for var, table in bins.items()
    ]
).sort_values(by='info_value', ascending=False)

# Only the last expression of a cell is rendered, so show the ranked table once.
iv_df[['variable', 'info_value']]

[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  import pkg_resources
  import pkg_resources
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
Length: 150000
Categories (46, object): ['[-inf,2.0)' < '[2.0,22.0)' < '[22.0,24.0)' < '[24.0,26.0)' < ... <
                          '[102.0,104.0)' < '[104.0,106.0)' < '[106.0,108.0)' < '[108.0,inf)']' has dtype incompatible with category, please explicitly cast to a compatible dtype first.
  dtm.loc[:,'bin'] = pd.cut(dtm['value'], bstbrks, right=False, labels=labels)
  binning = dtm.groupby(['variable','bin'], group_keys=False)['y'].agg([n0, n1])\
  .agg({'good':sum, 'bad':sum}

[INFO] converting into woe values ...


  import pkg_resources
  import pkg_resources


Unnamed: 0,variable,info_value
5,RevolvingUtilizationOfUnsecuredLines,1.073162
1,NumberOfTimes90DaysLate,0.837551
3,NumberOfTime30-59DaysPastDueNotWorse,0.740481
9,NumberOfTime60-89DaysPastDueNotWorse,0.572373
2,age,0.246913
7,NumberOfOpenCreditLinesAndLoans,0.075373
0,DebtRatio,0.070047
6,MonthlyIncome,0.069052
4,NumberRealEstateLoansOrLines,0.055354
8,NumberOfDependents,0.033818


## 4. Variable Selection by IV Threshold

In [14]:
# Keep the variables whose information value exceeds 0.02, then pick their
# WOE-encoded columns together with the target.
iv_mask = iv_df['info_value'] > 0.02
selected_vars = iv_df.loc[iv_mask, 'variable'].tolist()
woe_cols = ['{}_woe'.format(var) for var in selected_vars]
woe_selected = woe_df[[*woe_cols, 'SeriousDlqin2yrs']]
woe_selected.head()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines_woe,NumberOfTimes90DaysLate_woe,NumberOfTime30-59DaysPastDueNotWorse_woe,NumberOfTime60-89DaysPastDueNotWorse_woe,age_woe,NumberOfOpenCreditLinesAndLoans_woe,DebtRatio_woe,MonthlyIncome_woe,NumberRealEstateLoansOrLines_woe,NumberOfDependents_woe,SeriousDlqin2yrs
1,0.578018,-0.389724,1.901126,-0.288208,0.100377,-0.128983,0.630507,-0.271599,0.253629,0.209307,1
2,1.355364,-0.389724,-0.541721,-0.288208,0.449516,-0.128983,-0.135931,0.292569,0.23597,0.102578,0
3,0.578018,2.298734,0.903654,-0.288208,0.449516,0.727837,-0.135931,0.292569,0.23597,-0.15006,0
4,-0.40703,-0.389724,-0.541721,-0.288208,0.449516,-0.128983,-0.135931,0.292569,0.23597,-0.15006,0
5,1.355364,-0.389724,0.903654,-0.288208,0.100377,-0.128983,-0.135931,-0.459905,-0.256641,-0.15006,0


## 5. Save Transformed Data and Bins

In [16]:
# Write the WOE-encoded training frame to CSV (row index is not written).
os.makedirs('../outputs', exist_ok=True)
woe_selected.to_csv('../outputs/woe_train_data.csv', index=False)

# Serialise each binning table as {column name -> list of column values}.
# `json` is imported in the notebook's top import cell.
bins_dict = {var: table.to_dict(orient="list") for var, table in bins.items()}
with open('../outputs/woe_bins.json', 'w') as f:
    json.dump(bins_dict, f)
print('Files saved.')

Files saved.
