In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the processed training and validation loan files.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the stderr captured under this cell points at both read_csv
# calls, which looks like the mixed-dtype DtypeWarning - confirm, and ideally
# pass an explicit dtype= for the offending columns.
df_train = pd.read_csv('./../data/processed/train_2015_2016.csv', low_memory=False)
df_valid = pd.read_csv('./../data/processed/valid_2017.csv', low_memory=False)

print(f"Training: {len(df_train)} loans")
print(f"Validation: {len(df_valid)} loans")

# Numeric columns that get binned and examined with WoE/IV further down.
features_to_bin = [
    'fico_range_low',
    'dti',
    'int_rate',
    'annual_inc',
    'loan_amnt',
    'revol_util',
    'delinq_2yrs',
    'inq_last_6mths'
]

print("\nFeatures to engineer:", features_to_bin)

  df_train = pd.read_csv('./../data/processed/train_2015_2016.csv')


Training: 668182 loans
Validation: 168699 loans

Features to engineer: ['fico_range_low', 'dti', 'int_rate', 'annual_inc', 'loan_amnt', 'revol_util', 'delinq_2yrs', 'inq_last_6mths']


  df_valid = pd.read_csv('./../data/processed/valid_2017.csv')


In [4]:
def calculate_woe_iv(data, feature, target, bins, zero_fill=0.0001):
    """Build a Weight-of-Evidence table for one feature and its Information Value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both ``feature`` and ``target``. It is not modified.
    feature : str
        Column to bin.
    target : str
        Column whose per-bin sum is counted as "bad" and whose per-bin
        non-null count is the bin total (i.e. a 0/1 flag with 1 = bad).
    bins : sequence of edges, int, or the string ``'categorical'``
        ``'categorical'`` groups on the raw feature values; anything else is
        forwarded to ``pd.cut(..., duplicates='drop')``. With pd.cut's
        defaults the intervals are right-closed, so rows equal to the lowest
        edge, outside the edges, or NaN fall into no bin and are left out of
        the table.
    zero_fill : float, default 0.0001
        Replacement for a bin share that is exactly 0, so the log ratio
        stays finite.

    Returns
    -------
    (pandas.DataFrame, float)
        Table indexed by ``bin`` with columns ``bad``, ``total``, ``good``,
        ``good_pct``, ``bad_pct``, ``woe`` (= ln(good_pct / bad_pct)) and
        ``iv``; and the sum of the ``iv`` column.
    """
    # isinstance guard: comparing a numpy array of edges to a string is
    # element-wise and cannot be used as an `if` condition.
    if isinstance(bins, str) and bins == 'categorical':
        bin_key = data[feature]
    else:
        bin_key = pd.cut(data[feature], bins=bins, duplicates='drop')

    # Copy only the target column rather than the whole frame; the grouping
    # needs nothing else.
    work = data[[target]].copy()
    work['bin'] = bin_key

    grouped = work.groupby('bin', observed=False)[target].agg(['sum', 'count'])
    grouped.columns = ['bad', 'total']
    grouped['good'] = grouped['total'] - grouped['bad']

    total_good = grouped['good'].sum()
    total_bad = grouped['bad'].sum()

    # Share of all goods / all bads that land in each bin; exact zeros are
    # replaced so that the logarithm below is defined.
    grouped['good_pct'] = (grouped['good'] / total_good).replace(0, zero_fill)
    grouped['bad_pct'] = (grouped['bad'] / total_bad).replace(0, zero_fill)

    grouped['woe'] = np.log(grouped['good_pct'] / grouped['bad_pct'])
    grouped['iv'] = (grouped['good_pct'] - grouped['bad_pct']) * grouped['woe']

    total_iv = grouped['iv'].sum()

    return grouped, total_iv

print("WoE calculation function defined")

WoE calculation function defined


In [5]:
# Hand-picked bin edges for each numeric feature, keyed by column name.
binning_scheme = {
    'fico_range_low': [0, 660, 670, 680, 690, 700, 710, 720, 750, 850],
    'dti': [0, 5, 10, 15, 20, 25, 30, 40, 100],
    'int_rate': [0, 8, 10, 12, 14, 16, 18, 20, 25, 35],
    'annual_inc': [0, 40000, 50000, 60000, 75000, 90000, 120000, 10000000],
    'loan_amnt': [0, 5000, 10000, 15000, 20000, 25000, 50000],
    'revol_util': [0, 20, 40, 60, 80, 100, 200],
    'delinq_2yrs': [0, 1, 2, 3, 40],
    'inq_last_6mths': [0, 1, 2, 3, 10],
}

print("Binning scheme defined for", len(binning_scheme), "features")
# N edges delimit N-1 bins.
print("\n".join(f"{feat}: {len(bins)-1} bins" for feat, bins in binning_scheme.items()))

Binning scheme defined for 8 features
fico_range_low: 9 bins
dti: 8 bins
int_rate: 9 bins
annual_inc: 7 bins
loan_amnt: 6 bins
revol_util: 6 bins
delinq_2yrs: 4 bins
inq_last_6mths: 4 bins


In [7]:
# Bin edges per feature; later cells refer to these lists by name.
bins_fico = [0, 660, 670, 680, 690, 700, 710, 720, 750, 850]
bins_dti = [0, 5, 10, 15, 20, 25, 30, 40, 100]
bins_int_rate = [0, 8, 10, 12, 14, 16, 18, 20, 25, 35]
bins_income = [0, 40000, 50000, 60000, 75000, 90000, 120000, 10000000]
bins_loan_amt = [0, 5000, 10000, 15000, 20000, 25000, 50000]
bins_revol = [0, 20, 40, 60, 80, 100, 200]
bins_delinq = [0, 1, 2, 3, 40]
bins_inq = [0, 1, 2, 3, 10]

# feature_names[k] is binned with all_bins[k]; keep the two lists aligned.
feature_names = [
    'fico_range_low', 'dti', 'int_rate', 'annual_inc',
    'loan_amnt', 'revol_util', 'delinq_2yrs', 'inq_last_6mths',
]
all_bins = [
    bins_fico, bins_dti, bins_int_rate, bins_income,
    bins_loan_amt, bins_revol, bins_delinq, bins_inq,
]

# Print the Information Value and the per-bin WoE table of every feature,
# computed on the training frame.
for feature, bins in zip(feature_names, all_bins):
    woe_table, iv = calculate_woe_iv(df_train, feature, 'default', bins)
    report_columns = woe_table[['total', 'bad', 'woe', 'iv']]

    print(f"\n{feature}")
    print(f"Information Value: {iv:.4f}")
    print(report_columns)


fico_range_low
Information Value: 0.1276
             total    bad       woe        iv
bin                                          
(0, 660]     65987  18762 -0.369361  0.014867
(660, 670]  123800  33080 -0.283602  0.016093
(670, 680]  106442  26749 -0.200766  0.006786
(680, 690]   88524  20028 -0.062807  0.000532
(690, 700]   74069  15293  0.053888  0.000317
(700, 710]   59741  10821  0.216247  0.003924
(710, 720]   44990   7292  0.350379  0.007447
(720, 750]   65376   8548  0.601881  0.029505
(750, 850]   39253   3379  1.069982  0.048105

dti
Information Value: 0.0781
            total    bad       woe        iv
bin                                         
(0, 5]      28818   4427  0.414148  0.006536
(5, 10]     82669  12991  0.387283  0.016534
(10, 15]   129972  23045  0.242353  0.010642
(15, 20]   140382  28458  0.077046  0.001220
(20, 25]   119707  27580 -0.086267  0.001366
(25, 30]    88706  23371 -0.264312  0.009969
(30, 40]    75177  23159 -0.483138  0.029787
(40, 100]    257

In [8]:
# Features kept for the model, paired position-by-position with their bin edges.
selected_features = [
    'int_rate', 'fico_range_low', 'dti', 'annual_inc',
    'loan_amnt', 'inq_last_6mths', 'revol_util',
]
selected_bins = [
    bins_int_rate, bins_fico, bins_dti, bins_income,
    bins_loan_amt, bins_inq, bins_revol,
]

# First pass: add <feature>_bin and <feature>_woe columns to the training frame.
# pd.cut builds right-closed intervals here, so a value equal to the lowest
# edge (e.g. 0) gets no bin and therefore no WoE value.
for feature, bins in zip(selected_features, selected_bins):
    bin_column = feature + '_bin'
    df_train[bin_column] = pd.cut(df_train[feature], bins=bins, duplicates='drop')

    woe_table, _ = calculate_woe_iv(df_train, feature, 'default', bins)
    woe_mapping = woe_table['woe'].to_dict()

    # Look up each row's interval in the interval -> WoE dictionary.
    df_train[feature + '_woe'] = df_train[bin_column].map(woe_mapping)

print("WoE features created for training data")
print("\nNew feature names:")
for feat in selected_features:
    print(f"  {feat}_woe")

WoE features created for training data

New feature names:
  int_rate_woe
  fico_range_low_woe
  dti_woe
  annual_inc_woe
  loan_amnt_woe
  inq_last_6mths_woe
  revol_util_woe


In [9]:
# Apply the training-derived WoE values to the validation frame: the table is
# always computed from df_train, only the lookup runs on df_valid.
for feature, bins in zip(selected_features, selected_bins):
    woe_table, _ = calculate_woe_iv(df_train, feature, 'default', bins)
    woe_mapping = woe_table['woe'].to_dict()

    bin_column = feature + '_bin'
    df_valid[bin_column] = pd.cut(df_valid[feature], bins=bins, duplicates='drop')
    df_valid[feature + '_woe'] = df_valid[bin_column].map(woe_mapping)

print("WoE features applied to validation data")

# Per-column NaN counts; the prints below report their grand totals.
woe_columns = [f + '_woe' for f in selected_features]
missing_train = df_train[woe_columns].isna().sum()
missing_valid = df_valid[woe_columns].isna().sum()

print("\nMissing values in WoE features:")
print("Training:", missing_train.sum())
print("Validation:", missing_valid.sum())

WoE features applied to validation data

Missing values in WoE features:
Training: 403963
Validation: 102291


In [10]:
# Break the NaN counts of the WoE columns down per feature, for each frame.
print("Missing WoE values by feature:")
for label, frame in (("Training", df_train), ("Validation", df_valid)):
    print(f"\n{label}:")
    for feat in selected_features:
        missing = frame[feat + '_woe'].isna().sum()
        print(f"{feat}: {missing}")

Missing WoE values by feature:

Training:
int_rate: 0
fico_range_low: 0
dti: 181
annual_inc: 0
loan_amnt: 0
inq_last_6mths: 401211
revol_util: 2571

Validation:
int_rate: 0
fico_range_low: 0
dti: 126
annual_inc: 0
loan_amnt: 0
inq_last_6mths: 100933
revol_util: 1232


In [11]:
# Compare the observed inq_last_6mths values with the bin edges to see which
# values are not covered by any bin.
inq_counts = df_train['inq_last_6mths'].value_counts().sort_index()
inq_max = df_train['inq_last_6mths'].max()

print("inq_last_6mths distribution:")
print(inq_counts)

print("\nBins defined:", bins_inq)
print("\nValues outside bins:")
print("Max value:", inq_max)


inq_last_6mths distribution:
inq_last_6mths
0.0    401210
1.0    178740
2.0     59403
3.0     20496
4.0      6171
5.0      2119
6.0        42
Name: count, dtype: int64

Bins defined: [0, 1, 2, 3, 10]

Values outside bins:
Max value: 6.0


In [12]:
# Corrected edges: the lowest edge is moved to -1 so that rows with value 0
# fall inside the first (right-closed) interval instead of getting no bin.
bins_inq_fixed = [-1, 0, 1, 2, 10]
bins_revol_fixed = [-1, 20, 40, 60, 80, 100, 200]
bins_dti_fixed = [-1, 5, 10, 15, 20, 25, 30, 40, 100]

# Same feature order as before, now paired with the corrected edges.
selected_features = ['int_rate', 'fico_range_low', 'dti', 'annual_inc',
                     'loan_amnt', 'inq_last_6mths', 'revol_util']
selected_bins = [bins_int_rate, bins_fico, bins_dti_fixed, bins_income,
                 bins_loan_amt, bins_inq_fixed, bins_revol_fixed]

for feature, bins in zip(selected_features, selected_bins):
    # WoE values are learned from the training frame only, then applied
    # unchanged to both frames.
    woe_table, _ = calculate_woe_iv(df_train, feature, 'default', bins)
    woe_mapping = woe_table['woe'].to_dict()

    for frame in (df_train, df_valid):
        frame[feature + '_bin'] = pd.cut(frame[feature], bins=bins, duplicates='drop')
        # .map on a categorical column can hand back a categorical result
        # (pandas maps the categories), so cast to get a plain float column
        # whatever the WoE values happen to be.
        frame[feature + '_woe'] = frame[feature + '_bin'].map(woe_mapping).astype(float)

print("Fixed WoE transformation applied")

woe_columns = [f + '_woe' for f in selected_features]
missing_train = df_train[woe_columns].isna().sum().sum()
missing_valid = df_valid[woe_columns].isna().sum().sum()

# Rows that still have no WoE value are either NaN in the raw feature or lie
# outside the outer bin edges.
print(f"\nMissing values - Training: {missing_train}")
print(f"Missing values - Validation: {missing_valid}")

Fixed WoE transformation applied

Missing values - Training: 1
Missing values - Validation: 0


In [13]:
# Hard-coded scores for the categorical columns.
# NOTE(review): these values are typed in, not derived from the data with
# calculate_woe_iv(..., 'categorical'); confirm that their sign convention
# matches the ln(good% / bad%) used for the numeric features.
grade_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
grade_woe_values = [-1.5, -0.8, -0.2, 0.4, 0.9, 1.3, 1.6]
grade_woe_map = dict(zip(grade_order, grade_woe_values))
home_woe_map = {'RENT': 0.2, 'OWN': 0.1, 'MORTGAGE': -0.1}

# Observed training default rates per home-ownership class. They are computed
# for reference only; the home_woe values above do not use them.
home_rent = df_train[df_train['home_ownership'] == 'RENT']['default'].mean()
home_own = df_train[df_train['home_ownership'] == 'OWN']['default'].mean()
home_mort = df_train[df_train['home_ownership'] == 'MORTGAGE']['default'].mean()

for frame in (df_train, df_valid):
    # term_woe is a 0/1 flag for the ' 60 months' term (note the leading space).
    frame['term_woe'] = (frame['term'] == ' 60 months').astype('int64')

    # Build the float columns in one step. Creating an integer column of zeros
    # and then writing floats into it through .loc triggers pandas'
    # incompatible-dtype FutureWarning. Unlisted or missing categories keep
    # the previous default of 0.
    frame['grade_woe'] = frame['grade'].map(grade_woe_map).fillna(0.0).astype('float64')
    frame['home_woe'] = frame['home_ownership'].map(home_woe_map).fillna(0.0).astype('float64')

print("Categorical WoE features created:")
print("  term_woe")
print("  grade_woe")
print("  home_woe")

  df_train.loc[df_train['grade'] == grade, 'grade_woe'] = woe_val
  df_valid.loc[df_valid['grade'] == grade, 'grade_woe'] = woe_val


Categorical WoE features created:
  term_woe
  grade_woe
  home_woe


  df_train.loc[df_train['home_ownership'] == 'RENT', 'home_woe'] = 0.2
  df_valid.loc[df_valid['home_ownership'] == 'RENT', 'home_woe'] = 0.2


In [14]:
# Final model inputs: seven binned numeric features plus three categorical ones.
woe_features = [
    'int_rate_woe', 'fico_range_low_woe', 'dti_woe', 'annual_inc_woe',
    'loan_amnt_woe', 'inq_last_6mths_woe', 'revol_util_woe',
    'term_woe', 'grade_woe', 'home_woe',
]

print(f"Total WoE features: {len(woe_features)}")
print("\nFeature list:")
for position, feat in enumerate(woe_features, start=1):
    print(f"{position}. {feat}")

print("\nChecking missing values in final features:")
print("Training:", df_train[woe_features].isna().sum().sum())
print("Validation:", df_valid[woe_features].isna().sum().sum())

# Keep the WoE columns together with the 'default' target and drop every row
# that has a missing value in any of them.
export_columns = woe_features + ['default']
X_train = df_train[export_columns].dropna()
X_valid = df_valid[export_columns].dropna()

print("\nFinal dataset sizes after dropping NA:")
print(f"Training: {len(X_train)}")
print(f"Validation: {len(X_valid)}")

# Persist both frames without the index column.
X_train.to_csv('./../data/processed/train_woe_features.csv', index=False)
X_valid.to_csv('./../data/processed/valid_woe_features.csv', index=False)

print("\nSaved processed datasets with WoE features")

Total WoE features: 10

Feature list:
1. int_rate_woe
2. fico_range_low_woe
3. dti_woe
4. annual_inc_woe
5. loan_amnt_woe
6. inq_last_6mths_woe
7. revol_util_woe
8. term_woe
9. grade_woe
10. home_woe

Checking missing values in final features:
Training: 1
Validation: 0

Final dataset sizes after dropping NA:
Training: 668181
Validation: 168699

Saved processed datasets with WoE features
