## Libraries

In [1]:
import pandas as pd 
import numpy as np 
from sklearn import * 
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


  from pandas.core import (


## Helper Functions

In [2]:
# WOE (Weight of Evidence) calculation
def calculate_woe_iv(df, feature, target, bins=5):
    # Bin the feature (quantile-based)
    binned_x = pd.qcut(df[feature], q=bins, duplicates='drop')
    temp_df = pd.DataFrame({'bin': binned_x, 'target': target})

    # Count goods and bads
    grouped = temp_df.groupby('bin'['target'].agg['count', 'sum'])
    grouped.columns = ['total', 'bads']
    grouped['goods'] = grouped['total'] - grouped['bads']

    # Distribution
    total_goods = grouped["goods"].sum()
    total_bads = grouped["bads"].sum()
    grouped["dist_good"] = grouped["goods"] / total_goods
    grouped["dist_bad"] = grouped["bads"] / total_bads

    # WOE and IV
    grouped['woe'] = np.log(grouped['dist_good'] / grouped['dist_bad']).replace([np.inf, -np.inf], 0)
    grouped['iv'] = (grouped["dist_good"] - grouped["dist_bad"]) * grouped["woe"]

    woe_map = grouped["woe"].to_dict()
    iv = grouped["iv"].sum()

    return woe_map, iv


  grouped = temp_df.groupby('bin'['target'].agg['count', 'sum'])


## Clean and Preprocess Data

In [3]:
# Reading the data
df = pd.read_csv("GiveMeSomeCredit/cs-training.csv")
df = df.drop(columns=["Unnamed: 0"], errors = "ignore")
display(df.head())

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [4]:
# Check for missing values
print(df.isnull().sum())

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64


In [5]:
# check type
df.dtypes

SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

In [6]:
# Proportion of target variable
df['SeriousDlqin2yrs'].value_counts(normalize=True)

SeriousDlqin2yrs
0    0.93316
1    0.06684
Name: proportion, dtype: float64

In [7]:
# Splitting data
X = df.drop(columns=['SeriousDlqin2yrs'])
y = df['SeriousDlqin2yrs']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [8]:
# Impute data -> Handling Missing Values
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.fit_transform(X_test)
# Converting array to df
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test.columns, index=X_test.index)

# Weight of Evidence (WOE) -> not for tree based models
X_train_woe = pd.DataFrame(index=X_train_imputed.index)
X_test_woe = pd.DataFrame(index=X_test_imputed.index)
woe_maps = {}
iv_scores = {}

for col in X_train_imputed.columns:
    try:
        # Calculate WOE map and IV
        woe_map, iv = calculate_woe_iv(X_train_imputed, col, y_train)
        iv_scores[col] = iv
        woe_maps[col] = woe_map

        # Bin and map on train
        train_bins = pd.qcut(X_train_imputed[col], q=5, duplicates='drop')
        X_train_woe[col] = train_bins.map(woe_map)

        # Use the same bin edges on test set
        bin_edges = train_bins.cat.categories
        test_bins = pd.cut(X_test_imputed[col], bins=bin_edges, include_lowest=True)
        X_test_woe[col] = test_bins.map(woe_map)
        
    except Exception as e:
        print(f"Skipping {col}: {e}")


# # Scaling Data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_woe)
# X_test_scaled = scaler.fit_transform(X_test_woe)



Skipping RevolvingUtilizationOfUnsecuredLines: string indices must be integers, not 'str'
Skipping age: string indices must be integers, not 'str'
Skipping NumberOfTime30-59DaysPastDueNotWorse: string indices must be integers, not 'str'
Skipping DebtRatio: string indices must be integers, not 'str'
Skipping MonthlyIncome: string indices must be integers, not 'str'
Skipping NumberOfOpenCreditLinesAndLoans: string indices must be integers, not 'str'
Skipping NumberOfTimes90DaysLate: string indices must be integers, not 'str'
Skipping NumberRealEstateLoansOrLines: string indices must be integers, not 'str'
Skipping NumberOfTime60-89DaysPastDueNotWorse: string indices must be integers, not 'str'
Skipping NumberOfDependents: string indices must be integers, not 'str'
