In [1]:
### Predict which customer will take a loan
#!pip install xgboost 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import xgboost as xgb
import seaborn as sns


%matplotlib inline

In [3]:
# Loading the dataset
case_study1 = pd.read_csv('D:\\D Drive Data\\Companies\\DMI\\Modelling Case Study\\finfast_100k_dataset.csv')  





In [4]:
case_study1.shape #100000 18





(100000, 18)

In [5]:
# check if any missing values
#input_data.isnull().sum()
case_study1.columns[case_study1.isnull().any()]#none


Index(['gender', 'credit_score', 'device_type', 'text_complaint'], dtype='object')

In [6]:
case_study1['gender'] = case_study1['gender'].fillna('Unknown')
case_study1['device_type'] = case_study1['device_type'].fillna('Unknown')
case_study1['text_complaint'] = case_study1['employment_type'].fillna('Unknown')


# Fix negative values in days_since_last_txn
case_study1['days_since_last_txn_credit_repayment_24month'] = \
    case_study1['days_since_last_txn_credit_repayment_24month'].clip(lower=0)

num_cols = [
    'nb_txn_cnt',
    'credit_score'
]

for col in num_cols:
    case_study1[col] = case_study1[col].fillna(case_study1[col].median())


In [7]:
# check if any missing values
#input_data.isnull().sum()
case_study1.columns[case_study1.isnull().any()]#none


Index([], dtype='object')

In [8]:
case_study1['city'].value_counts(dropna=False)


city
Kolkata      11273
Pune         11232
Bangalore    11183
Mumbai       11097
Hyderabad    11084
Bengaluru    11076
Chennai      11074
Delhi        11027
Jaipur       10954
Name: count, dtype: int64

In [9]:
# Standardize city names
case_study1['city'] = case_study1['city'].str.strip().str.lower()

# Replace variants
case_study1['city'] = case_study1['city'].replace({
    'bengaluru': 'bengaluru',
    'bangalore': 'bengaluru'
})




In [10]:
case_study1['city'].value_counts(dropna=False)


city
bengaluru    22259
kolkata      11273
pune         11232
mumbai       11097
hyderabad    11084
chennai      11074
delhi        11027
jaipur       10954
Name: count, dtype: int64

In [11]:
case_study1.shape

(100000, 18)

In [12]:
case_study1['repeat_loan'].value_counts(dropna=False)


repeat_loan
0    69930
1    30070
Name: count, dtype: int64

In [13]:
case_study1

Unnamed: 0,customer_id,age,gender,city,income,employment_type,credit_score,loan_amount,approved_amount,application_date,registration_timestamp,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,application_channel,loan_purpose,device_type,text_complaint,repeat_loan
0,C15796,30,Female,delhi,35,Other,414.0,59594,18152,08/07/24,15/06/21,28,74,Paid,Education,Unknown,Other,0
1,C861,32,Male,delhi,107578,Other,618.0,43717,17401,15/04/23,29/04/21,64,375,Referral,Medical,Unknown,Other,0
2,C76821,25,Unknown,bengaluru,117556,Other,348.0,12459,49637,19/02/22,24/12/22,9,259,Referral,Education,Other,Other,0
3,C54887,40,Unknown,hyderabad,135867,Salaried,881.0,35569,36527,23/10/22,30/08/22,6,348,Organic,Education,Other,Salaried,0
4,C6266,18,Female,chennai,131481,Salaried,840.0,45459,57327,04/06/24,25/06/24,8,0,Referral,Education,Unknown,Salaried,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,C19663,12,Female,chennai,120733,Self-Employed,888.0,46842,5650,26/01/23,06/01/23,6,260,Referral,Education,Unknown,Self-Employed,0
99996,C82489,67,Male,bengaluru,120707,Other,680.0,44901,44108,16/08/23,23/04/23,24,303,Organic,Other,Other,Other,1
99997,C50501,78,Unknown,delhi,66,Self-Employed,715.0,17995,54515,19/03/22,05/05/21,10,18,Organic,Education,iOS,Self-Employed,0
99998,C39711,59,Unknown,mumbai,68817,Self-Employed,675.0,28929,50363,05/08/23,04/11/23,28,487,Referral,Other,Unknown,Self-Employed,0


In [14]:
#without reject inferencing
#filter  where default 
input_data = case_study1.copy()



In [15]:
#check shape of new dataset
input_data.shape

(100000, 18)

In [16]:
input_data1 = input_data.rename(columns={'repeat_loan': 'Default'})


if 'repeat_loan' in input_data1.columns:
    input_data1 = input_data1.drop('repeat_loan', axis=1)

In [17]:
input_data1.shape

(100000, 18)

In [18]:
input_data1.head()

Unnamed: 0,customer_id,age,gender,city,income,employment_type,credit_score,loan_amount,approved_amount,application_date,registration_timestamp,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,application_channel,loan_purpose,device_type,text_complaint,Default
0,C15796,30,Female,delhi,35,Other,414.0,59594,18152,08/07/24,15/06/21,28,74,Paid,Education,Unknown,Other,0
1,C861,32,Male,delhi,107578,Other,618.0,43717,17401,15/04/23,29/04/21,64,375,Referral,Medical,Unknown,Other,0
2,C76821,25,Unknown,bengaluru,117556,Other,348.0,12459,49637,19/02/22,24/12/22,9,259,Referral,Education,Other,Other,0
3,C54887,40,Unknown,hyderabad,135867,Salaried,881.0,35569,36527,23/10/22,30/08/22,6,348,Organic,Education,Other,Salaried,0
4,C6266,18,Female,chennai,131481,Salaried,840.0,45459,57327,04/06/24,25/06/24,8,0,Referral,Education,Unknown,Salaried,1


In [19]:
#filter  where default is not null
input_data1 = input_data1[input_data1["Default"].notnull()]

In [20]:
input_data1.shape

(100000, 18)

In [21]:
input_data1.head()

Unnamed: 0,customer_id,age,gender,city,income,employment_type,credit_score,loan_amount,approved_amount,application_date,registration_timestamp,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,application_channel,loan_purpose,device_type,text_complaint,Default
0,C15796,30,Female,delhi,35,Other,414.0,59594,18152,08/07/24,15/06/21,28,74,Paid,Education,Unknown,Other,0
1,C861,32,Male,delhi,107578,Other,618.0,43717,17401,15/04/23,29/04/21,64,375,Referral,Medical,Unknown,Other,0
2,C76821,25,Unknown,bengaluru,117556,Other,348.0,12459,49637,19/02/22,24/12/22,9,259,Referral,Education,Other,Other,0
3,C54887,40,Unknown,hyderabad,135867,Salaried,881.0,35569,36527,23/10/22,30/08/22,6,348,Organic,Education,Other,Salaried,0
4,C6266,18,Female,chennai,131481,Salaried,840.0,45459,57327,04/06/24,25/06/24,8,0,Referral,Education,Unknown,Salaried,1


In [22]:
input_data1.Default.value_counts()

Default
0    69930
1    30070
Name: count, dtype: int64

In [23]:
#converting dependant variable to binary
input_data1['Default'] = input_data1['Default'].apply(lambda x: 1 if x > 0 else 0)



In [24]:
#create dummy variables
#find string columns

# Select only string (object) columns
string_columns = input_data1.select_dtypes(include=['object']).columns

print("String variables in dataset:")
print(string_columns.tolist())




String variables in dataset:
['customer_id', 'gender', 'city', 'employment_type', 'application_date', 'registration_timestamp', 'application_channel', 'loan_purpose', 'device_type', 'text_complaint']


In [25]:
# Drop date columns from input_data1
input_data1 = input_data1.drop(
    ["registration_timestamp", "application_date"],
    axis=1
)

# Verify
print(input_data1.columns)

Index(['customer_id', 'age', 'gender', 'city', 'income', 'employment_type',
       'credit_score', 'loan_amount', 'approved_amount', 'nb_txn_cnt',
       'days_since_last_txn_credit_repayment_24month', 'application_channel',
       'loan_purpose', 'device_type', 'text_complaint', 'Default'],
      dtype='object')


In [26]:
#create dummy variables
#find string columns

# Select only string (object) columns
string_columns = input_data1.select_dtypes(include=['object']).columns

print("String variables in dataset:")
print(string_columns.tolist())




String variables in dataset:
['customer_id', 'gender', 'city', 'employment_type', 'application_channel', 'loan_purpose', 'device_type', 'text_complaint']


In [27]:
input_data1.shape

(100000, 16)

In [28]:
#check unique vakues in these strings
unique_counts = input_data1[string_columns].nunique(dropna=False).reset_index()
unique_counts.columns = ["Column", "Unique_Values"]

print(unique_counts)

                Column  Unique_Values
0          customer_id          60363
1               gender              3
2                 city              8
3      employment_type              3
4  application_channel              3
5         loan_purpose              4
6          device_type              4
7       text_complaint              3


In [29]:
# Remove customer_id from string_columns
string_columns_filtered = [
    col for col in string_columns if col != "customer_id"
]

# Recalculate unique counts
unique_counts = (
    input_data1[string_columns_filtered]
    .nunique(dropna=False)
    .reset_index()
)

unique_counts.columns = ["Column", "Unique_Values"]

print(unique_counts)

                Column  Unique_Values
0               gender              3
1                 city              8
2      employment_type              3
3  application_channel              3
4         loan_purpose              4
5          device_type              4
6       text_complaint              3


In [30]:
string_columns

Index(['customer_id', 'gender', 'city', 'employment_type',
       'application_channel', 'loan_purpose', 'device_type', 'text_complaint'],
      dtype='object')

In [31]:
# Create dummy variables for string columns
#input_data2 = pd.get_dummies(input_data1, columns=string_columns, drop_first=True)
#input_data2 = pd.get_dummies(input_data1[string_columns], drop_first=True).astype(int)
#input_data2 = pd.get_dummies(input_data1, columns=string_columns, drop_first=True).astype(int)
input_data2 = pd.get_dummies(input_data1, columns=string_columns_filtered, drop_first=True, dtype=int)

In [32]:
input_data2.head()

Unnamed: 0,customer_id,age,income,credit_score,loan_amount,approved_amount,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,Default,gender_Male,...,application_channel_Paid,application_channel_Referral,loan_purpose_Medical,loan_purpose_Other,loan_purpose_Travel,device_type_Other,device_type_Unknown,device_type_iOS,text_complaint_Salaried,text_complaint_Self-Employed
0,C15796,30,35,414.0,59594,18152,28,74,0,0,...,1,0,0,0,0,0,1,0,0,0
1,C861,32,107578,618.0,43717,17401,64,375,0,1,...,0,1,1,0,0,0,1,0,0,0
2,C76821,25,117556,348.0,12459,49637,9,259,0,0,...,0,1,0,0,0,1,0,0,0,0
3,C54887,40,135867,881.0,35569,36527,6,348,0,0,...,0,0,0,0,0,1,0,0,1,0
4,C6266,18,131481,840.0,45459,57327,8,0,1,0,...,0,1,0,0,0,0,1,0,1,0


In [33]:
input_data2.to_csv('D:\\D Drive Data\\Companies\\DMI\\Modelling Case Study\\finfast_100k_dataset1.csv', index=True)  


In [34]:
input_data2.head()

Unnamed: 0,customer_id,age,income,credit_score,loan_amount,approved_amount,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,Default,gender_Male,...,application_channel_Paid,application_channel_Referral,loan_purpose_Medical,loan_purpose_Other,loan_purpose_Travel,device_type_Other,device_type_Unknown,device_type_iOS,text_complaint_Salaried,text_complaint_Self-Employed
0,C15796,30,35,414.0,59594,18152,28,74,0,0,...,1,0,0,0,0,0,1,0,0,0
1,C861,32,107578,618.0,43717,17401,64,375,0,1,...,0,1,1,0,0,0,1,0,0,0
2,C76821,25,117556,348.0,12459,49637,9,259,0,0,...,0,1,0,0,0,1,0,0,0,0
3,C54887,40,135867,881.0,35569,36527,6,348,0,0,...,0,0,0,0,0,1,0,0,1,0
4,C6266,18,131481,840.0,45459,57327,8,0,1,0,...,0,1,0,0,0,0,1,0,1,0


In [35]:
input_data2.shape

(100000, 30)

In [36]:
input_data2.head()

Unnamed: 0,customer_id,age,income,credit_score,loan_amount,approved_amount,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,Default,gender_Male,...,application_channel_Paid,application_channel_Referral,loan_purpose_Medical,loan_purpose_Other,loan_purpose_Travel,device_type_Other,device_type_Unknown,device_type_iOS,text_complaint_Salaried,text_complaint_Self-Employed
0,C15796,30,35,414.0,59594,18152,28,74,0,0,...,1,0,0,0,0,0,1,0,0,0
1,C861,32,107578,618.0,43717,17401,64,375,0,1,...,0,1,1,0,0,0,1,0,0,0
2,C76821,25,117556,348.0,12459,49637,9,259,0,0,...,0,1,0,0,0,1,0,0,0,0
3,C54887,40,135867,881.0,35569,36527,6,348,0,0,...,0,0,0,0,0,1,0,0,1,0
4,C6266,18,131481,840.0,45459,57327,8,0,1,0,...,0,1,0,0,0,0,1,0,1,0


In [37]:
input_data2.shape

(100000, 30)

In [38]:
#input_data['30_DPD_F3M']['Default'].value_counts()
pd.crosstab(input_data2["credit_score"], input_data2["Default"])

Default,0,1
credit_score,Unnamed: 1_level_1,Unnamed: 2_level_1
300.0,96,33
301.0,116,29
302.0,97,45
303.0,83,39
304.0,99,49
...,...,...
895.0,103,36
896.0,84,42
897.0,96,44
898.0,85,45


In [39]:
import pandas as pd
import numpy as np


def woe_iv_all(data, target, bins=10):
    
    iv_summary = []
    woe_tables = {}

    for col in data.columns:
        
        if col == target:
            continue

        df = data[[col, target]].copy()

        # If numeric, bin
        if pd.api.types.is_numeric_dtype(df[col]) and df[col].nunique() > bins:
            try:
                df[col] = pd.qcut(df[col], q=bins, duplicates='drop')
            except:
                df[col] = df[col]

        # Group
        grouped = df.groupby(col)[target].agg(['count', 'sum'])
        grouped.columns = ['Total', 'Bad']

        grouped['Good'] = grouped['Total'] - grouped['Bad']

        # Distribution
        grouped['Dist_Good'] = grouped['Good'] / grouped['Good'].sum()
        grouped['Dist_Bad'] = grouped['Bad'] / grouped['Bad'].sum()

        # Replace 0 to avoid log error
        grouped['Dist_Good'] = grouped['Dist_Good'].replace(0, 0.0001)
        grouped['Dist_Bad'] = grouped['Dist_Bad'].replace(0, 0.0001)

        # WoE
        grouped['WoE'] = np.log(grouped['Dist_Good'] / grouped['Dist_Bad'])

        # IV
        grouped['IV'] = (grouped['Dist_Good'] - grouped['Dist_Bad']) * grouped['WoE']

        iv = grouped['IV'].sum()

        iv_summary.append({
            'Variable': col,
            'IV': iv
        })

        woe_tables[col] = grouped.reset_index()

    iv_summary = pd.DataFrame(iv_summary).sort_values(by='IV', ascending=False)

    return iv_summary, woe_tables


In [40]:
input_data.shape

(100000, 18)

In [41]:
# drop variables not needed

#input_data_txf = input_data2.drop(['30_DPD_F3M','BOUNCE_F3M','Approved_Flag','Credit_Score','PROSPECTID'], axis=1)
#input_data_txf = input_data2.drop([,'customer_id'], axis=1)
input_data_txf = input_data2.drop(['customer_id'], axis=1)

input_data_txf.head()

#create index before train test split to retain order later on
input_data_txf['index'] = input_data_txf.index


In [42]:
input_data_txf.shape

(100000, 30)

In [43]:
input_data2.head()

Unnamed: 0,customer_id,age,income,credit_score,loan_amount,approved_amount,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,Default,gender_Male,...,application_channel_Paid,application_channel_Referral,loan_purpose_Medical,loan_purpose_Other,loan_purpose_Travel,device_type_Other,device_type_Unknown,device_type_iOS,text_complaint_Salaried,text_complaint_Self-Employed
0,C15796,30,35,414.0,59594,18152,28,74,0,0,...,1,0,0,0,0,0,1,0,0,0
1,C861,32,107578,618.0,43717,17401,64,375,0,1,...,0,1,1,0,0,0,1,0,0,0
2,C76821,25,117556,348.0,12459,49637,9,259,0,0,...,0,1,0,0,0,1,0,0,0,0
3,C54887,40,135867,881.0,35569,36527,6,348,0,0,...,0,0,0,0,0,1,0,0,1,0
4,C6266,18,131481,840.0,45459,57327,8,0,1,0,...,0,1,0,0,0,0,1,0,1,0


In [44]:
input_data2.shape

(100000, 30)

In [45]:
input_data_txf.head()

Unnamed: 0,age,income,credit_score,loan_amount,approved_amount,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,Default,gender_Male,gender_Unknown,...,application_channel_Referral,loan_purpose_Medical,loan_purpose_Other,loan_purpose_Travel,device_type_Other,device_type_Unknown,device_type_iOS,text_complaint_Salaried,text_complaint_Self-Employed,index
0,30,35,414.0,59594,18152,28,74,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,32,107578,618.0,43717,17401,64,375,0,1,0,...,1,1,0,0,0,1,0,0,0,1
2,25,117556,348.0,12459,49637,9,259,0,0,1,...,1,0,0,0,1,0,0,0,0,2
3,40,135867,881.0,35569,36527,6,348,0,0,1,...,0,0,0,0,1,0,0,1,0,3
4,18,131481,840.0,45459,57327,8,0,1,0,0,...,1,0,0,0,0,1,0,1,0,4


In [46]:
# define features x and resopnse y
x = input_data_txf.drop(['Default'], axis=1)
y = input_data_txf['Default']



In [47]:
# split train and test data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1 )

In [48]:
x_train.head()

Unnamed: 0,age,income,credit_score,loan_amount,approved_amount,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,gender_Male,gender_Unknown,city_chennai,...,application_channel_Referral,loan_purpose_Medical,loan_purpose_Other,loan_purpose_Travel,device_type_Other,device_type_Unknown,device_type_iOS,text_complaint_Salaried,text_complaint_Self-Employed,index
80630,72,96715,362.0,8449,53521,18,471,0,1,0,...,1,0,0,0,0,0,1,0,1,80630
62017,49,62283,323.0,57385,38115,40,148,0,1,0,...,0,0,0,1,1,0,0,0,0,62017
5005,21,90186,577.0,30476,51752,56,251,0,0,0,...,0,1,0,0,0,0,0,1,0,5005
56849,30,36674,600.0,29447,4721,24,270,0,0,1,...,1,1,0,0,1,0,0,0,0,56849
42434,68,58559,493.0,17875,24645,20,321,1,0,0,...,0,1,0,0,0,0,0,1,0,42434


In [49]:
x_test.shape


(30000, 29)

In [50]:
x_test.head()

Unnamed: 0,age,income,credit_score,loan_amount,approved_amount,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,gender_Male,gender_Unknown,city_chennai,...,application_channel_Referral,loan_purpose_Medical,loan_purpose_Other,loan_purpose_Travel,device_type_Other,device_type_Unknown,device_type_iOS,text_complaint_Salaried,text_complaint_Self-Employed,index
43660,34,30420,308.0,14988,38106,42,0,0,0,0,...,0,0,0,1,0,1,0,1,0,43660
87278,14,33,456.0,41178,20544,35,66,0,0,1,...,0,0,0,1,0,0,0,0,1,87278
14317,26,139770,600.0,22075,25961,0,0,0,1,0,...,1,0,1,0,0,1,0,1,0,14317
81932,59,99701,517.0,58946,25996,30,392,1,0,0,...,0,0,1,0,1,0,0,1,0,81932
95321,67,29,492.0,56229,38822,20,284,0,1,1,...,0,0,1,0,1,0,0,0,1,95321


In [51]:
x_test.shape

(30000, 29)

In [52]:
# Import relevant packages

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score,recall_score

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [53]:
x_train

Unnamed: 0,age,income,credit_score,loan_amount,approved_amount,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,gender_Male,gender_Unknown,city_chennai,...,application_channel_Referral,loan_purpose_Medical,loan_purpose_Other,loan_purpose_Travel,device_type_Other,device_type_Unknown,device_type_iOS,text_complaint_Salaried,text_complaint_Self-Employed,index
80630,72,96715,362.0,8449,53521,18,471,0,1,0,...,1,0,0,0,0,0,1,0,1,80630
62017,49,62283,323.0,57385,38115,40,148,0,1,0,...,0,0,0,1,1,0,0,0,0,62017
5005,21,90186,577.0,30476,51752,56,251,0,0,0,...,0,1,0,0,0,0,0,1,0,5005
56849,30,36674,600.0,29447,4721,24,270,0,0,1,...,1,1,0,0,1,0,0,0,0,56849
42434,68,58559,493.0,17875,24645,20,321,1,0,0,...,0,1,0,0,0,0,0,1,0,42434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50057,25,55,790.0,7648,29736,5,70,0,1,0,...,0,1,0,0,1,0,0,1,0,50057
98047,47,12,302.0,55816,21600,4,0,0,1,0,...,0,1,0,0,0,1,0,1,0,98047
5192,58,57336,807.0,28893,37388,64,299,0,1,0,...,0,0,1,0,0,0,1,0,1,5192
77708,46,66898,508.0,6644,14089,27,416,0,1,0,...,1,0,0,1,1,0,0,0,0,77708


In [54]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Faster estimator
lr = LogisticRegression(
    max_iter=500,
    solver='liblinear',
    class_weight='balanced'
)

# RFECV (fast version)
rfecv = RFECV(
    estimator=lr,
    step=20,                 # remove 20 features at once
    cv=StratifiedKFold(2),   # only 2 folds
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)

# Fit
rfecv.fit(x_train, y_train)

# Selected features
selected_features = x_train.columns[rfecv.support_]

print("Optimal number of features:", rfecv.n_features_)
print("Selected features:", selected_features.tolist())

# Subset
X_train_sel = x_train[selected_features]
X_test_sel  = x_test[selected_features]


Fitting estimator with 29 features.
Optimal number of features: 9
Selected features: ['gender_Male', 'city_delhi', 'city_pune', 'employment_type_Salaried', 'application_channel_Paid', 'loan_purpose_Medical', 'loan_purpose_Other', 'loan_purpose_Travel', 'device_type_Unknown']


In [55]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Define estimator
rf = RandomForestClassifier(
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'
)

# RFECV setup
rfecv = RFECV(
    estimator=rf,
    step=5,
    cv=StratifiedKFold(3),
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1
)

# Fit on training set only
rfecv.fit(x_train, y_train)

# Best features
selected_features = x_train.columns[rfecv.support_]
print("Optimal number of features:", rfecv.n_features_)
print("Selected features:", selected_features.tolist())

# Restrict to selected features
X_train_sel = x_train[selected_features]
X_test_sel  = x_test[selected_features]


Fitting estimator with 29 features.
Fitting estimator with 24 features.
Optimal number of features: 19
Selected features: ['age', 'income', 'credit_score', 'loan_amount', 'approved_amount', 'nb_txn_cnt', 'days_since_last_txn_credit_repayment_24month', 'gender_Male', 'gender_Unknown', 'city_pune', 'application_channel_Paid', 'application_channel_Referral', 'loan_purpose_Medical', 'loan_purpose_Other', 'loan_purpose_Travel', 'device_type_Other', 'device_type_Unknown', 'device_type_iOS', 'index']


In [56]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import pandas as pd

# Handle imbalance for XGB/LGBM
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                             subsample=0.8, colsample_bytree=0.8,
                             scale_pos_weight=scale_pos_weight,
                             eval_metric="auc", random_state=42, n_jobs=-1)
}

# Evaluate
results = []

for name, model in models.items():
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_test_sel)
    y_pred_prob = model.predict_proba(X_test_sel)[:,1]
    
    auc = roc_auc_score(y_test, y_pred_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    
    results.append([name, auc, precision, recall, f1, acc])

# Create results table
results_df = pd.DataFrame(results, columns=["Model","AUC","Precision","Recall","F1","Accuracy"])
results_df = results_df.sort_values(by="AUC", ascending=False)

print("\nModel Comparison:")
print(results_df)



Model Comparison:
                 Model       AUC  Precision    Recall        F1  Accuracy
2              XGBoost  0.500747   0.301141  0.388256  0.339194  0.544000
0  Logistic Regression  0.498931   0.299980  0.495300  0.373655  0.499467
1        Random Forest  0.496592   0.000000  0.000000  0.000000  0.698533


In [57]:
import pandas as pd

# Get trained XGBoost model
xgb_model = models["XGBoost"]

# Feature importance
xgb_importance = pd.DataFrame({
    "Feature": X_train_sel.columns,
    "Importance": xgb_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print("Top 15 XGBoost Features:")
print(xgb_importance.head(15))

Top 15 XGBoost Features:
                                         Feature  Importance
1                                         income    0.056743
4                                approved_amount    0.056308
6   days_since_last_txn_credit_repayment_24month    0.056114
3                                    loan_amount    0.055580
18                                         index    0.054844
2                                   credit_score    0.054770
5                                     nb_txn_cnt    0.054435
0                                            age    0.054050
7                                    gender_Male    0.052204
15                             device_type_Other    0.051855
14                           loan_purpose_Travel    0.051504
16                           device_type_Unknown    0.051484
11                  application_channel_Referral    0.050973
10                      application_channel_Paid    0.050826
12                          loan_purpose_Medical    0.050009

In [58]:
best_model = models["XGBoost"]

In [59]:
X_train_sel = x_train[selected_features]
X_test_sel  = x_test[selected_features]

In [60]:
x_test.head()

Unnamed: 0,age,income,credit_score,loan_amount,approved_amount,nb_txn_cnt,days_since_last_txn_credit_repayment_24month,gender_Male,gender_Unknown,city_chennai,...,application_channel_Referral,loan_purpose_Medical,loan_purpose_Other,loan_purpose_Travel,device_type_Other,device_type_Unknown,device_type_iOS,text_complaint_Salaried,text_complaint_Self-Employed,index
43660,34,30420,308.0,14988,38106,42,0,0,0,0,...,0,0,0,1,0,1,0,1,0,43660
87278,14,33,456.0,41178,20544,35,66,0,0,1,...,0,0,0,1,0,0,0,0,1,87278
14317,26,139770,600.0,22075,25961,0,0,0,1,0,...,1,0,1,0,0,1,0,1,0,14317
81932,59,99701,517.0,58946,25996,30,392,1,0,0,...,0,0,1,0,1,0,0,1,0,81932
95321,67,29,492.0,56229,38822,20,284,0,1,1,...,0,0,1,0,1,0,0,0,1,95321


In [61]:
X_train_sel = x_train[selected_features]
X_test_sel  = x_test[selected_features]

In [62]:
# Class prediction (0/1)
y_test_pred = best_model.predict(X_test_sel)

# Probability prediction
y_test_prob = best_model.predict_proba(X_test_sel)[:, 1]


In [63]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

print("AUC:", roc_auc_score(y_test, y_test_prob))

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))


AUC: 0.500746754262166

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.61      0.65     20957
           1       0.30      0.39      0.34      9043

    accuracy                           0.54     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.58      0.54      0.56     30000


Confusion Matrix:
[[12809  8148]
 [ 5532  3511]]
