In [35]:
import pandas as pd
import polars as pl
import numpy as np
import os
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier,Pool
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df=pd.read_csv('loan_data.csv')

In [6]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


# doctorates merged with Masters

In [7]:
df_processed=df.copy()
df_processed["person_education_processed"]=None

df_processed.loc[df_processed['person_education']=="Doctorate","person_education_processed"]="Masters"
df_processed.loc[df_processed['person_education_processed'].isnull(),"person_education_processed"]=df_processed.loc[df_processed['person_education_processed'].isnull(),"person_education"]

In [10]:
df_processed=df_processed[df_processed["person_home_ownership"]!="Other"]

In [11]:
X=df_processed.drop(columns=['loan_status'],axis=1)

le=LabelEncoder()
y=le.fit_transform(df_processed['loan_status'])

In [12]:
cat_columns= X.select_dtypes(include=['object']).columns.tolist()
num_columns = X.select_dtypes(exclude=['object']).columns.tolist()

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [14]:
train_pool=Pool(X_train,y_train,cat_features=cat_columns)

In [None]:
catboost_model= CatBoostClassifier(verbose=25,thread_count=4)

param_grid = {
    'depth': [3,4,6,5],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100,200]
}

In [16]:
catboost_model.grid_search(param_grid,X=train_pool,cv=5, stratified=True)

0:	learn: 0.6689324	test: 0.6692303	best: 0.6692303 (0)	total: 182ms	remaining: 18s
2:	learn: 0.6246776	test: 0.6255764	best: 0.6255764 (2)	total: 228ms	remaining: 7.38s
4:	learn: 0.5853202	test: 0.5867014	best: 0.5867014 (4)	total: 289ms	remaining: 5.48s
6:	learn: 0.5503976	test: 0.5522074	best: 0.5522074 (6)	total: 338ms	remaining: 4.5s
8:	learn: 0.5187401	test: 0.5208808	best: 0.5208808 (8)	total: 383ms	remaining: 3.88s
10:	learn: 0.4908871	test: 0.4933039	best: 0.4933039 (10)	total: 434ms	remaining: 3.52s
12:	learn: 0.4664269	test: 0.4690975	best: 0.4690975 (12)	total: 480ms	remaining: 3.21s
14:	learn: 0.4447168	test: 0.4475786	best: 0.4475786 (14)	total: 523ms	remaining: 2.96s
16:	learn: 0.4254522	test: 0.4284445	best: 0.4284445 (16)	total: 565ms	remaining: 2.76s
18:	learn: 0.4084716	test: 0.4115614	best: 0.4115614 (18)	total: 615ms	remaining: 2.62s
20:	learn: 0.3982687	test: 0.4013755	best: 0.4013755 (20)	total: 680ms	remaining: 2.56s
22:	learn: 0.3844105	test: 0.3875652	best: 0.

{'params': {'depth': 6, 'learning_rate': 0.1, 'iterations': 200},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
    

In [17]:
y_pred=catboost_model.predict(X_test)

In [19]:
df_test=X_test.copy()

In [20]:
df_test["loan_status"] = y_test
df_test["predicted_loan_status"] = le.inverse_transform(y_pred)

In [29]:
print(f"Accuracy: {round(sum(y_test==y_pred)/len(y_test),3)}")

Accuracy: 0.928


In [33]:
feature_importance=catboost_model.feature_importances_
feature_importance.__len__()

14

In [36]:
catboost_model.feature_names_

['person_age',
 'person_gender',
 'person_education',
 'person_income',
 'person_emp_exp',
 'person_home_ownership',
 'loan_amnt',
 'loan_intent',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_cred_hist_length',
 'credit_score',
 'previous_loan_defaults_on_file',
 'person_education_processed']

In [40]:
feature_importance=[round(x,4) for x in feature_importance]

In [41]:
list(zip(catboost_model.feature_names_,feature_importance))

[('person_age', np.float64(0.5031)),
 ('person_gender', np.float64(0.0096)),
 ('person_education', np.float64(0.0478)),
 ('person_income', np.float64(5.8314)),
 ('person_emp_exp', np.float64(0.158)),
 ('person_home_ownership', np.float64(4.4745)),
 ('loan_amnt', np.float64(0.8721)),
 ('loan_intent', np.float64(2.592)),
 ('loan_int_rate', np.float64(4.7766)),
 ('loan_percent_income', np.float64(10.8608)),
 ('cb_person_cred_hist_length', np.float64(0.2871)),
 ('credit_score', np.float64(1.6574)),
 ('previous_loan_defaults_on_file', np.float64(67.8485)),
 ('person_education_processed', np.float64(0.0812))]

In [46]:
import plotly.express as px

In [47]:
px.bar(x=catboost_model.feature_names_,y=feature_importance,title="Feature Importance",labels={"x":"Features","y":"Importance"})