In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score,auc,roc_auc_score
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestClassifier
from pandas import DataFrame
import pandas as pd
from typing import Optional

In [3]:
def load_data_to_dta_frame(filepath:str)->Optional[DataFrame]:
  '''
  Load a csv file to a pandas data frame
  Args:
      filepath: str: path to the csv file
  Returns:
       DataFrame: a pandas data frame
  '''
  try:
    return pd.read_csv(filepath)
  except FileNotFoundError:
    print(f'file not found at {filepath}')
    return None
df = load_data_to_dta_frame('../data/processed/credit_risk_dataset.csv')


In [5]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,3.135494,10.98531,rent,4.820282,personal,d,10.463132,1.625226,1,0.463734,y,1.386294
1,3.091042,9.169623,own,1.791759,education,b,6.908755,0.041636,0,0.09531,n,1.098612
2,3.258097,9.169623,mortgage,0.693147,medical,c,8.612685,0.603032,1,0.451076,n,1.386294
3,3.178054,11.089821,rent,1.609438,medical,c,10.463132,1.368866,1,0.425268,n,1.098612
4,3.218876,10.904138,rent,2.197225,medical,c,10.463132,1.05734,1,0.438255,y,1.609438


In [6]:
df_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full, test_size=0.25, random_state=42)

In [8]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [9]:
y_train = df_train['loan_status']
y_val = df_val['loan_status']
y_test = df_test['loan_status']

In [10]:
del df_train['loan_status']
del df_val['loan_status']
del df_test['loan_status']

In [11]:
df_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,3.178054,10.626121,mortgage,2.079442,education,b,9.680406,0.265545,0.329304,n,1.098612
1,3.218876,11.264477,mortgage,0.0,debtconsolidation,d,9.878221,1.394827,0.223144,n,1.609438
2,3.89182,11.170506,rent,1.791759,personal,b,9.680406,-0.571681,0.207014,n,2.890372
3,3.258097,10.491302,mortgage,1.098612,debtconsolidation,c,9.528867,0.716609,0.322083,n,1.609438
4,3.332205,10.942014,rent,1.098612,homeimprovement,d,9.546884,0.927538,0.223144,n,2.197225


In [12]:
train_dict = df_train.to_dict(orient='records')

In [13]:
dv = DictVectorizer(sparse=False)

In [16]:
X_train = dv.fit_transform(train_dict)

In [17]:
dv.get_feature_names_out()

array(['cb_person_cred_hist_length', 'cb_person_default_on_file=n',
       'cb_person_default_on_file=y', 'loan_amnt', 'loan_grade=a',
       'loan_grade=b', 'loan_grade=c', 'loan_grade=d', 'loan_grade=e',
       'loan_grade=f', 'loan_grade=g', 'loan_int_rate',
       'loan_intent=debtconsolidation', 'loan_intent=education',
       'loan_intent=homeimprovement', 'loan_intent=medical',
       'loan_intent=personal', 'loan_intent=venture',
       'loan_percent_income', 'person_age', 'person_emp_length',
       'person_home_ownership=mortgage', 'person_home_ownership=other',
       'person_home_ownership=own', 'person_home_ownership=rent',
       'person_income'], dtype=object)

In [18]:
dt = DecisionTreeClassifier(criterion='entropy', max_depth=10,min_samples_leaf=1,min_samples_split=15)
dt.fit(X_train, y_train)

In [19]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [20]:
y_pred = dt.predict_proba(X_val)

In [21]:
roc_auc_score(y_val, y_pred[:,1])

0.9126487057864963

In [22]:
y_pred = dt.predict_proba(X_train)
roc_auc_score(y_train, y_pred[:,1])

0.9343041188375135

In [32]:
print(f"Training set class distribution: {y_train.value_counts()}")
print(f"Test set class distribution: {y_test.value_counts()}")

Training set class distribution: loan_status
0    15297
1     4251
Name: count, dtype: int64
Test set class distribution: loan_status
0    5072
1    1445
Name: count, dtype: int64


In [113]:
print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- loan_percent_income <= 0.27
|   |--- loan_int_rate <= 0.96
|   |   |--- person_income <= 9.90
|   |   |   |--- loan_percent_income <= 0.14
|   |   |   |   |--- person_home_ownership=own <= 0.50
|   |   |   |   |   |--- person_emp_length <= 0.35
|   |   |   |   |   |   |--- loan_int_rate <= 0.62
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- loan_int_rate >  0.62
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |--- person_emp_length >  0.35
|   |   |   |   |   |   |--- loan_intent=homeimprovement <= 0.50
|   |   |   |   |   |   |   |--- person_income <= 9.43
|   |   |   |   |   |   |   |   |--- loan_int_rate <= -1.08
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |--- loan_int_rate >  -1.08
|   |   |   |   |   |   |   |   |   |--- person_income <= 9.08
|   |   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |   |--- person_income >  9.08
|   |   |   |   |   |   |   |   |   |   |--

In [None]:

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 10,15, None],
    'min_samples_split': [2, 5, 10,15],
    'min_samples_leaf': [1, 2, 4,5],
    'max_features': [None, 'sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}

# Define the GridSearch
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, 
                           cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 960 candidates, totalling 4800 fits
[CV] END class_weight=None, criterion=gini, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END class_weight=None, criterion=gini, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END class_weight=None, criterion=gini, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END class_weight=None, criterion=gini, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END class_weight=None, criterion=gini, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END class_weight=None, criterion=gini, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END class_weight=None, criterion=gini, max_depth=3, max_features=None, min_samples_leaf=1, min_samples_split=2; tot