In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score,auc,roc_auc_score
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestClassifier
from pandas import DataFrame
import pandas as pd
from typing import Optional

In [11]:
def load_data_to_dta_frame(filepath:str)->Optional[DataFrame]:
  '''
  Read a CSV file into a pandas DataFrame.
  Args:
      filepath: str: location of the CSV file to read
  Returns:
       Optional[DataFrame]: the parsed data frame, or None when no file
       exists at ``filepath`` (a message is printed in that case)
  '''
  frame = None
  try:
    frame = pd.read_csv(filepath)
  except FileNotFoundError:
    # Missing file is reported, not raised; the caller receives None.
    print(f'file not found at {filepath}')
  return frame
# Path is relative to the notebook's working directory.
DATA_PATH = '../data/processed/credit_risk_dataset.csv'
df = load_data_to_dta_frame(DATA_PATH)
# The loader returns None when the file is missing; stop here with a clear
# message instead of failing in a later cell with an AttributeError on None.
if df is None:
  raise FileNotFoundError(f'dataset not found at {DATA_PATH}')


In [12]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,3.135494,10.98531,1,4.820282,4,4,10.463132,1.625226,1,0.463734,1,1.386294
1,3.091042,9.169623,3,1.791759,1,2,6.908755,0.041636,0,0.09531,0,1.098612
2,3.258097,9.169623,2,0.693147,2,3,8.612685,0.603032,1,0.451076,0,1.386294
3,3.178054,11.089821,1,1.609438,2,3,10.463132,1.368866,1,0.425268,0,1.098612
4,3.218876,10.904138,1,2.197225,2,3,10.463132,1.05734,1,0.438255,1,1.609438


In [13]:
# Two-stage split: 20% is held out as the test set, then 25% of the remaining
# 80% becomes the validation set (roughly 60/20/20 train/val/test overall).
SPLIT_SEED = 42
df_full, df_test = train_test_split(df, random_state=SPLIT_SEED, test_size=0.2)
df_train, df_val = train_test_split(df_full, random_state=SPLIT_SEED, test_size=0.25)

In [14]:
# Give every split a fresh 0..n-1 index; the shuffled original index is discarded.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [15]:
# Pull the target column out of each split.
y_train, y_val, y_test = (
    part['loan_status'] for part in (df_train, df_val, df_test)
)

In [16]:
# Remove the target from the feature frames so it is not used as a feature.
# drop() returns new frames instead of mutating in place, and errors='ignore'
# keeps the cell safe to re-run once the column is already gone.
df_train = df_train.drop(columns='loan_status', errors='ignore')
df_val = df_val.drop(columns='loan_status', errors='ignore')
df_test = df_test.drop(columns='loan_status', errors='ignore')

In [20]:
# Preview the training features after the target column has been removed.
df_train.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,3.178054,10.626121,2,2.079442,1,2,9.680406,0.265545,0.329304,0,1.098612
1,3.218876,11.264477,2,0.0,5,4,9.878221,1.394827,0.223144,0,1.609438
2,3.89182,11.170506,1,1.791759,4,2,9.680406,-0.571681,0.207014,0,2.890372
3,3.258097,10.491302,2,1.098612,5,3,9.528867,0.716609,0.322083,0,1.609438
4,3.332205,10.942014,1,1.098612,6,4,9.546884,0.927538,0.223144,0,2.197225


In [22]:
# One dict per row (column name -> value): the input format DictVectorizer expects.
train_dict = df_train.to_dict('records')

In [23]:
# Vectorizer used to turn the per-row dicts into a feature matrix;
# sparse=False requests a dense array rather than a SciPy sparse matrix.
dv = DictVectorizer(sparse=False)

In [24]:
# Fit the vectorizer on the training records only and build the training matrix.
X_train = dv.fit_transform(train_dict)

In [25]:
# Show the feature names learned by the vectorizer.
dv.get_feature_names_out()

array(['cb_person_cred_hist_length', 'cb_person_default_on_file',
       'loan_amnt', 'loan_grade', 'loan_int_rate', 'loan_intent',
       'loan_percent_income', 'person_age', 'person_emp_length',
       'person_home_ownership', 'person_income'], dtype=object)

In [26]:
# Shallow tree (max_depth=3). random_state is fixed because scikit-learn
# randomly permutes the features at each split, so ties between equally good
# splits could otherwise produce a different tree from run to run.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(X_train, y_train)

In [34]:
# Encode the validation split with the already-fitted vectorizer
# (transform only, so nothing is re-learned from validation data).
val_dict = df_val.to_dict('records')
X_val = dv.transform(val_dict)

In [35]:
# Predicted class probabilities for the validation matrix (one column per class).
y_pred = dt.predict_proba(X_val)

In [37]:
# Validation ROC AUC, scored on the probability column at index 1.
# NOTE(review): presumably this is the probability of loan_status == 1 -
# confirm against dt.classes_.
roc_auc_score(y_val, y_pred[:,1])

0.8563187401760104

In [42]:
# ROC AUC on the training split, for comparison with the validation score.
# A separate name is used so the validation predictions held in y_pred are not
# overwritten (re-running the validation AUC cell would otherwise silently
# score training predictions against y_val).
y_pred_train = dt.predict_proba(X_train)
roc_auc_score(y_train, y_pred_train[:, 1])

0.853577476942195

In [41]:
# Render the fitted tree as text, labelling splits with the vectorizer's feature names.
tree_rules = export_text(dt, feature_names=dv.get_feature_names_out())
print(tree_rules)

|--- loan_percent_income <= 0.27
|   |--- loan_grade <= 3.50
|   |   |--- person_income <= 9.90
|   |   |   |--- class: 1
|   |   |--- person_income >  9.90
|   |   |   |--- class: 0
|   |--- loan_grade >  3.50
|   |   |--- person_emp_length <= 1.24
|   |   |   |--- class: 1
|   |   |--- person_emp_length >  1.24
|   |   |   |--- class: 0
|--- loan_percent_income >  0.27
|   |--- person_home_ownership <= 1.50
|   |   |--- class: 1
|   |--- person_home_ownership >  1.50
|   |   |--- person_income <= 9.90
|   |   |   |--- class: 1
|   |   |--- person_income >  9.90
|   |   |   |--- class: 0

