In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score,auc,roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestClassifier
from pandas import DataFrame
import pandas as pd
from typing import Optional

In [21]:
def load_data_to_dta_frame(filepath:str)->Optional[DataFrame]:
  '''
  Read the CSV file at ``filepath`` into a pandas data frame.

  A missing file is not treated as fatal: a short message is printed and
  ``None`` is handed back so the caller can decide what to do.

  Args:
      filepath: str: location of the csv file to read
  Returns:
       Optional[DataFrame]: the parsed data frame, or None when no file
       exists at ``filepath``
  '''
  try:
    frame = pd.read_csv(filepath)
  except FileNotFoundError:
    # Only the "file is missing" case is handled here; any other read or
    # parse error still propagates to the caller.
    print(f'file not found at {filepath}')
    frame = None
  return frame
# Load the pre-processed credit-risk dataset that every later cell works on.
df = load_data_to_dta_frame('../data/processed/credit_risk_dataset.csv')
# The loader signals a missing file by returning None. Stop right here with a
# clear error instead of letting a later cell fail with an opaque
# "'NoneType' object has no attribute ..." AttributeError.
if df is None:
  raise FileNotFoundError('credit risk dataset could not be loaded; check the path passed to load_data_to_dta_frame')


In [22]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,3.135494,10.98531,rent,4.820282,personal,d,10.463132,1.625226,1,0.463734,y,1.386294
1,3.091042,9.169623,own,1.791759,education,b,6.908755,0.041636,0,0.09531,n,1.098612
2,3.258097,9.169623,mortgage,0.693147,medical,c,8.612685,0.603032,1,0.451076,n,1.386294
3,3.178054,11.089821,rent,1.609438,medical,c,10.463132,1.368866,1,0.425268,n,1.098612
4,3.218876,10.904138,rent,2.197225,medical,c,10.463132,1.05734,1,0.438255,y,1.609438


In [23]:
# One-hot encode the categorical (string) columns; numeric columns are left
# as they are.
df = pd.get_dummies(data=df)

In [24]:
# Two-stage split with a fixed seed: first hold out 20% of the rows for
# testing, then carve 25% of the remaining rows off for validation
# (0.25 * 0.8 = 0.2 of the data, i.e. a 60/20/20 train/val/test split).
df_full, df_test = train_test_split(df, random_state=42, test_size=0.2)
df_train, df_val = train_test_split(df_full, random_state=42, test_size=0.25)

In [25]:
# Give each split a fresh 0..n-1 index; the shuffled original row labels are
# discarded (drop=True) rather than kept as a column.
df_train, df_val, df_test = (
  part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [26]:
# Pull the target column ('loan_status') out of every split.
y_train, y_val, y_test = (
  part['loan_status'] for part in (df_train, df_val, df_test)
)

In [27]:
# Remove the target from the feature frames so the models cannot train on it.
# drop(..., errors='ignore') keeps this cell safe to re-run: the previous
# `del frame['loan_status']` raised KeyError on a second execution because the
# column was already gone.
df_train = df_train.drop(columns=['loan_status'], errors='ignore')
df_val = df_val.drop(columns=['loan_status'], errors='ignore')
df_test = df_test.drop(columns=['loan_status'], errors='ignore')

In [28]:
# Preview the first five rows of the training features (target removed).
df_train.head(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_mortgage,person_home_ownership_other,person_home_ownership_own,...,loan_intent_venture,loan_grade_a,loan_grade_b,loan_grade_c,loan_grade_d,loan_grade_e,loan_grade_f,loan_grade_g,cb_person_default_on_file_n,cb_person_default_on_file_y
0,3.178054,10.626121,2.079442,9.680406,0.265545,0.329304,1.098612,True,False,False,...,False,False,True,False,False,False,False,False,True,False
1,3.218876,11.264477,0.0,9.878221,1.394827,0.223144,1.609438,True,False,False,...,False,False,False,False,True,False,False,False,True,False
2,3.89182,11.170506,1.791759,9.680406,-0.571681,0.207014,2.890372,False,False,False,...,False,False,True,False,False,False,False,False,True,False
3,3.258097,10.491302,1.098612,9.528867,0.716609,0.322083,1.609438,True,False,False,...,False,False,False,True,False,False,False,False,True,False
4,3.332205,10.942014,1.098612,9.546884,0.927538,0.223144,2.197225,False,False,False,...,False,False,False,False,True,False,False,False,True,False


In [30]:
# (rows, columns) of the validation, training and test feature frames, in
# that order.
tuple(frame.shape for frame in (df_val, df_train, df_test))

((6516, 26), (19548, 26), (6517, 26))

In [35]:
## Random Forest Classifier
# Seed the forest so that the fitted model and the validation accuracy printed
# below are reproducible across kernel restarts; 42 matches the random_state
# used for the train/validation/test splits.
model3 = RandomForestClassifier(random_state=42)
model3.fit(df_train,y_train)
# Evaluate on the validation split only; the test split stays untouched.
y_pred_model3 = model3.predict(df_val)
accuracy = accuracy_score(y_val,y_pred_model3)
print("Accuracy score of Random Forest: ", accuracy)

Accuracy score of Random Forest:  0.9352363413136894


In [39]:
# K-nearest-neighbours model (k = 5): fit on the training split and report
# accuracy on the validation split.
model4 = KNeighborsClassifier(n_neighbors=5).fit(df_train, y_train)
y_pred_model4 = model4.predict(df_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_model4)
print("Accuracy score of KNeighbors: ", accuracy)

Accuracy score of KNeighbors:  0.8729281767955801
