# Final Decision Tree Models

In [None]:
from preprocessing import get_preprocessed_df
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
import random
import winsound
import time
from xgboost import XGBClassifier
import multiprocessing as mp
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def get_results(holdout, model, chosen_features, y_test, y_pred):
    """Score a fitted model on the validation split and on the holdout set.

    Args:
        holdout: Table containing the feature columns and the
            ' loan_status' label column (presumably a pandas DataFrame,
            since ``.drop(columns=...)`` is called on it). It is NOT
            modified by this function.
        model: Fitted estimator exposing ``predict``.
        chosen_features: Column labels selected from ``holdout`` and passed
            to ``model.predict``.
        y_test: True labels of the validation split.
        y_pred: Predicted labels for the validation split.

    Returns:
        A tuple ``(val_acc, holdout_f1, holdout_acc)``: accuracy on the
        validation split, then F1 and accuracy on the holdout set.
    """
    holdout_true = holdout[' loan_status']
    # Remove the label on a copy instead of in place: an in-place drop would
    # strip ' loan_status' from the caller's frame, so a second call with the
    # same holdout would fail with KeyError. Dropping (rather than only
    # selecting chosen_features) still guarantees the label can never be fed
    # to the model as a feature.
    holdout_features = holdout.drop(columns=[' loan_status'])
    holdout_pred = model.predict(holdout_features[chosen_features])
    val_acc = accuracy_score(y_test, y_pred)
    holdout_f1 = f1_score(holdout_true, holdout_pred)
    holdout_acc = accuracy_score(holdout_true, holdout_pred)
    return val_acc, holdout_f1, holdout_acc



# Random Forest Classifier

In [6]:
# ---------------------------------------------------------------------------
# Random forest experiment: fit the same configuration twice, first on the
# default preprocessed data and then on the data loaded with
# with_cibil=True plus the ' cibil_score' feature, and print the scores.
# ---------------------------------------------------------------------------

# Run 1 data: default preprocessing call.
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

number_of_features = 9  # NOTE(review): not read anywhere in this cell
features = [
    ' no_of_dependents_5',
    ' lux_times_res',
    ' education_ Not Graduate',
    ' income_annum',
    ' self_employed_ No',
    ' loan_term',
    ' luxury_assets_value',
    ' total_collateral',
    ' col_times_term',
    ' self_employed_ Yes',
    ' loan_amount',
    ' no_of_dependents_0',
    ' no_of_dependents_1',
    ' loan_coll_ratio',
    ' loan_income_ratio',
    ' commercial_assets_value',
    ' bank_asset_value',
]

# Keyword arguments shared by both RandomForestClassifier instances below.
hyperparameters = dict(
    max_depth=26,
    random_state=42,
    min_samples_split=4,
    min_samples_leaf=4,
    bootstrap=True,
    warm_start=False,
    min_weight_fraction_leaf=0.02936400746539998,
    n_estimators=284,
    criterion='log_loss',
    n_jobs=-1,
)

# Fit on the training split, predict the validation split, then score both
# the validation predictions and the holdout set.
model = RandomForestClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(
    holdout, model, features, y_test, y_pred
)

print('------------------------------------ Without Cibil Score ------------------------------------')
print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')
print(f'val acc: {val_acc}')
print(f'holdout acc: {holdout_acc}')

# Run 2 data: reload with with_cibil=True and extend the same feature list
# in place with the ' cibil_score' column.
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df(with_cibil=True)
features += [' cibil_score']

model = RandomForestClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(
    holdout, model, features, y_test, y_pred
)

# Only the holdout metrics are reported for this run; the validation scores
# are computed above but not printed.
print('------------------------------------ With Cibil Score ------------------------------------')
print(f'holdout f1: {holdout_f1}')
print(f'holdout acc: {holdout_acc}')

------------------------------------ Without Cibil Score ------------------------------------
val f1: 0.7759245830311821
holdout f1: 0.7652173913043477
val acc: 0.6381733021077284
holdout acc: 0.6206088992974239
------------------------------------ With Cibil Score ------------------------------------
holdout f1: 0.9962121212121212
holdout acc: 0.9953161592505855
