# Final Decision Tree Models

In [12]:
from preprocessing import get_preprocessed_df
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from xgboost import XGBClassifier
import multiprocessing as mp
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [13]:
def get_results(holdout, model, chosen_features, y_test, y_pred):
    holdout_true = holdout[' loan_status']
    # holdout.drop(columns=[' loan_status'], inplace=True)
    holdout_pred = model.predict(holdout[chosen_features])
    val_acc = accuracy_score(y_test, y_pred)
    holdout_f1 = f1_score(holdout_true, holdout_pred)
    holdout_acc = accuracy_score(holdout_true, holdout_pred)
    return val_acc, holdout_f1, holdout_acc



# Vanilla Tree

In [14]:
def vanilla_tree():
    x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()
    model = DecisionTreeClassifier(random_state=42)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    holdout_true = holdout[' loan_status']
    holdout.drop(columns=[' loan_status'], inplace=True)
    holdout_pred = model.predict(holdout)
    
    print('------------------------------------ Without Cibil Score ------------------------------------')

    print("F1:", f1_score(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Holdout F1:", f1_score(holdout_true, holdout_pred))
    print("Holdout Accuracy:", accuracy_score(holdout_true, holdout_pred))

    x_train, x_test, y_train, y_test, holdout = get_preprocessed_df(with_cibil=True)
    model = DecisionTreeClassifier(random_state=42)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    holdout_true = holdout[' loan_status']
    holdout.drop(columns=[' loan_status'], inplace=True)
    holdout_pred = model.predict(holdout)

    print('------------------------------------ With Cibil Score ------------------------------------')
    print(f'holdout f1: {f1_score(holdout_true, holdout_pred)}')
    print(f'holdout acc: {accuracy_score(holdout_true, holdout_pred)}')


vanilla_tree()

------------------------------------ Without Cibil Score ------------------------------------
F1: 0.7667870036101083
Accuracy: 0.6217798594847775
Holdout F1: 0.76410998552822
Holdout Accuracy: 0.6182669789227166
------------------------------------ With Cibil Score ------------------------------------
holdout f1: 0.76410998552822
holdout acc: 0.6182669789227166


# Decision Tree Classifier

In [15]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

features = [' col_times_term',
     ' loan_term',
     ' lux_times_res',
     ' self_employed_ No',
     ' no_of_dependents',
     ' income_annum',
     ' loan_amount',
     ' self_employed_ Yes',
     ' education_ Graduate',
     ' term_times_income',
     ' residential_assets_value',
     ' loan_coll_ratio',
     ' loan_income_ratio',
     ' commercial_assets_value',
     ' bank_asset_value']
number_of_features = len(features)



hyperparameters = {
    'max_depth': 27,
    'min_samples_split': 13,
    'min_samples_leaf': 5,
    'max_features': None,
    'criterion': 'log_loss',
    'random_state': 42,
    'class_weight': None,
    'splitter': 'best',
    'max_leaf_nodes': 52,
    'min_impurity_decrease': 0.0015640478078397374,
    'min_weight_fraction_leaf': 0.001673957360422289,
}

model = DecisionTreeClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)


print('------------------------------------ Without Cibil Score ------------------------------------')
print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')
print(f'val acc: {val_acc}')
print(f'holdout acc: {holdout_acc}')

x_train, x_test, y_train, y_test, holdout = get_preprocessed_df(with_cibil=True)
features.append(' cibil_score')
model = DecisionTreeClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)

print('------------------------------------ With Cibil Score ------------------------------------')
print(f'holdout f1: {holdout_f1}')
print(f'holdout acc: {holdout_acc}')

------------------------------------ Without Cibil Score ------------------------------------
val f1: 0.7667870036101083
holdout f1: 0.76410998552822
val acc: 0.6217798594847775
holdout acc: 0.6182669789227166
------------------------------------ With Cibil Score ------------------------------------
holdout f1: 0.76410998552822
holdout acc: 0.6182669789227166


# eXtreme Gradient Boosting Classifier

In [16]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

features = [' no_of_dependents',
     ' loan_coll_ratio',
     ' loan_income_ratio',
     ' commercial_assets_value',
     ' bank_asset_value']
number_of_features = len(features)



hyperparameters = {
    'max_depth': 4,
    'random_state': 42,
    'min_split_loss': 3,
    'min_child_weight': 14,
    'subsample': 0.868207131636942,
    'reg_lambda': 3,
    'reg_alpha': 2,
    'learning_rate': 0.04797473982907923
}

model = XGBClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)


print('------------------------------------ Without Cibil Score ------------------------------------')
print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')
print(f'val acc: {val_acc}')
print(f'holdout acc: {holdout_acc}')

x_train, x_test, y_train, y_test, holdout = get_preprocessed_df(with_cibil=True)
features.append(' cibil_score')
model = XGBClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)

print('------------------------------------ With Cibil Score ------------------------------------')
print(f'holdout f1: {holdout_f1}')
print(f'holdout acc: {holdout_acc}')

------------------------------------ Without Cibil Score ------------------------------------
val f1: 0.7667870036101083
holdout f1: 0.76410998552822
val acc: 0.6217798594847775
holdout acc: 0.6182669789227166
------------------------------------ With Cibil Score ------------------------------------
holdout f1: 0.76410998552822
holdout acc: 0.6182669789227166


# Gaussian Naive Bayes

In [17]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

features = [' no_of_dependents',
     ' loan_amount',
     ' self_employed_ No',
     ' income_annum',
     ' self_employed_ Yes',
     ' residential_assets_value',
     ' term_times_income',
     ' education_ Not Graduate',
     ' education_ Graduate',
     ' loan_coll_ratio',
     ' loan_income_ratio',
     ' commercial_assets_value',
     ' bank_asset_value']
number_of_features = len(features)

model = GaussianNB()
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)


print('------------------------------------ Without Cibil Score ------------------------------------')
print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')
print(f'val acc: {val_acc}')
print(f'holdout acc: {holdout_acc}')

x_train, x_test, y_train, y_test, holdout = get_preprocessed_df(with_cibil=True)
features.append(' cibil_score')
model = GaussianNB()
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)

print('------------------------------------ With Cibil Score ------------------------------------')
print(f'holdout f1: {holdout_f1}')
print(f'holdout acc: {holdout_acc}')

------------------------------------ Without Cibil Score ------------------------------------
val f1: 0.7667870036101083
holdout f1: 0.76410998552822
val acc: 0.6217798594847775
holdout acc: 0.6182669789227166
------------------------------------ With Cibil Score ------------------------------------
holdout f1: 0.76410998552822
holdout acc: 0.6182669789227166


# Random Forest Classifier

In [18]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

features = [' total_collateral',
     ' col_times_term',
     ' education_ Not Graduate',
     ' no_of_dependents',
     ' education_ Graduate',
     ' term_times_income',
     ' lux_times_res',
     ' loan_amount',
     ' loan_term',
     ' self_employed_ Yes',
     ' self_employed_ No',
     ' loan_coll_ratio',
     ' loan_income_ratio',
     ' commercial_assets_value',
     ' bank_asset_value']
number_of_features = len(features)


hyperparameters = {
    'max_depth': 26,
    'random_state': 42,
    'min_samples_split': 7,
    'min_samples_leaf': 5,
    'bootstrap': True,
    'warm_start': False,
    'min_weight_fraction_leaf': 0.02223794874814192,
    'n_estimators': 220,
    'criterion': 'entropy'
}

model = RandomForestClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)


print('------------------------------------ Without Cibil Score ------------------------------------')
print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')
print(f'val acc: {val_acc}')
print(f'holdout acc: {holdout_acc}')

x_train, x_test, y_train, y_test, holdout = get_preprocessed_df(with_cibil=True)
features.append(' cibil_score')
model = RandomForestClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)

print('------------------------------------ With Cibil Score ------------------------------------')
print(f'holdout f1: {holdout_f1}')
print(f'holdout acc: {holdout_acc}')

------------------------------------ Without Cibil Score ------------------------------------
val f1: 0.7667870036101083
holdout f1: 0.76410998552822
val acc: 0.6217798594847775
holdout acc: 0.6182669789227166
------------------------------------ With Cibil Score ------------------------------------
holdout f1: 0.76410998552822
holdout acc: 0.6182669789227166


# KNeighbors Classifier

In [19]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

features = [' no_of_dependents',
     ' education_ Not Graduate',
     ' luxury_assets_value',
     ' loan_coll_ratio',
     ' loan_income_ratio',
     ' commercial_assets_value',
     ' bank_asset_value']
number_of_features = len(features)


hyperparameters = {
    'weights': 'uniform',
    'n_neighbors': 31,
    'p': 2,
    'algorithm': 'auto'
}

model = KNeighborsClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)


print('------------------------------------ Without Cibil Score ------------------------------------')
print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')
print(f'val acc: {val_acc}')
print(f'holdout acc: {holdout_acc}')

x_train, x_test, y_train, y_test, holdout = get_preprocessed_df(with_cibil=True)
features.append(' cibil_score')
model = KNeighborsClassifier(**hyperparameters)
model.fit(x_train[features], y_train)
y_pred = model.predict(x_test[features])
f1 = f1_score(y_test, y_pred)
val_acc, holdout_f1, holdout_acc = get_results(holdout, model, features, y_test, y_pred)

print('------------------------------------ With Cibil Score ------------------------------------')
print(f'holdout f1: {holdout_f1}')
print(f'holdout acc: {holdout_acc}')

------------------------------------ Without Cibil Score ------------------------------------
val f1: 0.7667870036101083
holdout f1: 0.76410998552822
val acc: 0.6217798594847775
holdout acc: 0.6182669789227166
------------------------------------ With Cibil Score ------------------------------------
holdout f1: 0.76410998552822
holdout acc: 0.6182669789227166


# Model Outcomes Using GridSearch 

## Decision Tree Classifier

In [20]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

hyperparameters = {'criterion': 'entropy', 
    'max_depth': 4, 
    'max_leaf_nodes': 37, 
    'min_impurity_decrease': 0.006, 
    'min_samples_leaf': 2, 
    'min_samples_split': 8, 
    'min_weight_fraction_leaf': 0}

model = DecisionTreeClassifier(**hyperparameters)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
f1 = f1_score(y_test, y_pred)
holdout_true = holdout[' loan_status']
holdout.drop(columns=[' loan_status'], inplace=True)
holdout_pred = model.predict(holdout)
holdout_f1 = f1_score(holdout_true, holdout_pred)

print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')

val f1: 0.7667870036101083
holdout f1: 0.76410998552822


## XGradient Boosting Classifier

In [21]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

parameters = {'learning_rate': 0.001, 
    'max_depth': 4, 
    'min_child_weight': 3, 
    'min_split_loss': 6, 
    'reg_alpha': 2, 
    'reg_lambda': 3, 
    'random_state': 42,
    'subsample': 0.9}

xgb = XGBClassifier(**parameters)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
f1 = f1_score(y_test, y_pred)
holdout_true = holdout[' loan_status']
holdout.drop(columns=[' loan_status'], inplace=True)
holdout_pred = model.predict(holdout)
holdout_f1 = f1_score(holdout_true, holdout_pred)

print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')

val f1: 0.7667870036101083
holdout f1: 0.76410998552822


## Gaussian NB

In [22]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

parameters = {'var_smoothing': 1.0}

model = GaussianNB(**parameters)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
f1 = f1_score(y_test, y_pred)
holdout_true = holdout[' loan_status']
holdout.drop(columns=[' loan_status'], inplace=True)
holdout_pred = model.predict(holdout)
holdout_f1 = f1_score(holdout_true, holdout_pred)

print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')

val f1: 0.7667870036101083
holdout f1: 0.76410998552822


## Random Forest

In [23]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

hyperparameters = {'bootstrap': False, 
    'criterion': 'log_loss', 
    'max_depth': 20, 
    'min_samples_leaf': 2, 
    'min_samples_split': 4, 
    'min_weight_fraction_leaf': 0.05, 
    'n_estimators': 250, 
    'warm_start': False}

model = RandomForestClassifier(**hyperparameters)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
f1 = f1_score(y_test, y_pred)
holdout_true = holdout[' loan_status']
holdout.drop(columns=[' loan_status'], inplace=True)
holdout_pred = model.predict(holdout)
holdout_f1 = f1_score(holdout_true, holdout_pred)

print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')




val f1: 0.7667870036101083
holdout f1: 0.76410998552822


## KNeighbors

In [24]:
x_train, x_test, y_train, y_test, holdout = get_preprocessed_df()

parameters = {'algorithm': 'auto', 
    'n_neighbors': 31, 
    'p': 2, 
    'weights': 'uniform'}

model = KNeighborsClassifier(**parameters)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
f1 = f1_score(y_test, y_pred)
holdout_true = holdout[' loan_status']
holdout.drop(columns=[' loan_status'], inplace=True)
holdout_pred = model.predict(holdout)
holdout_f1 = f1_score(holdout_true, holdout_pred)

print(f'val f1: {f1}')
print(f'holdout f1: {holdout_f1}')

val f1: 0.7667870036101083
holdout f1: 0.76410998552822
