This notebook trains a Decision Tree Classifier (and compares it against other classifiers) for loan approval prediction.

In [1]:
#Import all needed imports

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.neighbors import KNeighborsClassifier
import pickle
from sklearn.linear_model import LogisticRegression

### Step 1 Data Preprocessing

In [None]:
# Read the loan approval CSV into a DataFrame and preview its first ten rows
file_path = 'loan_approval_dataset.csv'
loan_ds = pd.read_csv(filepath_or_buffer=file_path)
loan_ds.head(n=10)

In [None]:
# Count nulls per column and report only the columns that actually have some
missing_values = loan_ds.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]
print("Columns with missing values:\n", columns_with_missing)
# Strip surrounding whitespace from the column NAMES (cell values are not touched here)
loan_ds.columns = loan_ds.columns.str.strip()
print(loan_ds.dtypes)

In [None]:
# Encode the categorical columns as 0/1 flags.
# Each entry maps a column to the label that becomes 1 (compared after stripping
# whitespace from the cell); every other value becomes 0.
positive_labels = {
    'loan_status': 'Approved',
    'education': 'Graduate',
    'self_employed': 'Yes',
}
for column, positive in positive_labels.items():
    # Bind `positive` as a default so the lambda does not depend on the loop variable
    loan_ds[column] = loan_ds[column].apply(
        lambda value, positive=positive: 1 if value.strip() == positive else 0
    )
print(loan_ds.dtypes)

In [5]:
# Force these columns to a numeric dtype; errors='coerce' turns unparseable entries into NaN
numeric_columns = ['no_of_dependents', 'income_annum', 'loan_amount', 'cibil_score']
for column in numeric_columns:
    loan_ds[column] = pd.to_numeric(loan_ds[column], errors='coerce')

### Step 3 Model Development

In [None]:
# Separate the target (y) from the feature matrix (X): every column except loan_status
y = loan_ds['loan_status']
X = loan_ds.drop(columns='loan_status')
print("Features shape:", X.shape)
print("\nFeature columns:", X.columns.tolist())

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Build the four candidate classifiers first, then fit each on the training split.
# fit() returns the estimator itself, so each trained_* name refers to the same
# object as the estimator it was fitted from.
model = DecisionTreeClassifier(random_state=42)   # 1st model: decision tree
rf = RandomForestClassifier(random_state=42)      # 2nd model: random forest
lr = LogisticRegression(random_state=42)          # 3rd model: logistic regression
knn = KNeighborsClassifier(n_neighbors=33)        # 4th model: k-nearest neighbours

trained_model_tree = model.fit(X_train, y_train)
trained_model_forest = rf.fit(X_train, y_train)
trained_model_logistic_regression = lr.fit(X_train, y_train)
trained_knn = knn.fit(X_train, y_train)

### Step 4 Model Evaluation

In [None]:
# For every trained model, compute hard class predictions and the predicted
# probability of the positive class (column 1 of predict_proba) on the test split.
y_pred_tree = trained_model_tree.predict(X_test)
y_pred_proba_tree = trained_model_tree.predict_proba(X_test)[:, 1]

y_pred_forest = trained_model_forest.predict(X_test)
y_pred_proba = trained_model_forest.predict_proba(X_test)[:, 1]

y_pred_logistic_regression = trained_model_logistic_regression.predict(X_test)
# Fixed: probabilities must come from the logistic regression model itself;
# previously the random forest's probabilities were used here by mistake.
y_pred_proba_logistic_regression = trained_model_logistic_regression.predict_proba(X_test)[:, 1]

y_pred_knn = trained_knn.predict(X_test)
y_pred_proba_knn = trained_knn.predict_proba(X_test)[:, 1]

# Print the evaluation metrics for one classifier
def metrics(y_pred, y_pred_proba, classifier_name):
    """Print accuracy, precision, recall, F1 and AUC for one classifier.

    The true labels are read from the module-level ``y_test``, so the
    predictions passed in must correspond to the current test split.

    Args:
        y_pred: Predicted class labels, scored against ``y_test``.
        y_pred_proba: Scores passed to ``roc_auc_score`` for the AUC.
        classifier_name: Name shown in the "Metrics for ..." header line.

    Returns:
        None. The results are only printed, each with four decimal places.
    """
    print(f"Metrics for {classifier_name}:")
    # Compute every score before printing so a failing metric prints nothing partial
    scores = [
        ("Accuracy", accuracy_score(y_test, y_pred)),
        ("Precision", precision_score(y_test, y_pred)),
        ("Recall", recall_score(y_test, y_pred)),
        ("F1-Score", f1_score(y_test, y_pred)),
        ("AUC", roc_auc_score(y_test, y_pred_proba)),
    ]
    report = "\n".join(f"{label}: {value:.4f}" for label, value in scores)
    # The extra newline leaves a blank line after the AUC row
    print(report + "\n")

# Report the metrics for each model, in the order the models were trained
for predictions, probabilities, label in (
    (y_pred_tree, y_pred_proba_tree, "Decision Tree"),
    (y_pred_forest, y_pred_proba, "Random Forest"),
    (y_pred_logistic_regression, y_pred_proba_logistic_regression, "Logisitic Regression"),
    (y_pred_knn, y_pred_proba_knn, "K Nearest Neighbors"),
):
    metrics(predictions, probabilities, label)

### Top 8 Features

In [None]:
# Rank the decision tree's features by importance, most important first
importances = model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': X_train.columns, 'importance': importances}
).sort_values('importance', ascending=False)
# Fixed: after a descending sort the most important features are at the head.
# The previous tail(14) returned the least important rows (or every row).
top_eight_features = feature_importances.head(8)
print(top_eight_features)

### Retrain/test with top 8 features

In [None]:
# NOTE: We will use no_of_dependents because can't ask users for loan id
# Rebuild the feature matrix without the target and the dropped feature columns
X = loan_ds.drop(['loan_status', 'loan_id', 'bank_asset_value', 'education', 'self_employed'], axis=1)
y = loan_ds['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#1st model DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
trained_model_tree = model.fit(X_train, y_train)

#2nd model Random Forest Classifier
# Fixed: fit the freshly created rf_final. Previously rf.fit(...) was called,
# which left rf_final untrained and silently refit the first-pass forest.
rf_final = RandomForestClassifier(random_state=42)
trained_model_forest_final = rf_final.fit(X_train, y_train)

#3rd model Logistic Regression
lr = LogisticRegression(random_state=42)
trained_model_logistic_regression = lr.fit(X_train, y_train)

#4th model KNN
# Played around with hyperparameter and found 33 to do OK
knn = KNeighborsClassifier(n_neighbors=33)
trained_knn = knn.fit(X_train, y_train)

# Fixed: recompute the predictions from the retrained models. Without this the
# metrics below would report the stale predictions of the first-pass models.
y_pred_tree = trained_model_tree.predict(X_test)
y_pred_proba_tree = trained_model_tree.predict_proba(X_test)[:, 1]

y_pred_forest = trained_model_forest_final.predict(X_test)
y_pred_proba = trained_model_forest_final.predict_proba(X_test)[:, 1]

y_pred_logistic_regression = trained_model_logistic_regression.predict(X_test)
y_pred_proba_logistic_regression = trained_model_logistic_regression.predict_proba(X_test)[:, 1]

y_pred_knn = trained_knn.predict(X_test)
y_pred_proba_knn = trained_knn.predict_proba(X_test)[:, 1]

metrics(y_pred_tree, y_pred_proba_tree, "Decision Tree")
metrics(y_pred_forest, y_pred_proba, "Random Forest")
metrics(y_pred_logistic_regression, y_pred_proba_logistic_regression, "Logisitic Regression")
metrics(y_pred_knn, y_pred_proba_knn, "K Nearest Neighbors")

In [None]:
# Plot the ROC curve of the retrained random forest (the model that gets saved).
# Fixed: score the current test split with the retrained forest instead of
# reusing y_pred_proba, which was computed from the first-pass forest.
final_forest_proba = trained_model_forest_final.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, final_forest_proba)
final_forest_auc = roc_auc_score(y_test, final_forest_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {final_forest_auc:.4f})')
# Diagonal reference line: a classifier with no discriminative power
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Show the feature layout the saved model expects, then persist the final forest
print(X.head())
ml_model = 'forest.sav'
# Fixed: use a context manager so the file handle is flushed and closed;
# the previous bare open() call was never closed.
with open(ml_model, 'wb') as model_file:
    pickle.dump(trained_model_forest_final, model_file)