
# Part C: Model Building & Evaluation (Baseline Models)

This section builds and evaluates baseline machine learning models for predicting loan default using the preprocessed dataset. We will use Logistic Regression and a Decision Tree Classifier.


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# Load the preprocessed dataset (assumes the file was produced by previous steps).
df = pd.read_csv("loan_data_preprocessed.csv")

# Separate the feature columns from the target column.
X = df.drop(columns="not.fully.paid")
y = df["not.fully.paid"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in the train and test sets, which matters because the
# classes are unbalanced; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:

# Baseline 1: logistic regression. fit() returns the fitted estimator, so the
# model can be constructed and trained in a single expression.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
log_preds = log_model.predict(X_test)

# Baseline 2: decision tree, seeded so repeated runs give the same tree.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
tree_preds = tree_model.predict(X_test)


## Measuring Model Performance

In [None]:
# Import tools to help us measure how well the model did
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt  # used for showing the chart
import seaborn as sns  # makes our chart look nicer

# Helper that reports how well a model's predictions match the true labels
def evaluate_model_simple(actual_labels, predicted_labels, model_name):
    """Print a classification report and plot a confusion-matrix heatmap.

    Args:
        actual_labels: True class labels for the evaluated samples.
        predicted_labels: Labels the model predicted for the same samples.
        model_name: Display name used in the printed header and chart title.

    Returns:
        None. The report is printed and the chart is shown via ``plt.show()``.
    """
    # Header so the output of different models can be told apart
    print(f"\nHow did {model_name} perform?")

    # Text summary of per-class precision, recall, F1 and overall accuracy
    print("Here's the performance report:")
    report_text = classification_report(actual_labels, predicted_labels)
    print(report_text)

    # Counts of correct vs. incorrect predictions for each class pair
    counts = confusion_matrix(actual_labels, predicted_labels)

    # Draw the counts as an annotated heatmap; fmt='d' renders whole numbers.
    # sns.heatmap returns the Axes it drew on, which is then labelled directly.
    heatmap_ax = sns.heatmap(counts, annot=True, cmap="Blues", fmt='d')
    heatmap_ax.set_title(f"{model_name} Confusion Matrix")
    heatmap_ax.set_xlabel("Predicted Labels")  # what the model guessed
    heatmap_ax.set_ylabel("Actual Labels")     # what it should have guessed
    plt.show()

# Evaluate both baseline models against the held-out test labels.
# log_preds holds the Logistic Regression predictions,
# tree_preds holds the Decision Tree predictions.
evaluate_model_simple(
    actual_labels=y_test,
    predicted_labels=log_preds,
    model_name="Logistic Regression",
)
evaluate_model_simple(
    actual_labels=y_test,
    predicted_labels=tree_preds,
    model_name="Decision Tree",
)



### Model Comparison

Using the evaluation metrics above, compare the performance of the Logistic Regression and Decision Tree models. Because the classes are imbalanced, accuracy alone can be misleading, so focus on the F1 score and recall, and use these insights to select the better model for further improvement.



# Part D: Advanced Models & Feature Importance



In [None]:
# The ensemble classifiers are not imported in the cells above, so bring them
# into scope here; without this the cell fails with a NameError on a fresh kernel.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Train a Random Forest Classifier on the training split and predict on the
# test split. random_state fixes the seed so the results are reproducible.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# Train a Gradient Boosting Classifier the same way, with the same seed.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
gb_preds = gb_model.predict(X_test)
