# DATA PRE-PROCESSING

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

In [None]:
# Read the loan records from CSV into a DataFrame
file_path = '/content/Loan_default.csv'  # Edit to match where the CSV lives in your environment
loan_data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Drop the unique identifier column; it is not used as a feature.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
loan_data = loan_data.drop(columns='LoanID', errors='ignore')

In [None]:
# Categorical feature columns that get one-hot encoded in the following cell
categorical_cols = [
    'Education',
    'EmploymentType',
    'MaritalStatus',
    'HasMortgage',
    'HasDependents',
    'LoanPurpose',
    'HasCoSigner',
]

In [None]:
# Replace each categorical column with indicator (dummy) columns.
# drop_first=True keeps k-1 indicators for a feature with k levels.
loan_data = pd.get_dummies(
    data=loan_data,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Separate the 'Default' target column from the predictor columns
y = loan_data['Default']
X = loan_data.drop(columns='Default')

In [None]:
# Address class imbalance in target variable using SMOTE
sm = SMOTE(random_state=42)

# Before applying SMOTE, handle NaN values in 'y'
# Remove rows with NaN in 'Default' column
loan_data = loan_data.dropna(subset=['Default'])

# Split features and target AFTER handling NaNs
X = loan_data.drop('Default', axis=1)
y = loan_data['Default']

X_res, y_res = sm.fit_resample(X, y)

In [None]:
# Standardize numerical features
numerical_cols = ['Age', 'Income', 'LoanAmount', 'CreditScore',
                  'MonthsEmployed', 'NumCreditLines', 'InterestRate',
                  'LoanTerm', 'DTIRatio']

In [None]:
scaler = StandardScaler()
X_res[numerical_cols] = scaler.fit_transform(X_res[numerical_cols])

In [None]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

In [None]:
# Sanity check: partition shapes and the class balance the models will train on
train_shape, test_shape = X_train.shape, X_test.shape
print("Processed Data Shape:", train_shape, test_shape)
train_class_counts = y_train.value_counts()
print("Class Distribution in Training Set:", train_class_counts)

Processed Data Shape: (60947, 25) (15237, 25)
Class Distribution in Training Set: Default
1.0    30515
0.0    30432
Name: count, dtype: int64


# LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Construct the classifier with a fixed seed and fit it in one chained call
# (fit() returns the fitted estimator)
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Score the fitted model on the test partition
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Accuracy:",
      accuracy_score(y_true=y_test, y_pred=y_pred_lr))
print("Classification Report for Logistic Regression:\n",
      classification_report(y_true=y_test, y_pred=y_pred_lr))

Logistic Regression Accuracy: 0.8548926954124828
Classification Report for Logistic Regression:
               precision    recall  f1-score   support

         0.0       0.84      0.87      0.86      7660
         1.0       0.87      0.84      0.85      7577

    accuracy                           0.85     15237
   macro avg       0.86      0.85      0.85     15237
weighted avg       0.86      0.85      0.85     15237



# RANDOM FOREST CLASSIFIER

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest, fitted on the training partition in a single expression
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Test-partition predictions and metrics
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:",
      accuracy_score(y_true=y_test, y_pred=y_pred_rf))
print("Classification Report for Random Forest:\n",
      classification_report(y_true=y_test, y_pred=y_pred_rf))

Random Forest Accuracy: 0.9232132309509746
Classification Report for Random Forest:
               precision    recall  f1-score   support

         0.0       0.89      0.96      0.93      7660
         1.0       0.96      0.88      0.92      7577

    accuracy                           0.92     15237
   macro avg       0.93      0.92      0.92     15237
weighted avg       0.93      0.92      0.92     15237



# SUPPORT VECTOR MACHINE (SVM)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with library-default kernel settings and a fixed seed
svm_model = SVC(random_state=42).fit(X_train, y_train)

# Test-partition predictions and metrics
y_pred_svm = svm_model.predict(X_test)
print("SVM Accuracy:",
      accuracy_score(y_true=y_test, y_pred=y_pred_svm))
print("Classification Report for SVM:\n",
      classification_report(y_true=y_test, y_pred=y_pred_svm))

SVM Accuracy: 0.8962394172081118
Classification Report for SVM:
               precision    recall  f1-score   support

         0.0       0.86      0.95      0.90      7660
         1.0       0.94      0.84      0.89      7577

    accuracy                           0.90     15237
   macro avg       0.90      0.90      0.90     15237
weighted avg       0.90      0.90      0.90     15237



# DECISION TREE CLASSIFIER

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single seeded decision tree fitted on the training partition
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Test-partition predictions and metrics
y_pred_dt = dt_model.predict(X_test)
print("Decision Tree Accuracy:",
      accuracy_score(y_true=y_test, y_pred=y_pred_dt))
print("Classification Report for Decision Tree:\n",
      classification_report(y_true=y_test, y_pred=y_pred_dt))

Decision Tree Accuracy: 0.8493797991730656
Classification Report for Decision Tree:
               precision    recall  f1-score   support

         0.0       0.86      0.83      0.85      7660
         1.0       0.84      0.87      0.85      7577

    accuracy                           0.85     15237
   macro avg       0.85      0.85      0.85     15237
weighted avg       0.85      0.85      0.85     15237



# NEURAL NETWORK

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling are repeatable (same seed as the other models' random_state)
tf.keras.utils.set_random_seed(42)

# Carve a validation set out of the TRAINING data for early stopping.
# Using the test set here would let it choose the restored "best" weights,
# so the final test score would no longer be an unbiased estimate.
# shuffle (default) + stratify keep both classes in the validation set
# regardless of how the rows of X_train happen to be ordered.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Initialize the neural network model
nn_model = Sequential()

# Explicit Input object declares the feature width (replaces the deprecated
# `input_dim=` argument on the first Dense layer, which emits a warning)
nn_model.add(tf.keras.Input(shape=(X_train.shape[1],)))

# Hidden layers
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dropout(0.5))  # Dropout to prevent overfitting

# Output layer
nn_model.add(Dense(1, activation='sigmoid'))  # Binary classification (loan default or not)

# Compile the model
nn_model.compile(optimizer=Adam(learning_rate=0.001),
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

# Early stopping: halt when the validation LOSS has not improved for 10 epochs
# and roll back to the weights from the best epoch
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the neural network model
history = nn_model.fit(X_fit, y_fit,
                       epochs=50,
                       batch_size=32,
                       validation_data=(X_val, y_val),
                       callbacks=[early_stopping],
                       verbose=1)

# Predict on the untouched test set; threshold the sigmoid output at 0.5 and
# flatten the (n, 1) column to a 1-D label vector for the sklearn metrics
y_pred_nn = (nn_model.predict(X_test) > 0.5).astype('int32').ravel()

# Evaluate and print the performance
print("Neural Network Accuracy:", accuracy_score(y_test, y_pred_nn))
print("Classification Report for Neural Network:\n", classification_report(y_test, y_pred_nn))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.7906 - loss: 0.4420 - val_accuracy: 0.8716 - val_loss: 0.2965
Epoch 2/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.8649 - loss: 0.3094 - val_accuracy: 0.8757 - val_loss: 0.2825
Epoch 3/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8697 - loss: 0.2961 - val_accuracy: 0.8764 - val_loss: 0.2778
Epoch 4/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.8731 - loss: 0.2893 - val_accuracy: 0.8783 - val_loss: 0.2719
Epoch 5/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8747 - loss: 0.2826 - val_accuracy: 0.8810 - val_loss: 0.2694
Epoch 6/50
[1m1905/1905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8759 - loss: 0.2823 - val_accuracy: 0.8775 - val_loss: 0.2679
Epoch 7/50
[1m1905/1905

# K-NEAREST NEIGHBORS (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with library-default hyperparameters
knn_model = KNeighborsClassifier().fit(X_train, y_train)

# Test-partition predictions and metrics
y_pred_knn = knn_model.predict(X_test)
print("KNN Accuracy:",
      accuracy_score(y_true=y_test, y_pred=y_pred_knn))
print("Classification Report for KNN:\n",
      classification_report(y_true=y_test, y_pred=y_pred_knn))

KNN Accuracy: 0.8868543676576754
Classification Report for KNN:
               precision    recall  f1-score   support

         0.0       0.95      0.82      0.88      7660
         1.0       0.84      0.96      0.89      7577

    accuracy                           0.89     15237
   macro avg       0.89      0.89      0.89     15237
weighted avg       0.89      0.89      0.89     15237



# NAIVE BAYES CLASSIFIER

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with library-default settings, fitted on the training partition
nb_model = GaussianNB().fit(X_train, y_train)

# Test-partition predictions and metrics
y_pred_nb = nb_model.predict(X_test)
print("Naive Bayes Accuracy:",
      accuracy_score(y_true=y_test, y_pred=y_pred_nb))
print("Classification Report for Naive Bayes:\n",
      classification_report(y_true=y_test, y_pred=y_pred_nb))

Naive Bayes Accuracy: 0.8575178840979195
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

         0.0       0.85      0.87      0.86      7660
         1.0       0.86      0.85      0.86      7577

    accuracy                           0.86     15237
   macro avg       0.86      0.86      0.86     15237
weighted avg       0.86      0.86      0.86     15237

