# MODEL SELECTION TRAINING AND EVALUATION

In [86]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [60]:
# Load the dataset
df = pd.read_csv('credit.csv')

# FEATURE ENGINEEREING

In [61]:
# List of unwanted features
unwanted_features = ['ID', 'Customer_ID', 'Name', 'Age', 'SSN', 'Type_of_Loan', 'Num_Credit_Inquiries']

In [62]:
# Drop unwanted features
df = df.drop(unwanted_features, axis=1)

# Handle outliers

In [63]:
# Find the number of outliers in each numerical column
numerical_cols = ['Month','Annual_Income','Monthly_Inhand_Salary','Num_Bank_Accounts','Num_Credit_Card','Interest_Rate',
                  'Delay_from_due_date','Num_of_Delayed_Payment','Num_of_Loan','Changed_Credit_Limit',
                  'Credit_Utilization_Ratio','Outstanding_Debt','Credit_History_Age',
                  'Total_EMI_per_month', 'Amount_invested_monthly']

In [64]:
num_outliers = []
for col in numerical_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr
    outliers = df[(df[col] < lower_limit) | (df[col] > upper_limit)].shape[0]
    num_outliers.append(outliers)

print("Number of outliers in each column:")
for i, col in enumerate(numerical_cols):
    print(f"{col}: {num_outliers[i]} outliers detected")

Number of outliers in each column:
Month: 0 outliers detected
Annual_Income: 2000 outliers detected
Monthly_Inhand_Salary: 2017 outliers detected
Num_Bank_Accounts: 0 outliers detected
Num_Credit_Card: 0 outliers detected
Interest_Rate: 0 outliers detected
Delay_from_due_date: 4002 outliers detected
Num_of_Delayed_Payment: 0 outliers detected
Num_of_Loan: 0 outliers detected
Changed_Credit_Limit: 579 outliers detected
Credit_Utilization_Ratio: 4 outliers detected
Outstanding_Debt: 5272 outliers detected
Credit_History_Age: 0 outliers detected
Total_EMI_per_month: 5044 outliers detected
Amount_invested_monthly: 4464 outliers detected


In [65]:
# Cap outlier values
# Calculate the upper and lower limits for outlier capping
upper_limits = {}
lower_limits = {}
for col in numerical_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    upper_limits[col] = q3 + 1.5 * iqr
    lower_limits[col] = q1 - 1.5 * iqr

# Cap outlier values
for col in numerical_cols:
    df[col] = np.where(df[col] > upper_limits[col], upper_limits[col], df[col])
    df[col] = np.where(df[col] < lower_limits[col], lower_limits[col], df[col])

In [66]:
# Display the remaining outliers after processing
print("\nOutliers after processing:")
num_outliers = []
for col in numerical_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr
    outliers = df[(df[col] < lower_limit) | (df[col] > upper_limit)].shape[0]
    num_outliers.append(outliers)

print("Number of outliers in each column:")
for i, col in enumerate(numerical_cols):
    print(f"{col}: {num_outliers[i]} outliers detected")


Outliers after processing:
Number of outliers in each column:
Month: 0 outliers detected
Annual_Income: 0 outliers detected
Monthly_Inhand_Salary: 0 outliers detected
Num_Bank_Accounts: 0 outliers detected
Num_Credit_Card: 0 outliers detected
Interest_Rate: 0 outliers detected
Delay_from_due_date: 0 outliers detected
Num_of_Delayed_Payment: 0 outliers detected
Num_of_Loan: 0 outliers detected
Changed_Credit_Limit: 0 outliers detected
Credit_Utilization_Ratio: 0 outliers detected
Outstanding_Debt: 0 outliers detected
Credit_History_Age: 0 outliers detected
Total_EMI_per_month: 0 outliers detected
Amount_invested_monthly: 0 outliers detected


# One-hot encoding

In [67]:
categorical_cols = ['Occupation','Delay_from_due_date', 'Num_of_Delayed_Payment',
                    'Credit_Mix','Payment_Behaviour','Payment_of_Min_Amount']

In [68]:
df = pd.get_dummies(df, columns=categorical_cols)

# Label Encoding

In [69]:
target_column = 'Credit_Score'
label_encoder = LabelEncoder()
df[target_column] = label_encoder.fit_transform(df[target_column])
labels_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Encoding:")
for label, code in labels_mapping.items():
    print(f"{label}: {code}")

Label Encoding:
Good: 0
Poor: 1
Standard: 2


# MODEL SELECTION AND TRAINING

# Train Test Split 

In [85]:
# Divide the data into training, validation, and testing sets
X = df.drop('Credit_Score', axis=1)
y = df['Credit_Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Random Forest Classifier model

In [71]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the RandomForestClassifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit a RandomForestClassifier model
rf_model.fit(X_train, y_train)

# Predict the target variable for X_test
y_test_pred = rf_model.predict(X_test)

# Display accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print("\nRandom Forest Classifier Accuracy: {:.2f}%".format(accuracy * 100))



Random Forest Classifier Accuracy: 80.81%


# Gradient Boosting Classifier model

In [72]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting Classifier model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Fit a Gradient Boosting Classifier model
gb_model.fit(X_train, y_train)

# Predict the target variable for X_test
y_test_pred = gb_model.predict(X_test)

# Display accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print("\nGradient Boosting Classifier Accuracy: {:.2f}%".format(accuracy * 100))


Gradient Boosting Classifier Accuracy: 70.85%


# Decision Tree Classifier model

In [73]:
from sklearn.tree import DecisionTreeClassifier


# Initialize the Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)

# Fit the Decision Tree Classifier to the training data
dt_model.fit(X_train, y_train)

# Predict the target variable for X_test
y_test_pred = dt_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)

# Display accuracy
print("\nDecision Tree Classifier Accuracy: {:.2f}%".format(accuracy * 100))



Decision Tree Classifier Accuracy: 72.84%


# Support Vector Classifier model (SVC)

In [77]:
from sklearn.svm import SVC

# Initialize the Support Vector Classifier (SVC)
svc_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# Fit the SVC model to the training data
svc_model.fit(X_train, y_train)

# Predict the target variable for X_test
y_test_pred = svc_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)

# Display accuracy
print("\nSupport Vector Classifier Accuracy: {:.2f}%".format(accuracy * 100))


Support Vector Classifier Accuracy: 53.66%


# XGBoost model

In [76]:
import xgboost as xgb

# Convert data to DMatrix format, which is optimized for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Specify hyperparameters for XGBoost
params = {
    'objective': 'multi:softmax',  # Multi-class classification
    'num_class': 3,  # Number of classes (replace with your actual number of classes)
    'max_depth': 6,  # Maximum depth of trees
    'eta': 0.3,  # Learning rate
    'gamma': 0,  # Minimum loss reduction required to make a further partition
    'subsample': 0.8,  # Subsample ratio of the training instances
    'colsample_bytree': 0.8,  # Subsample ratio of columns when constructing each tree
    'eval_metric': 'merror'  # Evaluation metric (merror for multi-class classification error)
}

# Train the XGBoost model
num_rounds = 100  # Number of boosting rounds
bst = xgb.train(params, dtrain, num_rounds)

# Make predictions on the test data
y_test_pred = bst.predict(dtest)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)

# Display accuracy
print("\nXGBoost Classifier Accuracy: {:.2f}%".format(accuracy * 100))



XGBoost Classifier Accuracy: 76.50%


# Naive Bayes classifier Model

In [75]:
from sklearn.naive_bayes import GaussianNB

# Initialize the Naive Bayes classifier
nb_model = GaussianNB()

# Train the Naive Bayes classifier on the training data
nb_model.fit(X_train, y_train)

# Predict the target variable for X_test
y_test_pred = nb_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)

# Display accuracy
print("\nNaive Bayes Classifier Accuracy: {:.2f}%".format(accuracy * 100))


Naive Bayes Classifier Accuracy: 60.19%


# MODEL EVALUATION

# Accuracy for each model

In [79]:
from sklearn.metrics import accuracy_score

# Calculate accuracy for each model
accuracy_rf = accuracy_score(y_test, rf_model.predict(X_test))
accuracy_gb = accuracy_score(y_test, gb_model.predict(X_test))
accuracy_dt = accuracy_score(y_test, dt_model.predict(X_test))
accuracy_svc = accuracy_score(y_test, svc_model.predict(X_test))
accuracy_xgb = accuracy_score(y_test, bst.predict(dtest))
accuracy_nb = accuracy_score(y_test, nb_model.predict(X_test))

print("Random Forest Classifier Accuracy: {:.2f}%".format(accuracy_rf * 100))
print("Gradient Boosting Classifier Accuracy: {:.2f}%".format(accuracy_gb * 100))
print("Decision Tree Classifier Accuracy: {:.2f}%".format(accuracy_dt * 100))
print("Support Vector Classifier Accuracy: {:.2f}%".format(accuracy_svc * 100))
print("XGBoost Classifier Accuracy: {:.2f}%".format(accuracy_xgb * 100))
print("Naive Bayes Classifier Accuracy: {:.2f}%".format(accuracy_nb * 100))


Random Forest Classifier Accuracy: 80.81%
Gradient Boosting Classifier Accuracy: 70.85%
Decision Tree Classifier Accuracy: 72.84%
Support Vector Classifier Accuracy: 53.66%
XGBoost Classifier Accuracy: 76.50%
Naive Bayes Classifier Accuracy: 60.19%


# Confusion matrix for each model

In [80]:
from sklearn.metrics import confusion_matrix

# Calculate confusion matrix for each model
cm_rf = confusion_matrix(y_test, rf_model.predict(X_test))
cm_gb = confusion_matrix(y_test, gb_model.predict(X_test))
cm_dt = confusion_matrix(y_test, dt_model.predict(X_test))
cm_svc = confusion_matrix(y_test, svc_model.predict(X_test))
cm_nb = confusion_matrix(y_test, nb_model.predict(X_test))

print("Confusion Matrix for Random Forest Classifier:")
print(cm_rf)
print("\nConfusion Matrix for Gradient Boosting Classifier:")
print(cm_gb)
print("\nConfusion Matrix for Decision Tree Classifier:")
print(cm_dt)
print("\nConfusion Matrix for Support Vector Classifier:")
print(cm_svc)
print("\nConfusion Matrix for Naive Bayes Classifier:")
print(cm_nb)


Confusion Matrix for Random Forest Classifier:
[[2733    4  827]
 [ 122 4747  990]
 [ 761 1133 8683]]

Confusion Matrix for Gradient Boosting Classifier:
[[2423   21 1120]
 [ 539 3572 1748]
 [1297 1105 8175]]

Confusion Matrix for Decision Tree Classifier:
[[2320  172 1072]
 [ 151 4165 1543]
 [1037 1456 8084]]

Confusion Matrix for Support Vector Classifier:
[[   0   70 3494]
 [   0 1738 4121]
 [   0 1583 8994]]

Confusion Matrix for Naive Bayes Classifier:
[[2933   98  533]
 [1039 4199  621]
 [3146 2525 4906]]


# Precision for each model

In [84]:
from sklearn.metrics import precision_score

# Calculate precision for each model
precision_rf = precision_score(y_test, rf_model.predict(X_test), average='weighted', zero_division=0)
precision_gb = precision_score(y_test, gb_model.predict(X_test), average='weighted', zero_division=0)
precision_dt = precision_score(y_test, dt_model.predict(X_test), average='weighted', zero_division=0)
precision_svc = precision_score(y_test, svc_model.predict(X_test), average='weighted', zero_division=0)
precision_xgb = precision_score(y_test, bst.predict(dtest), average='weighted', zero_division=0)
precision_nb = precision_score(y_test, nb_model.predict(X_test), average='weighted', zero_division=0)

print("Precision for Random Forest Classifier: {:.2f}".format(precision_rf))
print("Precision for Gradient Boosting Classifier: {:.2f}".format(precision_gb))
print("Precision for Decision Tree Classifier: {:.2f}".format(precision_dt))
print("Precision for Support Vector Classifier: {:.2f}".format(precision_svc))
print("Precision for XGBoost Classifier: {:.2f}".format(precision_xgb))
print("Precision for Naive Bayes Classifier: {:.2f}".format(precision_nb))

Precision for Random Forest Classifier: 0.81
Precision for Gradient Boosting Classifier: 0.72
Precision for Decision Tree Classifier: 0.73
Precision for Support Vector Classifier: 0.44
Precision for XGBoost Classifier: 0.77
Precision for Naive Bayes Classifier: 0.68


# Recall for each model

In [82]:
from sklearn.metrics import recall_score

# Calculate recall for each model
recall_rf = recall_score(y_test, rf_model.predict(X_test), average='weighted')
recall_gb = recall_score(y_test, gb_model.predict(X_test), average='weighted')
recall_dt = recall_score(y_test, dt_model.predict(X_test), average='weighted')
recall_svc = recall_score(y_test, svc_model.predict(X_test), average='weighted')
recall_xgb = recall_score(y_test, bst.predict(dtest), average='weighted')
recall_nb = recall_score(y_test, nb_model.predict(X_test), average='weighted')

print("\nRecall for Random Forest Classifier: {:.2f}".format(recall_rf))
print("Recall for Gradient Boosting Classifier: {:.2f}".format(recall_gb))
print("Recall for Decision Tree Classifier: {:.2f}".format(recall_dt))
print("Recall for Support Vector Classifier: {:.2f}".format(recall_svc))
print("Recall for XGBoost Classifier: {:.2f}".format(recall_xgb))
print("Recall for Naive Bayes Classifier: {:.2f}".format(recall_nb))


Recall for Random Forest Classifier: 0.81
Recall for Gradient Boosting Classifier: 0.71
Recall for Decision Tree Classifier: 0.73
Recall for Support Vector Classifier: 0.54
Recall for XGBoost Classifier: 0.76
Recall for Naive Bayes Classifier: 0.60


# F1-score for each model

In [83]:
from sklearn.metrics import f1_score

# Calculate F1-score for each model
f1_rf = f1_score(y_test, rf_model.predict(X_test), average='weighted')
f1_gb = f1_score(y_test, gb_model.predict(X_test), average='weighted')
f1_dt = f1_score(y_test, dt_model.predict(X_test), average='weighted')
f1_svc = f1_score(y_test, svc_model.predict(X_test), average='weighted')
f1_xgb = f1_score(y_test, bst.predict(dtest), average='weighted')
f1_nb = f1_score(y_test, nb_model.predict(X_test), average='weighted')

print("\nF1-score for Random Forest Classifier: {:.2f}".format(f1_rf))
print("F1-score for Gradient Boosting Classifier: {:.2f}".format(f1_gb))
print("F1-score for Decision Tree Classifier: {:.2f}".format(f1_dt))
print("F1-score for Support Vector Classifier: {:.2f}".format(f1_svc))
print("F1-score for XGBoost Classifier: {:.2f}".format(f1_xgb))
print("F1-score for Naive Bayes Classifier: {:.2f}".format(f1_nb))


F1-score for Random Forest Classifier: 0.81
F1-score for Gradient Boosting Classifier: 0.71
F1-score for Decision Tree Classifier: 0.73
F1-score for Support Vector Classifier: 0.46
F1-score for XGBoost Classifier: 0.76
F1-score for Naive Bayes Classifier: 0.60


# After close evaluation 'Random Forest Classifier model' turns out to be the better model.