In [39]:
#importing all the necessary libraries
import os

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier

In [40]:
#Load the dataset
# The CSV location can be overridden through the CREDIT_DATASET_PATH environment
# variable so the notebook is not tied to a single machine. When the variable is
# unset the original local path is used, so existing runs behave exactly as before.
DATASET_PATH = os.environ.get(
    'CREDIT_DATASET_PATH',
    r'C:\Users\bhavi\OneDrive\Desktop\projects\dataset.csv',
)
df = pd.read_csv(DATASET_PATH)

In [41]:
# Preview the first five rows to sanity-check the load.
# NOTE(review): the rendered preview includes the Name and SSN columns; consider
# clearing this cell's output before sharing the notebook.
df.head(n=5)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,28.609352,267.0,No,49.574949,21.46538,Low_spent_Medium_value_payments,331.209863,Good
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.377862,268.0,No,49.574949,21.46538,Low_spent_Small_value_payments,223.45131,Good
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,24.797347,269.0,No,49.574949,21.46538,High_spent_Medium_value_payments,341.489231,Good


In [50]:
# Feature and target selection
# X holds the four predictor columns used by the models below; y is the
# Credit_Score column taken as-is from the loaded frame.
feature_columns = [
    'Num_of_Delayed_Payment',
    'Credit_Utilization_Ratio',
    'Outstanding_Debt',
    'Annual_Income',
]
X = df[feature_columns]
y = df['Credit_Score']

In [51]:
#Label encoding for the target variable
# The fitted encoder is kept around because its `classes_` are reused as display
# names in the evaluation cells.
# NOTE(review): the encoded column is stored on `df`, but the `y` selected in the
# previous cell is still the un-encoded Credit_Score column; none of the visible
# cells read 'Credit_Score_Encoded'.
label_encoder = LabelEncoder()
encoded_credit_score = label_encoder.fit_transform(df['Credit_Score'])
df['Credit_Score_Encoded'] = encoded_credit_score

In [52]:
#Creating column transformer
# Standardise the four model inputs. Columns that are not listed here are dropped
# by ColumnTransformer's default `remainder` setting.
scaled_columns = [
    'Num_of_Delayed_Payment',
    'Credit_Utilization_Ratio',
    'Outstanding_Debt',
    'Annual_Income',
]
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), scaled_columns)],
)

In [53]:
#Creating pipeline for logistic regression
# Pipeline stores its steps by reference and fits them in place, so passing the
# module-level `preprocessor` directly would make every pipeline built from it
# share (and re-fit) one transformer object. Cloning gives this pipeline its own
# unfitted copy with the same configuration.
# NOTE(review): the step name 'classsifier' is misspelled. It is left unchanged
# because step names double as parameter/attribute keys; rename only after
# checking anything that looks the step up by name.
pipeline_lr = Pipeline(steps = [
    ('preprocessor', clone(preprocessor)),
    ('classsifier', LogisticRegression())
])

In [54]:
#Creating pipeline for random forest
# Pipeline stores its steps by reference and fits them in place, so passing the
# module-level `preprocessor` directly would make every pipeline built from it
# share (and re-fit) one transformer object. Cloning gives this pipeline its own
# unfitted copy with the same configuration.
# random_state is fixed so the forest is reproducible between runs.
# NOTE(review): the step name 'classsifier' is misspelled. It is left unchanged
# because step names double as parameter/attribute keys; rename only after
# checking anything that looks the step up by name.
pipeline_rf = Pipeline(steps = [
    ('preprocessor', clone(preprocessor)),
    ('classsifier', RandomForestClassifier(random_state=42))
])

In [55]:
#Splitting data into training and testing
# 80/20 hold-out split with a fixed seed so the partition is repeatable.
# No `stratify` argument is passed, so class proportions are not forced to match
# between the two parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [56]:
#Training Logistic Regression
# Pipeline.fit returns the fitted pipeline itself, so fitting on the training
# split and predicting on the held-out split can be chained.
y_pred_lr = pipeline_lr.fit(X_train, y_train).predict(X_test)

In [58]:
# Training Random Forest
# Pipeline.fit returns the fitted pipeline itself, so fitting on the training
# split and predicting on the held-out split can be chained.
y_pred_rf = pipeline_rf.fit(X_train, y_train).predict(X_test)

In [59]:
# Evaluation
# Logistic Regression: overall accuracy plus per-class precision/recall/F1 on the
# held-out split.
# `labels` is passed explicitly so each report row is tied to the same class order
# as `target_names`, instead of relying on the report's inferred label order
# happening to match the encoder's `classes_`.
class_labels = label_encoder.classes_
print("Logistic Regression:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr)}")
print(classification_report(y_test, y_pred_lr, labels=class_labels, target_names=class_labels))

Logistic Regression:
Accuracy: 0.54075
              precision    recall  f1-score   support

        Good       0.48      0.22      0.30      3527
        Poor       0.49      0.25      0.33      5874
    Standard       0.56      0.81      0.66     10599

    accuracy                           0.54     20000
   macro avg       0.51      0.43      0.43     20000
weighted avg       0.52      0.54      0.50     20000



In [60]:
# Random Forest: overall accuracy plus per-class precision/recall/F1 on the
# held-out split.
# `labels` is passed explicitly so each report row is tied to the same class order
# as `target_names`, instead of relying on the report's inferred label order
# happening to match the encoder's `classes_`.
class_labels = label_encoder.classes_
print("\nRandom Forest:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(classification_report(y_test, y_pred_rf, labels=class_labels, target_names=class_labels))


Random Forest:
Accuracy: 0.7397
              precision    recall  f1-score   support

        Good       0.69      0.58      0.63      3527
        Poor       0.75      0.75      0.75      5874
    Standard       0.75      0.79      0.77     10599

    accuracy                           0.74     20000
   macro avg       0.73      0.71      0.72     20000
weighted avg       0.74      0.74      0.74     20000



In [61]:
import pickle

# Persist the fitted Random Forest pipeline (preprocessing step + classifier) to a
# .pkl file in the current working directory.
# Pickle files can execute arbitrary code when loaded, so only unpickle this file
# from a trusted location.
model_filename = 'random_forest_pipeline.pkl'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(pipeline_rf, model_file)
print("Random Forest model saved as 'random_forest_pipeline.pkl'.")


Random Forest model saved as 'random_forest_pipeline.pkl'.
