In [2]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import math
import os


In [3]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib

In [3]:
# Create directories if they don't exist
os.makedirs("models/labelEncoders", exist_ok=True)
os.makedirs("models/ordinalEncoders", exist_ok=True)

In [5]:
os.chdir("D:/cboobs/creditScore-Prediction/creditScore-Prediction")

In [6]:
df4 = pd.read_csv("data/cleaned_credit_FR_age_18_60.csv")
df5 = df4.copy()
df6 = df4.copy()

### Label Encoding

In [6]:
# label encoding categorical features

catCols = df5.select_dtypes(include='object').columns.drop('Credit_Score')
leDict = {}

for col in catCols:
    le = LabelEncoder()
    df5[col] = le.fit_transform(df5[col])
    leDict[col] = le
    joblib.dump(le, f"models/labelEncoders/{col}_label_encoder.pkl")

In [7]:
# Label Encoding Target

leTarget = LabelEncoder()
df5['Credit_Score'] = leTarget.fit_transform(df5['Credit_Score'])  
joblib.dump(leTarget, f"models/labelEncoders/Target_label_encoder.pkl")

['models/labelEncoders/Target_label_encoder.pkl']

In [8]:
df5.Credit_Score.value_counts()

2    35125
1    18463
0    12174
Name: Credit_Score, dtype: int64

In [12]:
catCols = df5.select_dtypes(include='object').columns.drop('Credit_Score')
catCols

Index(['Month', 'Occupation', 'Credit_Mix', 'Payment_of_Min_Amount',
       'Payment_Behaviour'],
      dtype='object')

In [13]:
df4.Credit_Score.value_counts()

Standard    35125
Poor        18463
Good        12174
Name: Credit_Score, dtype: int64

In [14]:
leTarget.classes_

array(['Good', 'Poor', 'Standard'], dtype=object)

In [9]:
# Training the data

x = df5.drop('Credit_Score', axis=1)
y = df5['Credit_Score']


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# training function


def trainModel(model, paramGrid, x_train, y_train, x_test, y_test):
    grid = GridSearchCV(model, paramGrid, cv=5, scoring='accuracy', n_jobs=2)
    grid.fit(x_train, y_train)
    bestModel = grid.best_estimator_
    yPred = bestModel.predict(x_test)
    acc = accuracy_score(y_test, yPred)
    print("Best Parameters: ", grid.best_params_)
    print("Accuracy: ", acc)

    return bestModel, acc, grid.best_params_

In [13]:
tree1 = DecisionTreeClassifier(random_state=42, class_weight='balanced')
tree1_paramGrid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

bestTree1, tree1Acc, tree1BestParams = trainModel(tree1, tree1_paramGrid, x_train, y_train, x_test, y_test)

Best Parameters:  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy:  0.6824298639093743


In [14]:
rf = RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=2)
rf_paramGrid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

bestRF, rfAcc, rfBestParams = trainModel(rf, rf_paramGrid, x_train, y_train, x_test, y_test)

Best Parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy:  0.7737398312172128


In [15]:
tree2 = ExtraTreesClassifier(random_state=42, class_weight='balanced', n_jobs=1)
tree2_paramGrid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

bestTree2, tree2Acc, tree2BestParams = trainModel(tree2, tree2_paramGrid, x_train, y_train, x_test, y_test)

Best Parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy:  0.7423401505359994


In [16]:
xgb = XGBClassifier(random_state=42, scale_pos_weight= (y_train == 0).sum() / (y_train == 1).sum(), use_label_encoder=False, eval_metric='logloss')
xgb_paramGrid = {
    'n_estimators': [100, 200],
    'max_depth': [3,5,7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.7, 1.0],
    "colsample_bytree": [0.7, 1.0]
}

bestXGB, xgbAcc, xgbBestParams = trainModel(xgb, xgb_paramGrid, x_train, y_train, x_test, y_test)


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

Parame

Best Parameters:  {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}
Accuracy:  0.7674294837679617


In [4]:
# save best model
joblib.dump(bestRF, "models/best_model_rf.pkl")

NameError: name 'bestRF' is not defined

In [8]:
for col in df5.describe(include='object').columns:
    print('Column Name: ',col)
    print(df5[col].unique())
    print('='*60 )

Column Name:  Month
['January' 'July' 'August' 'February' 'March' 'May' 'June' 'April']
Column Name:  Occupation
['Scientist' 'Unknown' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer'
 'Lawyer' 'Media_Manager' 'Doctor' 'Journalist' 'Manager' 'Accountant'
 'Musician' 'Mechanic' 'Writer' 'Architect']
Column Name:  Credit_Mix
['Unknown' 'Good' 'Standard' 'Bad']
Column Name:  Payment_of_Min_Amount
['No' 'NM' 'Yes']
Column Name:  Payment_Behaviour
['High_spent_Small_value_payments' 'Low_spent_Small_value_payments'
 'High_spent_Medium_value_payments' 'High_spent_Large_value_payments'
 'Unknown' 'Low_spent_Medium_value_payments'
 'Low_spent_Large_value_payments']
Column Name:  Credit_Score
['Good' 'Standard' 'Poor']


In [10]:
for col in df5.describe(include=['Float64', 'int64', 'float64']).columns:
    print('Column Name: ',col)
    # print(df5[col].unique())
    print('='*60 )

Column Name:  Age
Column Name:  Annual_Income
Column Name:  Monthly_Inhand_Salary
Column Name:  Num_Bank_Accounts
Column Name:  Num_Credit_Card
Column Name:  Interest_Rate
Column Name:  Num_of_Loan
Column Name:  Delay_from_due_date
Column Name:  Num_of_Delayed_Payment
Column Name:  Changed_Credit_Limit
Column Name:  Num_Credit_Inquiries
Column Name:  Outstanding_Debt
Column Name:  Credit_Utilization_Ratio
Column Name:  Total_EMI_per_month
Column Name:  Amount_invested_monthly
Column Name:  Monthly_Balance


### Ordinal Encoding

In [9]:
for col in df6.select_dtypes(include='object').columns:
    print('Column Name: ',col)
    print(df6[col].unique())
    print('='*60 )



Column Name:  Month
['January' 'July' 'August' 'February' 'March' 'May' 'June' 'April']
Column Name:  Occupation
['Scientist' 'Unknown' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer'
 'Lawyer' 'Media_Manager' 'Doctor' 'Journalist' 'Manager' 'Accountant'
 'Musician' 'Mechanic' 'Writer' 'Architect']
Column Name:  Credit_Mix
['Unknown' 'Good' 'Standard' 'Bad']
Column Name:  Payment_of_Min_Amount
['No' 'NM' 'Yes']
Column Name:  Payment_Behaviour
['High_spent_Small_value_payments' 'Low_spent_Small_value_payments'
 'High_spent_Medium_value_payments' 'High_spent_Large_value_payments'
 'Unknown' 'Low_spent_Medium_value_payments'
 'Low_spent_Large_value_payments']
Column Name:  Credit_Score
['Good' 'Standard' 'Poor']


In [17]:
oeMonth = OrdinalEncoder(categories=[['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August']], handle_unknown='use_encoded_value', unknown_value=-1)
oeOccupation = OrdinalEncoder(categories=[['Scientist', 'Unknown', 'Teacher', 'Engineer', 'Entrepreneur', 
                                         'Developer','Lawyer', 'Media_Manager', 'Doctor', 'Journalist', 
                                         'Manager', 'Accountant','Musician', 'Mechanic', 'Writer', 'Architect']], handle_unknown='use_encoded_value', unknown_value=-1)

oeCreditMix = OrdinalEncoder(categories=[["Unknown", 'Poor', 'Standard', 'Good']], handle_unknown='use_encoded_value', unknown_value=-1)
oePayMinAmt = OrdinalEncoder(categories=[['No','NM','Yes']], handle_unknown='use_encoded_value', unknown_value=-1)
oePaymentBehaviour = OrdinalEncoder(categories=[['Unknown','Low_spent_Small_value_payments', 'Low_spent_Medium_value_payments', 'Low_spent_Large_value_payments', 
                                                'High_spent_Small_value_payments','High_spent_Medium_value_payments', 'High_spent_Large_value_payments']], handle_unknown='use_encoded_value', unknown_value=-1)


In [5]:
oeTarget = OrdinalEncoder(categories=[['Poor', 'Standard', 'Good']], handle_unknown='use_encoded_value', unknown_value=-1)
df6['Credit_Score'] = oeTarget.fit_transform(df6[['Credit_Score']]).ravel().astype(int)
joblib.dump(oeTarget, "models/ordinalEncoders/Target_ordinal_encoder.pkl")

print("Target ordinal encoder has been applied and saved successfully!")

['models/ordinalEncoders/Target_ordinal_encoder.pkl']