# <center> Credit Score Classification

### Problem Statement
- You are working as a data scientist in a global finance company. Over the years, the company has collected basic bank details and gathered a lot of credit-related information. The management wants to build an intelligent system to segregate the people into credit score brackets to reduce the manual efforts.

### Task
- Given a person’s credit-related information, build a machine learning model that can classify the credit score.

## Table of content 
- Packages importing 
- Reading Data 
- Data Exploration
- Data cleaning 
- Data Preprocessing 
- Modeling & Evaluation


## Packages importing 

In [2]:
# Packages for EDA 
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd 
import numpy as np 
%matplotlib inline


# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from datasist.structdata import detect_outliers
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import category_encoders as ce
import re 

# Modeling and evaluation 
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier
)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report 
import joblib

# Packages options 
sns.set(rc={'figure.figsize': [14, 7]}, font_scale=1.2) # Standard figure size for all 
np.seterr(divide='ignore', invalid='ignore', over='ignore') ;

import warnings 
warnings.filterwarnings("ignore")



## Reading Data 

In [3]:
#importing the credit classification dataset
df = pd.read_csv('/Users/arsalankhan/Documents/ML LAB/ML CS/credit.csv')

In [4]:
df.sample(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
27237,46487,38205,6,Ny,24.0,884632579.0,Engineer,37419.7,3174.308333,1.0,...,Good,653.66,28.773965,265.0,No,76.182536,48.617468,High_spent_Large_value_payments,414.234774,Good
93895,146473,36958,8,Jim Finkley,19.0,332196682.0,Musician,18784.375,1574.364583,5.0,...,Good,55.2,29.940498,296.0,No,11.689078,55.736403,Low_spent_Small_value_payments,373.463018,Standard
94649,147607,1503,2,Palmerz,42.0,612360562.0,Accountant,130079.79,11091.9825,3.0,...,Good,230.44,28.959905,267.0,No,0.0,131.532409,High_spent_Large_value_payments,1136.359407,Good
82922,130016,23502,3,A.j,29.0,695627237.0,Writer,18949.48,1569.123333,0.0,...,Good,1002.97,33.57145,322.0,No,27.257367,30.609033,Low_spent_Small_value_payments,338.131983,Good
79282,124556,14077,3,Erwin Sebaq,17.0,90607501.0,Doctor,20864.93,1484.744167,5.0,...,Standard,1466.97,37.515947,223.0,Yes,66.288738,51.320373,Low_spent_Small_value_payments,274.730315,Poor
37178,61400,32605,3,Valentina Zal,28.0,556876020.0,Teacher,20615.79,1761.9825,6.0,...,Standard,1901.5,39.598415,143.0,Yes,72.105784,12.721067,High_spent_Small_value_payments,278.676119,Poor
86494,135372,6891,7,Dane,19.0,256190938.0,Teacher,38253.64,2997.712061,1.0,...,Good,843.89,35.771925,257.0,No,395.782037,44.429097,High_spent_Small_value_payments,394.247337,Good
703,6685,29152,8,Aruna Viswanathau,28.0,330960638.0,Mechanic,15398.95,1548.245833,1.0,...,Good,343.84,28.135688,346.0,No,34.81397,45.39013,Low_spent_Large_value_payments,294.624051,Standard
7037,16187,14665,6,Scotto,41.0,908890498.0,Musician,63353.68,5356.473333,9.0,...,Bad,4362.52,20.719745,15.0,Yes,390.451288,36.295167,Low_spent_Large_value_payments,67.975307,Standard
9677,20147,36559,6,Abboudq,35.0,879563473.0,Journalist,70978.44,5993.87,6.0,...,Standard,1587.98,36.398787,172.0,NM,262.831566,51.466421,Low_spent_Medium_value_payments,62.958054,Poor


## Data Exploration

In [5]:
df.shape

(100000, 28)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  int64  
 1   Customer_ID               100000 non-null  int64  
 2   Month                     100000 non-null  int64  
 3   Name                      100000 non-null  object 
 4   Age                       100000 non-null  float64
 5   SSN                       100000 non-null  float64
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  float64
 8   Monthly_Inhand_Salary     100000 non-null  float64
 9   Num_Bank_Accounts         100000 non-null  float64
 10  Num_Credit_Card           100000 non-null  float64
 11  Interest_Rate             100000 non-null  float64
 12  Num_of_Loan               100000 non-null  float64
 13  Type_of_Loan              100000 non-null  ob

In [7]:
df.describe()

Unnamed: 0,ID,Customer_ID,Month,Age,SSN,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,...,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Total_EMI_per_month,Amount_invested_monthly,Monthly_Balance
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,80631.5,25982.66664,4.5,33.31634,500461700.0,50505.123449,4197.270835,5.36882,5.53357,14.53208,...,21.08141,13.31312,10.470323,5.79825,1426.220376,32.285173,221.22046,107.699208,55.101315,392.697586
std,43301.486619,14340.543051,2.291299,10.764812,290826700.0,38299.422093,3186.432497,2.593314,2.067098,8.74133,...,14.80456,6.237166,6.609481,3.867826,1155.129026,5.116875,99.680716,132.267056,39.006932,201.652719
min,5634.0,1006.0,1.0,14.0,81349.0,7005.93,303.645417,0.0,0.0,1.0,...,0.0,0.0,0.5,0.0,0.23,20.0,1.0,0.0,0.0,0.00776
25%,43132.75,13664.5,2.75,24.0,245168600.0,19342.9725,1626.594167,3.0,4.0,7.0,...,10.0,9.0,5.38,3.0,566.0725,28.052567,144.0,29.268886,27.959111,267.615983
50%,80631.5,25777.0,4.5,33.0,500688600.0,36999.705,3095.905,5.0,5.0,13.0,...,18.0,14.0,9.4,5.0,1166.155,32.305784,219.0,66.462304,45.15655,333.865366
75%,118130.25,38385.0,6.25,42.0,756002700.0,71683.47,5957.715,7.0,7.0,20.0,...,28.0,18.0,14.85,8.0,1945.9625,36.496663,302.0,147.392573,71.295797,463.215683
max,155629.0,50999.0,8.0,56.0,999993400.0,179987.28,15204.633333,11.0,11.0,34.0,...,62.0,25.0,29.98,17.0,4998.07,50.0,404.0,1779.103254,434.191089,1183.930696


In [8]:
df.duplicated().sum()

0

In [9]:
sns.countplot(df['Occupation'],palette="mako");
plt.xticks(rotation=45);

In [10]:
sns.countplot(df['Credit_Mix'],palette="mako")
plt.xticks(rotation=45)

(array([    0., 10000., 20000., 30000., 40000., 50000.]),
 [Text(0.0, 0, '0'),
  Text(10000.0, 0, '10000'),
  Text(20000.0, 0, '20000'),
  Text(30000.0, 0, '30000'),
  Text(40000.0, 0, '40000'),
  Text(50000.0, 0, '50000')])

In [11]:
df['Credit_Score'].value_counts(normalize=True)

Credit_Score
Standard    0.53174
Poor        0.28998
Good        0.17828
Name: proportion, dtype: float64

In [12]:
df['Type_of_Loan'].value_counts().head(10)

Type_of_Loan
No Data                    11408
Not Specified               1408
Credit-Builder Loan         1280
Personal Loan               1272
Debt Consolidation Loan     1264
Student Loan                1240
Payday Loan                 1200
Mortgage Loan               1176
Auto Loan                   1152
Home Equity Loan            1136
Name: count, dtype: int64

### Data Spliting 
- Try Resampling 

# Exploratory Data Analysis

In [13]:

# Set a vibrant color palette
sns.set_palette("husl")
sns.set(style="whitegrid")

# Create a figure and 5x3 grid of subplots
fig, ax = plt.subplots(5, 3, figsize=(16, 25))

# Flatten the 2D array of subplots into a 1D array for easier iteration
ax = ax.flatten()

# Define list of features to plot
features = [
    'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
    'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
    'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
    'Amount_invested_monthly', 'Monthly_Balance', 'Age'
]

# Loop through each feature and create box plots with Credit_Score assigned to hue
for i, feature in enumerate(features):
    sns.boxplot(y=feature, x='Credit_Score', data=df, hue='Credit_Score', ax=ax[i], palette="coolwarm", dodge=False, legend=False)
    ax[i].set_title(f'{feature.replace("_", " ")} vs Credit Score', fontsize=14, fontweight='bold')
    ax[i].set_xlabel('Credit Score', fontsize=12)
    ax[i].set_ylabel('Feature Value', fontsize=12)
    ax[i].grid(color='white', linestyle='-', linewidth=1, alpha=0.6)

# Adjust layout and add a title for the entire figure
fig.suptitle('Relationship between Credit Score and Various Financial Features', fontsize=20, fontweight='bold', color='darkblue')
fig.subplots_adjust(hspace=0.4, wspace=0.4)





#  Count Plot for Credit Score Distribution

plt.figure(figsize=(7, 4))
sns.countplot(hue='Credit_Score', x='Credit_Score', data=df, palette='magma', dodge=False, legend=False)
plt.title('Distribution of Credit Score', fontsize=16, fontweight='bold', color='darkviolet')
plt.xlabel('Credit Score')
plt.ylabel('Count')
plt.grid(color='white', linestyle='--', linewidth=0.7, alpha=0.5)



#  KDE Plot for Age and Outstanding Debt by Credit Score
plt.figure(figsize=(7, 4))
sns.kdeplot(data=df, x='Age', hue='Credit_Score', fill=True, palette="cool", alpha=0.6)
plt.title('Age Distribution by Credit Score', fontsize=16, fontweight='bold', color='darkorange')
plt.xlabel('Age')
plt.ylabel('Density')
plt.grid(color='white', linestyle='--', linewidth=0.7, alpha=0.5)
plt.show()

plt.figure(figsize=(7, 4))
sns.kdeplot(data=df, x='Outstanding_Debt', hue='Credit_Score', fill=True, palette="cool", alpha=0.6)
plt.title('Outstanding Debt Distribution by Credit Score', fontsize=16, fontweight='bold', color='darkorange')
plt.xlabel('Outstanding Debt')
plt.ylabel('Density')
plt.show()

# Show all the box plots in the figure
plt.show()

From the 15 box plots above, we can deduce the following:

1. The more someone earns anually, the better their credit score is.
2. Similar to annual income, a higher monthly in-hand salary leads to a better credit score.
3. The ideal number of bank accounts is 2 - 4. Having more than 5 negatively affects your credit score.
4. Similar to bank accounts, having more than 5 credit cards will negatively affect your credit scores. The ideal number is 3-5.
5. 4 - 11% is the sweet spot for average instest rate. Anything above 15% is a no-no.
6. Take 1-3 loans at a time inorder to keep a good credit score. Having more than 3 loans negatively impacts credtit scores. 
7. To maintain a good credit score, you have a 5-14 day delay window. Delaying for more than 17 days affects your credit score negatively.
8. Delaying 4-12 payments from the due date is the safety window. Anything above 12 payments negatively affexts credit scores.
9. An outstanding debt of $380 – $1150 will not affect your credit scores, but going above $1338 affects your credit scores negatively.
10. Your credit utilization ratio doesn’t affect your credit scores.
11. Having a long credit history results in better credit scores.
12. The number of EMIs you are paying in a month doesn’t affect credit scores that much.
13. How much you invest monthly doesn’t really affect your credit scores. 
14. Having a high monthly balance in your account at the end of the month is good for your credit scores.
15. Credit scores tend to improve with an increase in age.

**Split into train and test sets**

### Data Spliting 
- Try Resampling 

In [14]:
# define dataset
X, y = df.drop("Credit_Score",axis=1).values , df["Credit_Score"] 

## Apply oversampling
- Data is Not 100% balanced, Let's try improving it. 

In [15]:
y.value_counts(normalize=True)

Credit_Score
Standard    0.53174
Poor        0.28998
Good        0.17828
Name: proportion, dtype: float64

In [16]:
from sklearn.preprocessing import OneHotEncoder

# Create an imputer transformer to handle missing values
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

# Encode the non-numeric data using one-hot encoding
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X_imputed)

# Apply SMOTE to the encoded data
rus = SMOTE(sampling_strategy='auto')
X_data_rus, y_data_rus = rus.fit_resample(X_encoded, y)


In [None]:
y_data_rus.value_counts(normalize=True)

Credit_Score
Good        0.333333
Standard    0.333333
Poor        0.333333
Name: proportion, dtype: float64

In [None]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_data_rus, y_data_rus, test_size=0.3, random_state=42,stratify=y_data_rus)

### Handling Numerical 
- Using Power transformer to avoid Data Skewness

In [None]:
scalar = PowerTransformer(method='yeo-johnson', standardize=True).fit(X_train.toarray())


NameError: name 'PowerTransformer' is not defined

In [None]:
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

### Handling Numerical 
- Using Power transformer to avoid Data Skewness

In [None]:
scalar = PowerTransformer(method='yeo-johnson', standardize=True).fit(X_train)

NameError: name 'X_train' is not defined

In [None]:
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

# Modeling and Evaluation


#### Model Building

In [None]:
bagging = BaggingClassifier(n_jobs=-1)
extraTrees = ExtraTreesClassifier(max_depth=10, n_jobs=-1)
randomForest = RandomForestClassifier(n_jobs=-1)
histGradientBoosting = HistGradientBoostingClassifier()
XGB = XGBClassifier(n_jobs=-1)

model = StackingClassifier([
    ('bagging', bagging),
    ('extraTress', extraTrees),
    ('randomforest', randomForest),
    ('histGradientBoosting', histGradientBoosting),
    ('XGB', XGB)
], n_jobs=-1)

#### Model fitting

In [None]:
model.fit(X_train, y_train)

#### Model evaluation

In [None]:
print("Train Score: ",model.score(X_train, y_train))

Train Score:  0.9991134196032777


In [None]:
print("Test Score: ",model.score(X_test, y_test))

Test Score:  0.8670622897381783


In [None]:
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87     15911
           1       0.81      0.82      0.82     15882
           2       0.92      0.91      0.91     16064

    accuracy                           0.87     47857
   macro avg       0.87      0.87      0.87     47857
weighted avg       0.87      0.87      0.87     47857



#### model Saving

In [None]:
#joblib.dump(model,'model.h5')
#joblib.dump(scalar,'scalar.h5')