In [None]:
# Fetch the course repository; the CSV path used later in this notebook points inside this clone.
!git clone https://github.com/cesarlegendre/credit_scoring_7904_Q4_2024



# **Credit Scoring Prediction**


## Introduction
This notebook aims to predict whether an individual has a good credit score based on several factors. We'll transform the credit score into a binary classification problem and use various machine learning models to perform the prediction. The steps include:

* Data preprocessing and feature engineering
* Splitting the data for training and testing
* Training multiple classifiers with hyperparameter tuning
* Evaluating and comparing model performances



In [None]:
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Model selection
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedKFold

# Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Relative path to the credit-score CSV inside the cloned repository directory.
file = 'credit_scoring_7904_Q4_2024/data_sets/credit_score/credit_score_small.csv'

# Seed NumPy's global random generator so the noise added when loading the data is repeatable.
np.random.seed(42)



## Loading the Data
We load the dataset and add a small amount of random noise to the `Age` and `Income` columns.

In [None]:
# Read the raw credit-score table from the path stored in `file`.
df = pd.read_csv(file)

# Adding some randomness to the two continuous columns.
# Age is shifted by up to +/-5 and kept to two decimals.
n_rows = len(df)
age_noise = np.random.uniform(-5, 5, n_rows)
df['Age'] = np.round(df['Age'].to_numpy() + age_noise, 2)

# Income is shifted by up to +/- the smallest income in the data, then rounded to a whole number.
income_floor = df['Income'].min()
income_noise = np.random.uniform(-income_floor, income_floor, n_rows)
df['Income'] = np.round(df['Income'].to_numpy() + income_noise)

# Preview four randomly chosen rows.
df.sample(4)

## Understanding the Data

This dataset contains information about a sample of over 100 people across the world. The data includes the following information:

* Age
* Gender
* Income
* Education
* Marital Status
* Number of Children
* Home Ownership
* Credit Score

## Data Preprocessing
### Transforming the Target Variable

We will transform the `Credit Score` column into a binary variable called `Good_Credit`:

* 1 if `Credit Score` is `High`
* 0 otherwise

Create a new column called Good_Credit in your DataFrame.

* Use the .apply() function to transform the 'Credit Score' column.
* Set Good_Credit to 1 if the 'Credit Score' is 'High', otherwise set it to 0.
* Use a lambda function for the conditional logic

In [None]:
# Exercise: build the binary target column.
# df['Good_Credit'] =  # your code here (1 when 'Credit Score' is 'High', otherwise 0)
# df['Good_Credit'].value_counts()
# solution 1
# ****

###  Encoding Categorical Variables


* Display the unique values in the Gender column.

* Create a new column Gender_Male (1 for Male, 0 for Female).

* Display the unique values in the Marital Status column.

* Create a new column Marital_Status_Married

* Display the unique values in the Home Ownership column.

* Create a new column Home_Ownership_Owned



In [None]:
# Inspect the raw Gender categories before encoding them.
print(df['Gender'].unique())

# Binary-encode Gender: Female -> 0, Male -> 1.
df['Gender_Male'] = df['Gender'].map({'Female': 0, 'Male': 1}) # done for you

# Inspect the raw Marital Status categories.
print(df['Marital Status'].unique())

# Exercise: df['Marital_Status_Married'] = # your code here

# solution 2
# ****
# Inspect the raw Home Ownership categories.
print(df['Home Ownership'].unique())


# Exercise:
# df['Home_Ownership_Owned'] = # your code here

# solution 3
# ****

In [None]:
# List the distinct education levels present in the data.
print(df['Education'].unique())

In [None]:
# One-hot encode Education into columns prefixed with "Education_";
# drop_first=True omits the first category's column so it serves as the implicit baseline.
education_dummies = pd.get_dummies(df['Education'], prefix='Education', drop_first=True)
# Append the indicator columns alongside the existing columns (axis=1 = column-wise).
df = pd.concat([df, education_dummies], axis=1)

In [None]:
# Remove the raw text columns that the preceding cells encode into numeric columns.
# The result is reassigned instead of using inplace=True: in-place mutation gives no
# performance benefit and hides the state change from anyone reading the cell.
raw_categorical_columns = ['Gender', 'Marital Status', 'Home Ownership', 'Education', 'Credit Score']
df = df.drop(columns=raw_categorical_columns)
df.head()


## Exploratory Data Analysis
### Checking for Missing Values

In [None]:
# Exercise: count the missing values in each column (hint: chain isnull() and sum()).

# solution 4
# ****


In [None]:
# Exercise: show summary statistics for the dataset (hint: describe()).
# solution 5
# ****


### Distribution of Numerical Features

In [None]:
# One wide figure holding three side-by-side histograms (1 row x 3 columns).
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
# Histogram of Age with a KDE curve overlaid.
sns.histplot(df['Age'], bins=20, kde=True)
plt.title('Distribution of Age')

plt.subplot(1, 3, 2)

# Exercise: plot a histogram of Income with bins=20.
# solution 6
# ****
plt.title('Distribution of Income')

plt.subplot(1, 3, 3)

# Exercise: plot a histogram of Number of Children with bins=4.
# solution 7
# ****
plt.title('Distribution of Number of Children')

# Adjust spacing between the three panels before rendering.
plt.tight_layout()
plt.show()


### Correlation Matrix

In [None]:
plt.figure(figsize=(12, 8))

# Exercise: plot the correlation matrix with annotations and cmap='coolwarm'.
# solution 8
# ****
plt.title('Correlation Matrix')
plt.show()

## Feature Engineering



In [None]:
# Features are every column except the target; the target is the binary Good_Credit column.
X = df.drop('Good_Credit', axis=1)
y = df['Good_Credit']


# Exercise: create the train/test split with a 20% test size (test_size=0.2),
# stratify=y, and a fixed random_state.
# X_train, X_test, y_train, y_test = train_test_split( # your code here )
# solution 9
# ****


# Shuffled, stratified 5-fold splitter; passed as `cv` to each hyperparameter search in this notebook.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Model Training and Hyperparameter Tuning

We will train the following classifiers:

* Logistic Regression
* Random Forest Classifier
* Gradient Boosting Classifier

### Logistic Regression

In [None]:
# Exercise:
# define a param grid with C = [0.01, 0.1, 1, 5, 7] and solver = ['liblinear']
# param_grid_lr = # your code here, a dict

# solution 10
# ****


In [None]:
# Exhaustive grid search over `param_grid_lr`, scored by accuracy on the shared
# stratified folds. random_state is fixed, matching the Random Forest and Gradient
# Boosting estimators in this notebook: scikit-learn documents that the liblinear
# solver uses it to shuffle the data, and the np.random.seed call made in the
# notebook process should not be relied on inside the n_jobs=-1 worker processes.
grid_search_lr = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=param_grid_lr,
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1
)

grid_search_lr.fit(X_train, y_train)


In [None]:
# Show the hyperparameter combination selected by the logistic-regression grid search.
print("Best Hyperparameters for Logistic Regression:", grid_search_lr.best_params_, sep="\n")


In [None]:

# Exercise:
# best_lr = # get the best estimator found by the grid search
# y_pred_lr =  # predict using X_test

# solution 11
# ****

# Compare the logistic-regression predictions against the true test labels.
print("\nClassification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_lr))

# Stored so the final cell can compare all models.
lr_accuracy = accuracy_score(y_test, y_pred_lr)


### Random Forest Classifier

In [None]:
# Candidate hyperparameter values for the random forest.
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
}

# The grid holds only 3 * 4 * 2 * 2 = 48 distinct combinations. Requesting more
# iterations than that makes scikit-learn emit a UserWarning and fall back to the
# grid size, so n_iter is capped explicitly. The candidates evaluated are unchanged.
rf_grid_size = int(np.prod([len(values) for values in param_grid_rf.values()]))

random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid_rf,
    n_iter=min(50, rf_grid_size),
    cv=kfold,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
random_search_rf.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination selected by the random-forest search.
print("Best Hyperparameters for Random Forest:", random_search_rf.best_params_, sep="\n")

In [None]:
# Predict on the test split with the best random forest found by the search.
best_rf = random_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Accuracy is stored so the final cell can compare all models.
rf_accuracy = accuracy_score(y_test, y_pred_rf)

rf_report = classification_report(y_test, y_pred_rf)
print("\nClassification Report for Random Forest:", rf_report, sep="\n")

### Gradient Boosting Classifier

In [None]:
# Search space for gradient boosting (3 * 3 * 3 * 2 = 54 combinations).
param_grid_gb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5],
)

# Randomly sample 50 of those combinations, scoring each by accuracy on the shared folds.
gb_search_settings = dict(n_iter=50, cv=kfold, scoring='accuracy', n_jobs=-1, random_state=42)
random_search_gb = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid_gb,
    **gb_search_settings,
)
random_search_gb.fit(X_train, y_train)





In [None]:
# Show the hyperparameter combination selected by the gradient-boosting search.
print("Best Hyperparameters for Gradient Boosting:", random_search_gb.best_params_, sep="\n")

In [None]:
# Predict on the test split with the best gradient-boosting model found by the search.
best_gb = random_search_gb.best_estimator_
y_pred_gb = best_gb.predict(X_test)

# Accuracy is stored so the final cell can compare all models.
gb_accuracy = accuracy_score(y_test, y_pred_gb)

gb_report = classification_report(y_test, y_pred_gb)
print("\nClassification Report for Gradient Boosting:", gb_report, sep="\n")


## Model Comparison

In [None]:
# Pair each model's display name with its test-set accuracy.
models_accuracy = dict(zip(
    ('Logistic Regression', 'Random Forest', 'Gradient Boosting'),
    (lr_accuracy, rf_accuracy, gb_accuracy),
))

print("\nAccuracy Scores:")
# One "name: score" line per model, formatted to four decimal places.
print("\n".join(f"{label}: {score:.4f}" for label, score in models_accuracy.items()))