In [None]:
# Shell escape: clone the repository whose data_sets/ folder is read by the cells below
!git clone https://github.com/cesarlegendre/credit_scoring_7904_Q4_2024


# Predicting Customer Credit Card Default

## Problem Description:
The goal of this project is to predict whether a customer will default on their credit card payments in Taiwan. From a risk management perspective, accurately estimating the probability of default is more valuable than simply classifying clients as either credible or non-credible. This prediction can help financial institutions manage risk more effectively, preventing potential losses.

# Data Description:
This study uses a binary target variable, default payment (Yes = 1, No = 0), to determine if a customer will default. The dataset contains 23 explanatory variables, which include:

* X1: Amount of given credit (NT dollar), covering both individual and family credit.
* X2: Gender (1 = Male, 2 = Female).
* X3: Education level (1 = Graduate school, 2 = University, 3 = High school, 4 = Others).
* X4: Marital status (1 = Married, 2 = Single, 3 = Others).
* X5: Age (in years).
* X6 to X11: History of past payments, from April to September 2005. These variables track the repayment status in each month (-1 = pay duly, 1 = payment delay for one month, 2 = payment delay for two months, …, 8 = payment delay for eight months, 9 = payment delay for nine months and above).
* X12 to X17: Bill amounts for the corresponding months (April to September 2005).
* X18 to X23: Amounts of previous payments made in each respective month (April to September 2005).

# Objective:

The objective of this project is to develop a model that can predict which customers are at risk of defaulting in the coming months. Understanding credit card default is critical: when a customer defaults, it means they have become significantly delinquent in paying off their balance. Missing a few payments may not immediately count as a default, but failing to pay the minimum required amount for several consecutive months will result in a credit card default.

In [None]:
# Shell escape: list the credit-card data directory of the cloned repository
# (-n is the long listing format with numeric owner/group IDs)
! ls -n credit_scoring_7904_Q4_2024/data_sets/credit_card/

# Importing necessary libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
import warnings
warnings.filterwarnings('ignore')

# Read the credit-card CSV that ships with the cloned repository
path = 'credit_scoring_7904_Q4_2024/data_sets/credit_card/balance.csv'
df_base = pd.read_csv(filepath_or_buffer=path)

# A bare expression on the last line makes the notebook render the frame
df_base

# General summaries

* Displaying dataset info: Check basic dataset structure (30,000 rows, 24 columns) and data types, ensuring there are no missing values.
* Checking for null values: Confirm there are no missing values in any columns.
* Statistical summary: Provide a summary of key statistics for each column (mean, standard deviation, min/max values, etc.).

In [None]:
# Work on a copy without the ID column, which is irrelevant for prediction
df = df_base.drop(columns=['ID'])

# Structure overview (left disabled, enable when needed)
#print(df.info())

# Number of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Descriptive statistics, rendered as the cell output
df.describe()


# Reordering and renaming columns

For better manipulation, let's rename some of the columns to reflect the month of the transaction.


In [None]:
# One mapping for all month-specific columns; the three groups cover the
# months from April to September 2005.
month_column_names = {
    # Repayment status (X6 - X11)
    'PAY_0': 'PAY_SEP_2005',
    'PAY_2': 'PAY_AUG_2005',
    'PAY_3': 'PAY_JUL_2005',
    'PAY_4': 'PAY_JUN_2005',
    'PAY_5': 'PAY_MAY_2005',
    'PAY_6': 'PAY_APR_2005',
    # Bill amounts (X12 - X17)
    'BILL_AMT1': 'BILL_AMT_SEP_2005',
    'BILL_AMT2': 'BILL_AMT_AUG_2005',
    'BILL_AMT3': 'BILL_AMT_JUL_2005',
    'BILL_AMT4': 'BILL_AMT_JUN_2005',
    'BILL_AMT5': 'BILL_AMT_MAY_2005',
    'BILL_AMT6': 'BILL_AMT_APR_2005',
    # Previous payments (X18 - X23)
    'PAY_AMT1': 'PAY_AMT_SEP_2005',
    'PAY_AMT2': 'PAY_AMT_AUG_2005',
    'PAY_AMT3': 'PAY_AMT_JUL_2005',
    'PAY_AMT4': 'PAY_AMT_JUN_2005',
    'PAY_AMT5': 'PAY_AMT_MAY_2005',
    'PAY_AMT6': 'PAY_AMT_APR_2005',
}

# No new name is also an old name, so a single pass gives the same result
# as renaming group by group.
df.rename(columns=month_column_names, inplace=True)

# Show the resulting column names
print(df.columns)


# Checking the Distribution of Default Status

To begin the analysis, it's important to examine the distribution of the target variable, which indicates whether clients are in default (1) or not (0). This step will provide an understanding of the balance between the two classes (default vs. non-default).

A heavily imbalanced dataset could affect the performance of machine learning models, potentially biasing them toward the majority class. Understanding this distribution helps in deciding whether techniques such as oversampling, undersampling, or adjusting class weights will be necessary.



In [None]:
# Bar chart of how many customers fall in each class of the target variable
target_col = 'default.payment.next.month'

plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df, x=target_col)
ax.set_title('Default Payment Distribution')
ax.set_xlabel('Defaulter Status (1: Yes, 0: No)')
ax.set_ylabel('Number of Customers')

# Print the customer count slightly above the top centre of every bar
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_top)}',
        (bar_centre, bar_top),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='baseline',
        fontsize=11,
        color='black',
    )

plt.show()


# Correlation Analysis

A correlation heatmap helps visualize the relationships between features, allowing us to detect multicollinearity and identify key predictors. Features that are highly correlated with the target variable (default payment) are likely to be more predictive, while features that are highly correlated with each other may indicate redundancy. By extracting the features most correlated with the target, we can guide our feature selection process and enhance model performance.

In [None]:
# Correlation heatmap
plt.figure(figsize=(12,8))
# numeric_only=True keeps this cell re-runnable: a later cell replaces SEX,
# EDUCATION and MARRIAGE with text labels, and recent pandas versions raise
# when corr() meets non-numeric columns.
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=False, cmap='viridis', linewidths=0.5)
plt.title('Correlation Matrix of Features')
plt.show()

# Correlation of every feature with the target, strongest positive first.
# The first entry is the target's correlation with itself.
corr_with_target = corr_matrix['default.payment.next.month'].sort_values(ascending=False)
print("Correlations with Target Variable:\n", corr_with_target)

# Mapping Categorical Variables for Better Readability

**Purpose**: Mapping numerical categories (like SEX, EDUCATION, and MARRIAGE) to more meaningful and readable labels improves the interpretability of your data and makes visualizations and analysis more intuitive.

**Why it's useful**:

* Improved Readability: Rather than working with numeric codes, mapping to labels (e.g., 1 = Male, 2 = Female) makes the data easier to understand and interpret.
* Easier Interpretation for Stakeholders: When presenting your analysis or results to non-technical stakeholders, categorical labels are more meaningful than numeric codes.
* Prepares Data for Visualization: By converting numerical codes into readable labels, you can create more insightful and understandable visualizations.

In [None]:
# Mapping categorical variables for better readability
def _label_codes(series, labels, fallback='Unknown'):
    """Return *series* with category codes replaced by readable labels.

    A plain ``series.map(labels)`` turns every value that is not a key of
    *labels* into NaN. That silently creates missing values for any code
    outside the documented ones, and wipes the whole column if the cell is
    executed a second time (the labels themselves are not keys).

    Here, codes missing from *labels* become *fallback*, and values that are
    already one of the labels are left unchanged, so re-running is harmless.
    """
    known_labels = set(labels.values())

    def _translate(value):
        if value in labels:
            return labels[value]
        return value if value in known_labels else fallback

    return series.map(_translate)


df['SEX'] = _label_codes(df['SEX'], {1: 'Male', 2: 'Female'})
df['EDUCATION'] = _label_codes(
    df['EDUCATION'],
    {1: 'Graduate School', 2: 'University', 3: 'High School', 4: 'Others'},
)
df['MARRIAGE'] = _label_codes(df['MARRIAGE'], {1: 'Married', 2: 'Single', 3: 'Others'})

# Visualize categorical variables against the target variable
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

for col in categorical_features:
    plt.figure(figsize=(8,4))
    sns.countplot(x=col, hue='default.payment.next.month', data=df)
    plt.title(f'Default Payment vs {col}')
    plt.show()


# Let's plot some distributions of defaulters vs. non-defaulters

## AGE


In [None]:
# Plot two histograms side by side (one for defaulters, one for non-defaulters)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Common upper x-limit so both panels span the same age range
max_age = df['AGE'].max()

# (target value, bar colour, panel title) for the left and right panel
age_panels = [
    (1, 'red', 'Age Distribution of Defaulters'),
    (0, 'blue', 'Age Distribution of Non-Defaulters'),
]

for ax, (target_value, colour, title) in zip(axes, age_panels):
    ages = df.loc[df['default.payment.next.month'] == target_value, 'AGE']
    sns.histplot(ages, bins=40, kde=False, color=colour, alpha=0.5, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
    ax.set_xlim(20, max_age)

# Display the plots
plt.tight_layout()
plt.show()


## LIMIT_BAL: Credit amount

In [None]:
# Two histograms of the credit limit, one per target class, on a shared x-range
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Vectorised maximum (the builtin max() would walk the Series element by element)
max_limit = df['LIMIT_BAL'].max()

# (target value, bar colour, panel title) for the left and right panel
limit_panels = [
    (1, 'red', 'Credit Amount Distribution of Defaulters'),
    (0, 'blue', 'Credit Amount of Non-Defaulters'),
]

for ax, (target_value, colour, title) in zip(axes, limit_panels):
    limits = df.loc[df['default.payment.next.month'] == target_value, 'LIMIT_BAL']
    sns.histplot(limits, bins=40, kde=False, color=colour, alpha=0.5, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Credit Amount')
    ax.set_ylabel('Frequency')
    ax.set_xlim(0, max_limit)

# Display the plots
plt.tight_layout()
plt.show()

# Payment History vs. Default Status

Visualizing the payment history across multiple months helps identify patterns between repayment behavior and default risk. By comparing the distribution of defaulters and non-defaulters for different payment statuses (e.g., on-time, delayed), we can assess which past behaviors are strong indicators of default. This insight can guide feature selection and engineering for the predictive model.

Explanation from data:

PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)


We observe undocumented values for the repayment status variables: -2 and 0. Moreover, their combined share is 86.5%. Strictly speaking, these values are “NAs”.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Repayment-status columns, from September back to April 2005
pay_cols_names = [
    'PAY_SEP_2005', 'PAY_AUG_2005', 'PAY_JUL_2005',
    'PAY_JUN_2005', 'PAY_MAY_2005', 'PAY_APR_2005',
]

# One panel per month on a 2 x 3 grid, flattened so it can be walked in order
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
axes = axes.flatten()

# Count of observations for every repayment-status value, month by month
for panel, col in zip(axes, pay_cols_names):
    sns.countplot(data=df, x=col, ax=panel)
    panel.set_title(f'Repayment Status {col}')
    panel.set_xlabel('Repayment Status')
    panel.set_ylabel('Observations Count')

# Avoid overlapping titles and labels, then render
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# List of payment columns to plot
pay_cols_names = ['PAY_SEP_2005', 'PAY_AUG_2005', 'PAY_JUL_2005', 'PAY_JUN_2005', 'PAY_MAY_2005', 'PAY_APR_2005']

# Labels for the repayment status codes that are plotted.
# The undocumented codes -2 and 0 are deliberately left out.
payment_status_labels = {
    -1: 'Paid on Time',
    1: '1 month delay',
    2: '2 months delay',
    3: '3 months delay',
    4: '4 months delay',
    5: '5 months delay',
    6: '6 months delay',
    7: '7 months delay',
    8: '8+ months delay'
}

# Fixed bar order and matching tick texts, shared by all panels
status_order = list(payment_status_labels)
status_tick_labels = [payment_status_labels[code] for code in status_order]

dff = df[df['default.payment.next.month']==0] # Play with this (== 1 selects defaulters)


# Create a figure with a grid of subplots (2 rows, 3 columns for 6 plots)
fig, axes = plt.subplots(2, 3, figsize=(15, 8))  # Adjust the size as needed
axes = axes.flatten()  # Flatten to make iterating easier

# Loop over each payment column and corresponding axis
for i, col in enumerate(pay_cols_names):
    # Keep only rows whose status is one of the labelled codes
    # (this drops the undocumented -2 and 0)
    filtered_df = dff[dff[col].isin(status_order)]

    # countplot draws its bars at positions 0, 1, 2, ... and not at the code
    # values. Passing `order` pins every code to the same slot in every panel,
    # even when a code does not occur in that month (its slot stays empty).
    sns.countplot(x=col, data=filtered_df, order=status_order, ax=axes[i])

    # Set title and labels
    axes[i].set_title(f'Repayment Status {col}')
    axes[i].set_xlabel('Repayment Status')
    axes[i].set_ylabel('Observations Count')

    # Ticks sit at the bar positions, so every label lines up with its own bar
    axes[i].set_xticks(list(range(len(status_order))))
    axes[i].set_xticklabels(status_tick_labels, rotation=90)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()
