In [1]:
# Import Python Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Import SMOTE Related Packages
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

In [46]:
# Read the cleaned training set from disk and preview its first rows.
# NOTE(review): the preview lists 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like index columns written out by earlier to_csv calls —
# confirm whether they should be kept.
train_csv = './data/clean_train.csv'
train_df = pd.read_csv(train_csv)
train_df.head()

Unnamed: 0.1,Unnamed: 0,month,age,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,delay_from_due_date,...,occupation_Lawyer,occupation_Manager,occupation_Mechanic,occupation_Media_Manager,occupation_Musician,occupation_Scientist,occupation_Teacher,occupation_Writer,spending_level,payment_size
0,0,0,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3,...,False,False,False,False,False,True,False,False,2,0
1,1,1,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,-1,...,False,False,False,False,False,True,False,False,0,2
2,2,2,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3,...,False,False,False,False,False,True,False,False,0,1
3,3,3,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,5,...,False,False,False,False,False,True,False,False,0,0
4,4,4,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,6,...,False,False,False,False,False,True,False,False,2,1


In [47]:
# Bar chart of the credit_score class counts in the original training data,
# written to disk instead of being rendered inline.
class_labels = ['Poor', 'Standard', 'Good']

fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(x='credit_score', data=train_df, ax=ax)
# Replace the encoded tick values at positions 0/1/2 with readable names.
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(class_labels)
ax.set_title('Original Class Distribution')
fig.savefig('./image/original_class_distribution.png')
# Release the figure so it is not displayed or kept in memory.
plt.close(fig)

SMOTE Balancing

In [48]:
# Split the training frame into features (X) and target (y), then oversample
# with SMOTE.
#
# The loaded CSV carries 'Unnamed: 0' / 'Unnamed: 0.1' columns whose values
# are just the row position (see the train_df.head() preview). They are not
# features: leaving them in X makes SMOTE search for neighbours and
# interpolate synthetic samples over row numbers, so they are excluded here.
index_artifacts = [
    col for col in train_df.columns if str(col).startswith('Unnamed')
]
X = train_df.drop(columns=['credit_score', *index_artifacts])
y = train_df['credit_score']

# NOTE(review): X also holds boolean one-hot columns (occupation_*). Plain
# SMOTE interpolates every column as if it were continuous; SMOTENC may be
# the better fit for these — confirm before relying on the synthetic rows.
# A fixed random_state keeps the resampling reproducible across runs.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [49]:
# Count how many samples each class has after resampling.
resampled_counts = pd.Series(y_resampled).value_counts()
resampled_counts

credit_score
2.0    46707
1.0    46707
0.0    46707
Name: count, dtype: int64

In [50]:
# Bar chart of the class counts after SMOTE resampling, written to disk
# instead of being rendered inline.
class_labels = ['Poor', 'Standard', 'Good']

fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(x=y_resampled, ax=ax)
# Replace the encoded tick values at positions 0/1/2 with readable names.
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(class_labels)
ax.set_title('SMOTE Balanced Class Distribution')
fig.savefig('./image/smote_class_distribution.png')
# Release the figure so it is not displayed or kept in memory.
plt.close(fig)

In [51]:
# Side-by-side comparison of the class counts before and after SMOTE.
class_labels = ['Poor', 'Standard', 'Good']
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: original training data.
sns.countplot(x='credit_score', data=train_df, ax=ax_before)
ax_before.set_xticks([0, 1, 2])
ax_before.set_xticklabels(class_labels)
ax_before.set_title('Distribution before SMOTE')

# Right panel: resampled target.
sns.countplot(x=y_resampled, ax=ax_after)
ax_after.set_xticks([0, 1, 2])
ax_after.set_xticklabels(class_labels)
ax_after.set_title('Distribution after SMOTE')

# Tighten the layout, write the figure to disk, and release it.
fig.tight_layout()
fig.savefig('./image/class_distribution_comparison.png')
plt.close(fig)