# CS235 - Default of Credit Card Clients: Classification

Data source: https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients

Notebook Authors: 
- Dan O'Connor
- Shirin Afshar
- Alexander Hartley

In [23]:
# import required libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

In [24]:
# read the raw client records from the CSV (path is relative to the working directory)
raw_csv_path = 'default of credit card clients.csv'
df = pd.read_csv(raw_csv_path)

In [25]:
# Re-read the file so this cell always starts from the data on disk, then
# rename two columns: PAY_0 -> PAY_1 (so the PAY_* names run 1..6) and
# 'default payment next month' -> DEFAULT_0.
column_renames = {
    'PAY_0': 'PAY_1',
    'default payment next month': 'DEFAULT_0',
}
df = pd.read_csv('default of credit card clients.csv').rename(columns=column_renames)

# Column groups used by the cleaning and preprocessing cells further down.
target_col = 'DEFAULT_0'
demographic_dis = ['SEX', 'EDUCATION', 'MARRIAGE']
demographic_con = ['AGE']
limit_cols = ['LIMIT_BAL']

# the three repeated column families all carry the suffixes 1..6
suffixes = range(1, 7)
pay_delay_cols = [f'PAY_{n}' for n in suffixes]
bill_cols = [f'BILL_AMT{n}' for n in suffixes]
payments_cols = [f'PAY_AMT{n}' for n in suffixes]

# Feature-role lists start out empty; they are assigned their real contents
# in the cell that builds the ColumnTransformer.
numeric_features, categorical_features, ordinal_features = [], [], []


In [26]:

# Clean the loaded frame and engineer the model features.
# NOTE: this cell cannot be re-run on its own output: the BILL_AMT* columns
# are dropped below, so a second run raises KeyError. Re-run the loading cell
# first to start again from the file.

# Keep only rows with a non-zero MARRIAGE code and no negative bill amount.
# NOTE(review): presumably 0 is an undocumented MARRIAGE code -- confirm
# against the dataset's data dictionary.
# The explicit .copy() makes df an independent frame, so the column
# assignments below do not write into a slice of the original
# (avoids SettingWithCopyWarning / chained-assignment ambiguity).
valid_rows = (df['MARRIAGE'] != 0) & (df[bill_cols] >= 0).all(axis=1)
df = df.loc[valid_rows].copy()

# fold EDUCATION codes 0, 5 and 6 into code 4 (4 itself is left unchanged)
df['EDUCATION'] = df['EDUCATION'].replace([0, 4, 5, 6], 4)

# replacing bill columns with binary columns
# Each flag is True when that bill amount is greater than the credit limit.
# This must run before LIMIT_BAL is log-transformed below, so both sides of
# the comparison are still in the original units.
bill_amt_exceed = []
for col in bill_cols:
    exceed_col = col + 'Exceed'
    df[exceed_col] = df[col] > df['LIMIT_BAL']
    bill_amt_exceed.append(exceed_col)
# drop the raw bill amounts in one non-mutating call (flags stay at the end, in order)
df = df.drop(columns=bill_cols)

# log transform age, limit, and payment columns
df['AGE'] = np.log(df['AGE'])
df['LIMIT_BAL'] = np.log(df['LIMIT_BAL'])
# +1 keeps zero payments finite (log(0 + 1) == 0)
df[payments_cols] = np.log(df[payments_cols] + 1)

# setting max value as 6 for pay delay columns
df[pay_delay_cols] = df[pay_delay_cols].clip(upper=6)

In [27]:
# class balance of the target in df; normalize=True reports proportions instead of raw counts
df[target_col].value_counts(normalize=True)

DEFAULT_0
0    0.774673
1    0.225327
Name: proportion, dtype: float64

In [28]:
# Features are every column except the target and the ID column.
X = df.drop(columns=['DEFAULT_0', 'ID'])
y = df['DEFAULT_0']

# 80/20 split, stratified on the target so both parts keep the same class
# proportions; the fixed random_state makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# show the shapes of the four pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((22413, 23), (5604, 23), (22413,), (5604,))

In [29]:
# Assign each column group to the transformer that will handle it.
numeric_features = payments_cols + demographic_con + limit_cols
categorical_features = demographic_dis
ordinal_features = pay_delay_cols


preprocessor = ColumnTransformer(
    transformers=[
        # standardize the (log-transformed) payment, age and limit columns
        ('num', StandardScaler(), numeric_features),
        # one-hot encode the demographic codes; a code that was not seen during
        # fit is encoded as all zeros at transform time instead of raising
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        # in case a value in test was not seen in train: it is encoded as -1
        ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_features)
            ],
    # columns not listed in any group above are passed through unchanged
    remainder='passthrough'
)

In [30]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformers on the test split so no test-set statistics enter the fit.
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [31]:
# Fit PCA on the training matrix only; a fractional n_components keeps as many
# components as are needed to explain 95% of the variance.
pca = PCA(n_components=0.95).fit(X_train_scaled)

# transform both training and test sets with the components learned from train
X_train_pca, X_test_pca = (
    pca.transform(matrix) for matrix in (X_train_scaled, X_test_scaled)
)

# feature-matrix shape before and after the projection
X_train_scaled.shape, X_train_pca.shape

((22413, 29), (22413, 15))

In [32]:
# Name the components from the actual width of the PCA output. The number of
# components kept for 95% variance depends on the data, so a hard-coded count
# would make the DataFrame constructor raise as soon as that number changes.
n_pca_components = X_train_pca.shape[1]
feature_names_pca = [f'PCA{i}' for i in range(1, n_pca_components + 1)]
X_train_pca_df = pd.DataFrame(X_train_pca, columns=feature_names_pca)
X_test_pca_df = pd.DataFrame(X_test_pca, columns=feature_names_pca)

# Write features and targets without the index. Rows in each X file line up
# positionally with the matching y file, since both come from the same split.
X_train_pca_df.to_csv('X_train_pca.csv', index=False)
X_test_pca_df.to_csv('X_test_pca.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)