# [UCI ML Repo - credit card defaults](http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients#)
## Training
### Platform: Python 3, colab.research.google.com

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from google.colab import drive
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, log_loss
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

## Load data

In [2]:
drive.mount('/content/gdrive', force_remount=False)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
loc = "/content/gdrive/My Drive/Colab Notebooks/uci-credit-card-defaults/data/defaults_clean.csv"
data = pd.read_csv(loc, header=0)
data.shape

(30000, 25)

In [4]:
data.head(5)

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [5]:
data.tail(5)

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default
29995,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1
29999,30000,50000,1,2,1,46,0,0,0,0,...,36535,32428,15313,2078,1800,1430,1000,1000,1000,1


In [6]:
type_dict = {}
for i in ["limit_bal", "bill_amt1", "bill_amt2", "bill_amt3", "bill_amt4",
         "bill_amt5", "bill_amt6", "pay_amt1", "pay_amt2", "pay_amt3", 
         "pay_amt4", "pay_amt5", "pay_amt6"]:
    if i in data.columns:
        type_dict[i] = "float64"
data = data.astype(type_dict)
data.dtypes

id             int64
limit_bal    float64
sex            int64
education      int64
marriage       int64
age            int64
pay_0          int64
pay_2          int64
pay_3          int64
pay_4          int64
pay_5          int64
pay_6          int64
bill_amt1    float64
bill_amt2    float64
bill_amt3    float64
bill_amt4    float64
bill_amt5    float64
bill_amt6    float64
pay_amt1     float64
pay_amt2     float64
pay_amt3     float64
pay_amt4     float64
pay_amt5     float64
pay_amt6     float64
default        int64
dtype: object

In [7]:
data.describe()

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.842267,1.557267,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.744494,0.521405,9.217904,1.123802,1.197186,1.196868,1.169139,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,1.0,1.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,4.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


## Training

In [0]:
def view_metrics(y_test, y_pred):
    print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
    print("Precision: {}".format(precision_score(y_test, y_pred)))
    print("Recall: {}".format(recall_score(y_test, y_pred)))
    print("F1: {}".format(f1_score(y_test, y_pred)))

In [9]:
# split train/test
y = data.loc[:, "default"]
X = data.drop(["id", "default"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
loc_data = "/content/gdrive/My Drive/Colab Notebooks/uci-credit-card-defaults/data"
# save test file for evaluation
X_test.to_csv("{}/defaults_clean_test_X.csv".format(loc_data), index=False)
y_test.to_csv("{}/defaults_clean_test_y.csv".format(loc_data), index=False)
X_train.head(5)

Unnamed: 0,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6
28465,240000.0,2,1,1,40,-2,-2,-2,-2,-2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27622,50000.0,2,1,2,23,-1,-1,-1,-1,-1,...,2299.0,4800.0,9810.0,660.0,2548.0,2321.0,4800.0,9810.0,660.0,2980.0
28376,50000.0,2,2,1,36,2,2,2,2,0,...,49125.0,47956.0,43578.0,35126.0,0.0,4700.0,0.0,2004.0,3500.0,0.0
10917,200000.0,2,3,1,54,6,5,4,3,2,...,104686.0,102549.0,101400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27234,240000.0,1,1,1,35,-1,-1,-1,0,-1,...,21790.0,17102.0,13367.0,22659.0,2017.0,21817.0,1120.0,13434.0,22772.0,22820.0


In [0]:
transformed_features = None

## Tranformation pipeline classes

In [0]:
class StdCol():
    """
    Standardizes column 'col_name' in a pipeline
    """
    def __init__(self, col_name):
        self.col_name = col_name
    
    def fit(self, X, y=None):
        self.ss = StandardScaler()
        self.ss.fit(X.loc[:, self.col_name].values.reshape(-1, 1))
        return self

    def transform(self, X):
        X = X.copy(deep=True)
        X.loc[:, self.col_name] = self.ss.transform(X.loc[:, self.col_name].values.reshape(-1, 1))
        global transformed_features
        transformed_features = X
        return X

In [0]:
class LogCol():
    """
    Log transforms column 'col_name' in a pipeline
    """
    def __init__(self, col_name):
        self.col_name = col_name
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy(deep=True)
        X.loc[X[self.col_name]<1, self.col_name] = 1
        X.loc[:, self.col_name] = np.log(X.loc[:, self.col_name])
        global transformed_features
        transformed_features = X
        return X

In [0]:
class CategoricalColInt():
    """
    Tranforms column 'col_name' into n-1 categorical columns
    """
    def __init__(self, col_name):
        self.col_name = col_name
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy(deep=True)
        dummies = pd.get_dummies(X.loc[:, self.col_name], prefix=self.col_name)
        dummies_cols = list(dummies.columns)[1:] # drop last new category to avoid feature correlation
        X = X.merge(dummies.loc[:,dummies_cols], left_index=True, right_index=True)
        X = X.drop(columns=self.col_name, axis=1)
        global transformed_features
        transformed_features = X
        return X

In [0]:
class AveColumns():
    """
    Calc average of columns
    """
    def __init__(self, feature_name, cols):
        self.feature_name = feature_name
        self.cols = cols
        pass
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy(deep=True)
        X.loc[:, self.feature_name] = X.loc[:, self.cols].mean(axis=1)
        global transformed_features
        transformed_features = X
        return X

In [0]:
class StDevColumns():
    """
    Calc stardard deviation of columns cols
    """
    def __init__(self, feature_name, cols):
        self.feature_name = feature_name
        self.cols = cols
        pass
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy(deep=True)
        X.loc[:, self.feature_name] = X.loc[:, self.cols].std(axis=1)
        global transformed_features
        transformed_features = X
        return X

In [0]:
class SelectKBestFeatures():
    """
    Selects num of features to K_best
    """
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self.selector = SelectKBest(k=20).fit(X, y)
        return self

    def transform(self, X):
        X = X.copy(deep=True)
        return self.selector.transform(X)

## Train pipeline

In [17]:
pipe_list = []
ave_cols = {"avg_bill": ["bill_amt1", "bill_amt2", "bill_amt3", "bill_amt4", "bill_amt5", "bill_amt6"],
            "avg_pay": ["pay_amt1", "pay_amt2", "pay_amt3", "pay_amt4", "pay_amt5", "pay_amt6"]}
st_dev_cols = {"st_dev_bill": ["bill_amt1", "bill_amt2", "bill_amt3", "bill_amt4", "bill_amt5", "bill_amt6"],
            "st_dev_pay": ["pay_amt1", "pay_amt2", "pay_amt3", "pay_amt4", "pay_amt5", "pay_amt6"]}
categorical_cols = ["sex", "education", "marriage"]
log_cols = ["limit_bal", "avg_bill", "avg_pay",
            "bill_amt1", "bill_amt2", "bill_amt3", "bill_amt4", "bill_amt5", "bill_amt6", 
            "pay_amt1", "pay_amt2", "pay_amt3", "pay_amt4", "pay_amt5", "pay_amt6"]
standardize_cols = ["age"]
for i in ave_cols:
    pipe_list.append(("enc_ave_"+i, AveColumns(i, ave_cols[i])))
for i in st_dev_cols:
    pipe_list.append(("enc_stdev_"+i, StDevColumns(i, st_dev_cols[i])))
for i in categorical_cols:
    if i in X_train.columns:
        pipe_list.append(("enc_cat_"+i, CategoricalColInt(col_name=i)))
for i in log_cols:
    if i in X_train.columns:
        pipe_list.append(("enc_log_"+i, LogCol(col_name=i)))
for i in standardize_cols:
    if i in X_train.columns:
        pipe_list.append(("enc_std_"+i, StdCol(col_name=i)))
pipe_list.append(("k_best_selector", SelectKBestFeatures()))
pipe_list.append(("model", LogisticRegression()))
pipeline = Pipeline(pipe_list)
model = pipeline.fit(X_train, y_train)
transformed_features.columns



Index(['limit_bal', 'age', 'pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5',
       'pay_6', 'bill_amt1', 'bill_amt2', 'bill_amt3', 'bill_amt4',
       'bill_amt5', 'bill_amt6', 'pay_amt1', 'pay_amt2', 'pay_amt3',
       'pay_amt4', 'pay_amt5', 'pay_amt6', 'avg_bill', 'avg_pay',
       'st_dev_bill', 'st_dev_pay', 'sex_2', 'education_2', 'education_3',
       'education_4', 'marriage_2', 'marriage_3'],
      dtype='object')

In [18]:
y_pred = model.predict(X_train)
view_metrics(y_train, y_pred)

Accuracy: 0.8031428571428572
Precision: 0.6782894736842106
Recall: 0.22048759623609923
F1: 0.3327953518398967




In [0]:
# parameters = {
#     "model__penalty": ["l2"],
#     "model__solver": ["lbfgs", "liblinear"],
#     "model__max_iter": [50, 100], 
#     "model__C": [0.7, 0.3, 0.1]}
# grid = GridSearchCV(pipeline, parameters, cv=4, scoring="f1")
# grid.fit(X_train, y_train)
# print(grid.best_params_)
# y_pred = grid.predict(X_train)
# view_metrics(y_train, y_pred)

In [0]:
# print("y_train")
# print(np.array(y_train)[:200])
# print("y_pred")
# print(y_pred[:200])

In [0]:
# loc_model = "/content/gdrive/My Drive/Colab Notebooks/uci-credit-card-defaults"
# dump(grid.best_estimator_, "{}/model.joblib".format(loc_model)) 