In [None]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

sns.set_style('whitegrid')

# Problem definition

Apply classification models to predict whether a credit card client will default on their payment next month.

# Load the data

In [None]:
# Source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
DATA_PATH = 'data/default of credit card clients.csv'

# Keep the frame exactly as read from disk and do all further work on a copy.
df_original = pd.read_csv(DATA_PATH)
df = df_original.copy()

# Show the available column names, then preview the first rows.
print(df.columns)
df.head()

In [None]:
# TODO: check for NaNs


# Feature Engineering 

In [None]:
# TODO: remove a confusing column


In [None]:
# TODO: remove line with PAY_* < -1


In [None]:
# TODO: create a loop to transform the categorical columns to numerical
# for col in ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']:
#     df_dummies = pd.get_dummies(df[col])
#     df_dummies.columns = [str(col)+'_'+str(c) for c in df_dummies.columns]
#     df = pd.concat([df, df_dummies], axis=1)
#     # Remove the original columns
#     del df[col]
# df.head()

In [None]:
# Separate the column names into the prediction target and the model inputs:
# every column other than the target is used as a feature.
target_name = 'default payment next month'
y_column = [target_name]
X_columns = [col for col in df.columns if col != target_name]

In [None]:
# Display the feature column names (X_columns is already a plain list).
X_columns

# Model Training

In [None]:
# Split the data: `threshold` is the fraction of rows kept for training,
# the remainder becomes the test set. The fixed seed makes the shuffle repeatable.
threshold = 0.8
X = df[X_columns]
y = df[y_column]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1.0-threshold,
    shuffle=True,
    random_state=42,
)

# Report the shape of each partition as a quick sanity check.
for label, part in [
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
]:
    print(label, part.shape)

# Model Training / Evaluation - Using Split

In [None]:
# TODO: try different models
# Each entry is a (display name, estimator) pair.
models = [
    ('KNeighborsClassifier', KNeighborsClassifier())
]

# Fit every candidate on the training split and score it on the test split.
results = []
for name, model in models:
    print('MODEL', name)
    # y_train is a one-column DataFrame; flatten it to the 1-D array fit() expects.
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    results.append([name, precision, recall])

#     # if there is a feature importance, print top 10
#     importance = []
#     if hasattr(model, 'feature_importances_'):
#         print('Feature Importance')
#         importance = []
#         for i in range(len(X_columns)):
#             importance.append([X_columns[i], model.feature_importances_[i]])
#         print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
#     elif hasattr(model, 'coef_'):
#         print('Feature Importance')
#         for i in range(len(X_columns)):
#             importance.append([X_columns[i], model.coef_[i]])
#         print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))

#     print('')

# Tabulate the scores, best precision first.
df_results = pd.DataFrame(
    results, columns=['model', 'precision', 'recall']
).sort_values(by='precision', ascending=False)
df_results

# Model Training / Evaluation - Cross Validation

In [None]:
# Number of cross-validation folds.
k = 10

# results[model name][metric name] -> list with one score per fold.
results = {}
for name, estimator in models:
    print('MODEL', name)
    results[name] = {'precision':[], 'recall':[]}
    kf = KFold(n_splits=k)
    for train_index, test_index in kf.split(X):
        # Fold-local names keep the hold-out split (X_train, X_test, y_train,
        # y_test) intact for cells that use it after this one.
        X_fold_train, X_fold_test = X.values[train_index], X.values[test_index]
        y_fold_train, y_fold_test = y.values[train_index], y.values[test_index]
        # fit() discards any previous fit, so reusing the estimator across folds is fine.
        estimator.fit(X_fold_train, y_fold_train.ravel())
        y_fold_pred = estimator.predict(X_fold_test)
        results[name]['precision'].append(precision_score(y_fold_test, y_fold_pred))
        results[name]['recall'].append(recall_score(y_fold_test, y_fold_pred))

# One box plot per metric, with one box per model showing the spread over folds.
for metric in ['precision', 'recall']:
    labels = list(results)
    values = [results[label][metric] for label in labels]
    plt.figure(figsize=(12,6))
    plt.title(metric)
    plt.boxplot(values)
    plt.xticks(range(1, len(labels)+1), labels, rotation='horizontal')
    plt.show()

# Tuning the Thresholds

In [None]:
# Rebuild the hold-out split here (same parameters and seed as the original
# split) so this cell does not depend on whatever X_train / y_train were last
# bound to by earlier cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1.0-threshold, shuffle=True, random_state=42
)

# Fixed seed so the forest, and therefore the numbers below, are reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)
# np.ravel flattens the one-column target whether it is a DataFrame or an ndarray.
model.fit(X_train, np.ravel(y_train))

# Predicted probability of the positive class (column 1); it does not depend on
# the cutoff, so compute it once outside the loop.
positive_proba = model.predict_proba(X_test)[:,1]

# Sweep the decision cutoff over 0.1, 0.2, ..., 0.9 and report how precision
# and recall trade off against each other.
for i in range(1,10):
    print(i)
    cutoff = i/10.0
    y_pred = (positive_proba > cutoff).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)