# Assignment 12
## Applied Machine Learning

Andrew Chan 
EBE869

Credit card fraud costs about 1% to the banks, an amount which customers (us) eventually
pay. Let's find those anomalies which might reveal fraud. Download the popular credit card
dataset from Kaggle.

# 1. [10 pts] Pre-process the dataset, and then apply normalization or standardization, list number of rows and columns, check sanity

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import math
import numpy as np

# Locate and load the data file
df = pd.read_csv('creditcard.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns.values

In [None]:
df.dtypes

## Mean Imputation

In [None]:
df = df.fillna(df.mean())

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column; the 'Class' label is left untouched.
# The feature list is derived from the frame itself instead of hard-coding
# the column names, so it cannot drift out of sync with the CSV schema.
columns_without_target = [col for col in df.columns if col != 'Class']

# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling. Consider fitting on the training split only
# (for example with a sklearn Pipeline).
scaler = StandardScaler()
df[columns_without_target] = scaler.fit_transform(df[columns_without_target])

## Correlation for feature reduction

In [None]:
df.corr().sort_values(by='Class', ascending=False)['Class']

## Selecting highly correlated features with 'Class'


|Feature|Corr|
-|-
V11    |     0.154876  
V4     |     0.133447  
V1     |   -0.101347  
V18    |  -0.111485  
V7     |    -0.187257  
V3     |    -0.192961  
V16    |    -0.196539  
V10    |    -0.216883  
V12    |    -0.260593  
V14    |    -0.302544  
V17    |    -0.326481  

In [None]:
# Wider candidate set (every feature listed in the table above), kept for reference:
# selected_features_with_label = ['V11','V4','V1','V18','V7','V3','V16','V10','V12','V14','V17','Class']
# Final choice: the four features with the largest absolute correlation to
# 'Class' in the table above, plus the label column itself.
selected_features_with_label = ['V10','V12','V14','V17','Class']

In [None]:
df = df[selected_features_with_label]

## Sanity check number of rows and columns

In [None]:
# Sanity check: report the frame's dimensions, then preview the first rows.
n_rows, n_cols = df.shape
print(f'N rows={n_rows}, M columns={n_cols}')
df.head()

# 2. [10 pts] Check the class balance and pick an evaluation metric.

### Class Balance

In [None]:
df['Class'].value_counts()

### Evaluation Metric

Since the classes are heavily skewed towards non-fraud, accuracy would be misleading, so we use `Recall` (the `true positive rate`): we would rather err on the side of caution and flag a transaction as fraud than miss it. If we miss fraud, we could lose a lot of money, for example if the customer cancels their credit card with us:

$RECALL = TPR = \frac{TP}{P} = \frac{TP}{FN + TP}$

# 3. [20 pts] Split the dataset 50-50 for training and testing. 
Then run DecisionTreeClassifier,
SVC, MLPClassifier without any tree pruning or regularization. Report the classification
performance.

In [None]:
# Separate the label vector from the feature matrix, both as NumPy arrays.
y = df['Class'].to_numpy()
X = df.drop(columns='Class').to_numpy()

## 50-50 split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state=0)

## DecisionTreeClassifier

In [None]:
from sklearn import tree

# Baseline decision tree: default settings, so no cost-complexity pruning.
clf1 = tree.DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# plt.subplots returns a (figure, axes) pair; unpack it so `fig` really is
# the Figure and the tree is drawn on an explicit Axes.
fig, ax = plt.subplots(nrows=1, ncols=1, dpi=600)

# plot_tree expects class names in ascending class order, which is the order
# of the fitted estimator's classes_ (df['Class'].unique() is only in order
# of first appearance).
tree.plot_tree(clf1,
               class_names=[str(label) for label in clf1.classes_],
               filled=True,
               ax=ax)

plt.show()

In [None]:
# Evaluate the unpruned tree on the held-out half; recall is the chosen metric.
y_pred = clf1.predict(X_test)
print('RECALL SCORE:', metrics.recall_score(y_true=y_test, y_pred=y_pred))

## SVC

In [None]:
from sklearn import svm

# RBF-kernel SVC with its default C (no tuning), scored by recall on the test split.
clf_rbf_svc = svm.SVC(kernel='rbf')
clf_rbf_svc.fit(X_train, y_train)
y_pred = clf_rbf_svc.predict(X_test)
print('RECALL SCORE:', metrics.recall_score(y_true=y_test, y_pred=y_pred))

## MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline MLP: one hidden layer of 10 units with the L2 penalty switched off (alpha=0).
# random_state=None leaves the weight initialisation unseeded, so the score
# can differ from run to run.
mlp1 = MLPClassifier(
    hidden_layer_sizes=(10,),
    alpha=0,
    random_state=None,
    max_iter=10000,
)
mlp1.fit(X_train, y_train)
y_pred = mlp1.predict(X_test)
print('RECALL SCORE:', metrics.recall_score(y_true=y_test, y_pred=y_pred))

# 4. [20 pts] Run DecisionTreeClassifier, SVC, MLPClassifier with tree pruning and regularization 
(Hint: Use GridSearchCV to optimize the regularization parameters). Report
the classification performance.

## Decision Tree

In [None]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# Tune the cost-complexity pruning strength (ccp_alpha) by cross-validated recall.
parameters = {'ccp_alpha':[1e-12, 1e-9, 1e-6, 1e-3]}
clf_dt = tree.DecisionTreeClassifier(random_state=0)
clf = GridSearchCV(clf_dt, parameters, scoring = 'recall')
clf.fit(X_train, y_train)

# plt.subplots returns a (figure, axes) pair; unpack it so `fig` really is
# the Figure and the tree is drawn on an explicit Axes.
fig, ax = plt.subplots(nrows=1, ncols=1, dpi=600)

# plot_tree expects class names in ascending class order, which is the order
# of the fitted estimator's classes_ (df['Class'].unique() is only in order
# of first appearance).
tree.plot_tree(clf.best_estimator_,
               class_names=[str(label) for label in clf.best_estimator_.classes_],
               filled=True,
               ax=ax)
plt.show()

In [None]:
pd.DataFrame(clf.cv_results_)

In [None]:
# Test-set recall of the pruned tree selected by the grid search (50-50 split).
y_pred = clf.best_estimator_.predict(X_test)
dt_recall_50 = metrics.recall_score(y_true=y_test, y_pred=y_pred)
print('RECALL SCORE:', dt_recall_50)

## SVC

In [None]:
# Grid-search the SVC parameter C (RBF kernel), selecting by cross-validated recall.
parameters = {
    'kernel': ['rbf'],
    'C': [1e-3, 10, 1e3],
}
clf_rbf_svc = GridSearchCV(estimator=svm.SVC(), param_grid=parameters, scoring='recall')
clf_rbf_svc.fit(X_train, y_train)
y_pred = clf_rbf_svc.best_estimator_.predict(X_test)

In [None]:
pd.DataFrame(clf_rbf_svc.cv_results_)

In [None]:
clf_rbf_svc.best_params_

In [None]:
# Test-set recall of the tuned SVC (50-50 split); y_pred comes from the
# SVC grid-search cell. The cross-validation table is displayed in its own
# cell, so it is not rebuilt here.
svc_recall_50 = metrics.recall_score(y_test,y_pred)
print('RECALL SCORE:',svc_recall_50)

## MLP

In [None]:
# Tune the MLP's L2 penalty (alpha) by cross-validated recall.
# The alpha grid matches the one used in trainSweep3Models, so the 50-50
# result is directly comparable with the other splits in the final plot.
parameters = {'alpha':[1e-12,1e-9,1e-6,1e-3,1]}
clf_mlp = GridSearchCV(MLPClassifier(hidden_layer_sizes=(10,), random_state=None, max_iter=10000), parameters, scoring = 'recall')
clf_mlp.fit(X_train, y_train)
y_pred = clf_mlp.best_estimator_.predict(X_test)

In [None]:
pd.DataFrame(clf_mlp.cv_results_)

In [None]:
clf_mlp.best_params_

In [None]:
# Test-set recall of the tuned MLP (50-50 split); y_pred comes from the
# MLP grid-search cell. The cross-validation table is displayed in its own
# cell, so it is not rebuilt here.
mlp_recall_50 = metrics.recall_score(y_test,y_pred)
print('RECALL SCORE:',mlp_recall_50)

# 5. [20 pts] Attempt avoiding overfitting while the training is reduced. 
Add splits 40-60, 30-70, 20-80, 10-90, 5-95 and repeat step 3 and step 4.

Let's make a function that will run through all three: DecisionTreeClassifier, SVC, MLPClassifier so we avoid repeated code and bugs:

In [None]:

from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import pandas as pd
import math
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import tree
from sklearn import svm
from sklearn.neural_network import MLPClassifier


def _fit_recall_search(estimator, param_grid, X_train, y_train):
    """Grid-search *estimator* over *param_grid*, selecting by cross-validated recall."""
    search = GridSearchCV(estimator, param_grid, scoring='recall')
    search.fit(X_train, y_train)
    return search


def _report_test_recall(label, search, X_test, y_test):
    """Print the CV table and test-set recall of *search*'s best model; return the recall."""
    y_pred = search.best_estimator_.predict(X_test)
    recall = metrics.recall_score(y_test, y_pred)
    print(pd.DataFrame(search.cv_results_))
    print(f'{label} RECALL SCORE:', recall)
    print()
    return recall


def trainSweep3Models(test_per, X, y):
    """Tune and score a decision tree, an SVC and an MLP on one train/test split.

    ``X`` and ``y`` are split with ``test_size=test_per`` and a fixed
    ``random_state=0``. Each model is tuned with ``GridSearchCV`` using recall
    as the selection score; the best estimator is then evaluated on the test
    part, and the cross-validation table and test recall are printed. The best
    decision tree is also plotted.

    Args:
        test_per: Passed to ``train_test_split`` as ``test_size`` (e.g. 0.6
            for a 40-60 train/test split).
        X: Feature matrix.
        y: Label vector aligned with ``X``.

    Returns:
        Tuple ``(dt_recall, svc_recall, mlp_recall, clf_dt, clf_svc, clf_mlp)``:
        the three test-set recall scores followed by the three fitted
        ``GridSearchCV`` objects.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_per, random_state=0)

    ###################
    ## Decision Tree ##
    ###################
    clf_dt = _fit_recall_search(
        tree.DecisionTreeClassifier(random_state=0),
        {'ccp_alpha': [1e-12, 1e-9, 1e-6, 1e-3]},
        X_train, y_train)
    best_tree = clf_dt.best_estimator_
    _, ax = plt.subplots(nrows=1, ncols=1, dpi=600)
    # Class names are taken from the fitted estimator (ascending class order,
    # as plot_tree expects) rather than from a notebook-global DataFrame.
    tree.plot_tree(best_tree,
                   class_names=[str(label) for label in best_tree.classes_],
                   filled=True,
                   ax=ax)
    plt.show()
    dt_recall = _report_test_recall('Decision Tree', clf_dt, X_test, y_test)

    #########
    ## SVC ##
    #########
    clf_svc = _fit_recall_search(
        svm.SVC(),
        {'kernel': ['rbf'], 'C': [1e-3, 10, 1e3]},
        X_train, y_train)
    svc_recall = _report_test_recall('SVC', clf_svc, X_test, y_test)

    #########
    ## MLP ##
    #########
    # random_state=None: the MLP is unseeded, so its scores can vary between runs.
    clf_mlp = _fit_recall_search(
        MLPClassifier(hidden_layer_sizes=(10,), random_state=None, max_iter=10000),
        {'alpha': [1e-12, 1e-9, 1e-6, 1e-3, 1]},
        X_train, y_train)
    mlp_recall = _report_test_recall('MLP', clf_mlp, X_test, y_test)

    return dt_recall, svc_recall, mlp_recall, clf_dt, clf_svc, clf_mlp

## 40-60
---

In [None]:
dt_recall_40, svc_recall_40, mlp_recall_40, clf_dt_40, clf_svc_40, clf_mlp_40 = trainSweep3Models(0.6,X,y)

## 30-70

In [None]:
%%time
dt_recall_30, svc_recall_30, mlp_recall_30, clf_dt_30, clf_svc_30, clf_mlp_30 = trainSweep3Models(0.7,X,y)

## 20-80

In [None]:
%%time
dt_recall_20, svc_recall_20, mlp_recall_20, clf_dt_20, clf_svc_20, clf_mlp_20 = trainSweep3Models(0.8,X,y)

## 10-90

In [None]:
%%time
dt_recall_10, svc_recall_10, mlp_recall_10, clf_dt_10, clf_svc_10, clf_mlp_10 = trainSweep3Models(0.9,X,y)

## 5-95

In [None]:
%%time
dt_recall_5, svc_recall_5, mlp_recall_5, clf_dt_5, clf_svc_5, clf_mlp_5 = trainSweep3Models(0.95,X,y)

# 6. [20 pts] Plot everything you have on a single plot and comment about your results in terms of training size, regularization, etc.

In [None]:
# Fraction of the data used for training in each experiment, largest first,
# with the matching test-set recall of each tuned model.
trainingSize = [0.5,0.4,0.3,0.2,0.1,0.05]
recall_dt = [dt_recall_50, dt_recall_40,dt_recall_30,dt_recall_20,dt_recall_10,dt_recall_5]
recall_svc = [svc_recall_50, svc_recall_40,svc_recall_30,svc_recall_20,svc_recall_10,svc_recall_5]
recall_mlp =  [mlp_recall_50,mlp_recall_40,mlp_recall_30,mlp_recall_20,mlp_recall_10,mlp_recall_5]

# The x-axis is labelled in percent, so convert the fractions before plotting.
training_percent = [100 * size for size in trainingSize]

# Plot
plt.figure(figsize=(15,10))
plt.plot(training_percent, recall_dt, ':', color='red', label='Decision Tree')
plt.plot(training_percent, recall_svc, ':', color='green', label='SVC')
plt.plot(training_percent, recall_mlp, ':', color='blue', label='MLP')

# Labels
plt.title('Recall versus training size')
plt.xlabel('Training Percent')
plt.ylabel('Recall')
plt.legend(loc='lower right')
plt.grid()
plt.show()