In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn
import time
%matplotlib inline

In [2]:
df_raw = pd.read_csv('creditcard.csv')
print(df_raw['Class'].value_counts())
print('Percent fraud: {}%'.format(
    ((df_raw['Class']==1).sum()/(df_raw['Class']==0).sum())*100))

0    284315
1       492
Name: Class, dtype: int64
Percent fraud: 0.17304750013189596%


## Data Cleaning

- Dataset is huge and very imbalanced
- Take a subset of data, keep ratio intact
- Components are already principle components, perform some feature selection
- Address imbalance with under/oversampling techniques

### Sample dataset

In [3]:
#sample 10% of the full dataset
#keep the fraud ratio close using random state
df_sample = df_raw.sample(frac=0.1, replace=True, random_state=6)
print(df_sample.Class.value_counts())
print('\nPercent Fraud:')
print('\nSample df: {}%'.format(
    ((df_sample['Class']==1).sum() / (df_sample['Class']==0).sum())*100))
print('\nFull df: {}%'.format(
    ((df_raw['Class']==1).sum() / (df_raw['Class']==0).sum())*100))

y_sample = df_sample['Class'] #target
X_sample = df_sample.loc[:, ~df_sample.columns.isin(['Class'])] #data

0    28432
1       49
Name: Class, dtype: int64

Percent Fraud:

Sample df: 0.17234102419808667%

Full df: 0.17304750013189596%


### Feature selection

In [22]:
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

k=3
kbest = SelectKBest(f_classif, k=k) #instantiate
kbest.fit(X_sample, y_sample)
mask = kbest.get_support()
k_features = []
for bool, feature in zip(mask, X_sample.columns):
    if bool:
        k_features.append(feature)
print('{} kbest features:'.format(k))
print(k_features)

X_kbest = df_sample[k_features]

3 kbest features:
['V12', 'V14', 'V17']


In [23]:
from sklearn.model_selection import train_test_split

#random state to keep ratio intact
X_ktrain, X_ktest, y_train, y_test = train_test_split(X_kbest,
                                                      y_sample,
                                                      test_size=.2,
                                                      random_state=112)
#print(y_train.value_counts())
#print(y_test.value_counts())

### Class Imbalance
SKLearn's undersample removes a lot of data, try another method

__imblearn random sampling:__
- Cluster the records of the majority class
- Under-sample: remove records from each cluster, thus seeking to preserve information
- Over-sample: instead of creating exact copies of the minority class records, this introduces small variations into those copies, creating more diverse synthetic samples

In [24]:
#Oversample train sets to retain data
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_ros_train, y_ros_train = ros.fit_sample(X_ktrain, y_train)
print(len(X_ros_train), len(y_ros_train))

45490 45490


#### Final train & test sets

In [25]:
X_train = X_ros_train

X_test = X_ktest

y_train = y_ros_train

y_test = y_test

## Gradient Boosting Classifier

In [26]:
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
#parameter search

params = [{'loss':['deviance','exponential'],
           'learning_rate':[0.01, 0.1, 1],
           'n_estimators':[125, 250, 500],
           'max_depth':[2, 3]}]

clf = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 1, 'loss': 'exponential', 'max_depth': 2, 'n_estimators': 250}

Best score:
 0.9998021543196307

runtime:
 203.814886 seconds


¿ GridSearchCV often finds different parameters each time I run it, how should I handle this ?

In [27]:
#train with best params
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

cv = 10
clf2 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=1,
                                           n_estimators=250,
                                           max_depth=2)

start_time = time.clock()
clf2.fit(X_train, y_train)
scores_clf2 = cross_val_score(clf2, X_train, y_train, cv=cv)

#AUROC score
prob_y = clf2.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf2)
print('\nscore array mean:\n', np.mean(scores_clf2))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99978022 1.         0.99978022 0.99978022 0.99978022 1.
 0.99956025 0.99956025 0.99978012 0.99978012]

score array mean:
 0.9997801617907159

AUROC score:
 1.0

runtime:
 18.40108299999997 seconds


In [28]:
#run on test set
start_time = time.clock()
scores_clf2_test = cross_val_score(clf2, X_test, y_test, cv=cv)
prob_y_test = clf2.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf2_test)
print('\nscore array mean:\n', np.mean(scores_clf2_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99824561 1.         0.99649123 0.99824561 0.99824561 1.
 1.         1.         0.99824253 0.99824253]

score array mean:
 0.9987713131686864

AUROC score:
 0.9516441005802708

runtime:
 1.2873839999999745 seconds


### Apply subsample parameter

In [30]:
params = [{'loss':['exponential'],
           'learning_rate':[1],
           'n_estimators':[250],
           'max_depth':[2],
           'subsample':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]

clf3 = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf3, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 1, 'loss': 'exponential', 'max_depth': 2, 'n_estimators': 250, 'subsample': 0.2}

Best score:
 0.9997362057595076

runtime:
 46.85390200000006 seconds


In [31]:
clf4 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=1,
                                           n_estimators=250,
                                           max_depth=2,
                                           subsample=0.2)
start_time = time.clock()
clf4.fit(X_train, y_train)
scores_clf4 = cross_val_score(clf4, X_train, y_train, cv=cv)

#AUROC score
prob_y = clf4.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf4)
print('\nscore array mean:\n', np.mean(scores_clf4))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99978022 0.99978022 0.99978022 1.         1.         1.
 0.99956025 0.99978012 0.99956025 0.99978012]

score array mean:
 0.999802139812694

AUROC score:
 1.0

runtime:
 18.876193000000058 seconds


In [32]:
#run on test set
start_time = time.clock()
scores_clf4_test = cross_val_score(clf4, X_test, y_test, cv=cv)
prob_y_test = clf4.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf4_test)
print('\nscore array mean:\n', np.mean(scores_clf4_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99824561 1.         0.99473684 0.99824561 1.         1.
 0.99824561 0.99824253 0.99824253 0.99824253]

score array mean:
 0.9984201276477662

AUROC score:
 0.9262528573940566

runtime:
 2.3189390000000003 seconds


### Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

params_lr = [{'penalty':['l1','l2'],
           'C':[0.01, 0.1, 1, 10],
           'fit_intercept':['True','False']}]

lr = LogisticRegression()
grid = GridSearchCV(estimator=lr, param_grid=params_lr)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'C': 0.01, 'fit_intercept': 'True', 'penalty': 'l2'}

Best score:
 0.8474609804352605

runtime:
 1.3314260000000786 seconds


In [34]:
#train
from sklearn.linear_model import LogisticRegression
lr2 = LogisticRegression(C=.01, penalty='l2', fit_intercept=True)

start_time = time.clock()
lr2.fit(X_train, y_train)
scores_lr2 = cross_val_score(lr2, X_train, y_train, cv=cv)
prob_y = lr2.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_lr2)
print('\nscore array mean:\n', np.mean(scores_lr2))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.84373626 0.84131868 0.85010989 0.84879121 0.85494505 0.85070361
 0.85004398 0.84278804 0.84256816 0.84564644]

score array mean:
 0.8470651318777968

AUROC score:
 0.9153448193460321

runtime:
 0.439918999999918 seconds


In [35]:
#test
start_time = time.clock()
scores_lr2_test = cross_val_score(lr2, X_test, y_test, cv=cv)
prob_y_test = lr2.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_lr2_test)
print('\nscore array mean:\n', np.mean(scores_lr2_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99824561 0.99824561 0.99824561 0.99824561 1.         1.
 1.         0.99824253 1.         1.        ]

score array mean:
 0.9991224986896062

AUROC score:
 0.9920344645683137

runtime:
 0.06246600000008584 seconds


### Ridge Classifier

In [36]:
from sklearn.linear_model import RidgeClassifier

params_rclf = [{'alpha':[0.01, 0.1, 1, 10],
              'fit_intercept':['True','False']}]

rclf = RidgeClassifier()
grid = GridSearchCV(estimator=rclf, param_grid=params_rclf)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'alpha': 0.01, 'fit_intercept': 'True'}

Best score:
 0.8585183556825676

runtime:
 0.19184200000006513 seconds


In [37]:
#train
rclf2 = RidgeClassifier(alpha=.01, fit_intercept=True)
rclf2.fit(X_train,y_train)

scores_rclf2 = cross_val_score(rclf2, X_train, y_train, cv=cv)
#prob_y = rclf.predict_proba(X_ktrain)
#prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_rclf2)
print('\nscore array mean:\n', np.mean(scores_rclf2))
#print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.85538462 0.85406593 0.86197802 0.85692308 0.86571429 0.86323659
 0.86609499 0.8526825  0.84938434 0.85861917]

score array mean:
 0.8584083524215449

runtime:
 0.29351500000007036 seconds


In [38]:
#test
start_time = time.clock()
scores_rclf2_test = cross_val_score(rclf2, X_test, y_test, cv=cv)

print('score array:\n', scores_rclf2_test)
print('\nscore array mean:\n', np.mean(scores_rclf2_test))
#print('\nAUROC score:\n', roc_auc_score(y_test, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99824561 0.99824561 0.99824561 0.99824561 0.99824561 1.
 0.99824561 0.99824253 0.99824253 0.99824253]

score array mean:
 0.9984201276477662

runtime:
 0.03832899999997608 seconds
