# Imbalanced Lab on the Caravan Dataset


In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC

import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(8)



# Load the dataset and shuffle

In [3]:
df = pd.read_csv('../../datasets/Caravan.csv', index_col = False)

# Shuffles your dataframe in-place and resets the index
df = df.sample(frac=1).reset_index(drop=True)

y = df.Purchase.factorize()[0]
X = df.drop(['Purchase'], axis=1).values


# Dumb classifier
The classifier always predicts zeroes. What's the accuracy?

        np.zeroes


In [5]:
y_hat = np.zeros(len(y))

print(accuracy_score(y_hat,y))


0.940226726211


# Simple Support Vector Machine

* Split the data into train and test (80/20) with train_test_split
* Using GridSearchCV train a SVC with params:
       * parameters = {'C': [0.001, 0.01,0.1, 0.5,1]}
       * 5 K-fold
       * auc scoring
       * rbf kernel
       
See [SVC in scikit](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

Print out

* Grid scores
* Accuracy
* confusion matrix 
* roc auc score


In [7]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=42)
# => 273 / 75 Yes

parameters = {'C': [0.001, 0.01,0.1, 0.5, 1]}

clf = GridSearchCV(SVC(kernel='rbf'), parameters, cv=5, scoring='roc_auc')
clf.fit(X_train,y_train)
# confusion matrix of best model


y_hat = clf.predict(X_test)

print( accuracy_score(y_hat,y_test))
# 94% accuracy!!!

# print(classification_report(y_hat, y_test))
print( confusion_matrix(y_hat,y_test))
clf.grid_scores_

0.947639484979
[[1104   61]
 [   0    0]]


[mean: 0.57257, std: 0.02661, params: {'C': 0.001},
 mean: 0.63379, std: 0.03615, params: {'C': 0.01},
 mean: 0.64054, std: 0.03854, params: {'C': 0.1},
 mean: 0.64114, std: 0.03898, params: {'C': 0.5},
 mean: 0.64057, std: 0.03923, params: {'C': 1}]

# Under sample

On the original dataset, select all the 348 Yes samples and select a random sample of 348 No samples. 

Build a dataframe under_df composed of 348 yes samples and 348 No samples. (Using df.append)

Shuffle, split the under_df into train and test (80/20) and carry out the same grid search as above.

* Grid scores
* Accuracy
* confusion matrix 
* roc auc score



In [11]:
under_df = df[df.Purchase == 'Yes']
under_df = under_df.append(df[df.Purchase == 'No'].sample(348)).sample(frac=1).reset_index(drop=True)

# under_df has 696 samples with 50/50 yes no
y = under_df.Purchase.factorize()[0]
X = under_df.drop(['Purchase'], axis=1).values

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=42)
parameters = {'C': [0.001, 0.01,0.1, 0.5,1]}

clf = GridSearchCV(SVC(kernel='rbf'), parameters, cv=5, scoring='roc_auc')
clf.fit(X_train,y_train)
# confusion matrix of best model


y_hat = clf.predict(X_test)

print( accuracy_score(y_hat,y_test))
print( confusion_matrix(y_hat,y_test))
print( classification_report(y_hat,y_test))


clf.grid_scores_

0.628571428571
[[46 34]
 [18 42]]
             precision    recall  f1-score   support

          0       0.72      0.57      0.64        80
          1       0.55      0.70      0.62        60

avg / total       0.65      0.63      0.63       140



[mean: 0.67883, std: 0.03527, params: {'C': 0.001},
 mean: 0.67844, std: 0.03534, params: {'C': 0.01},
 mean: 0.67831, std: 0.03510, params: {'C': 0.1},
 mean: 0.70307, std: 0.04951, params: {'C': 0.5},
 mean: 0.70453, std: 0.05615, params: {'C': 1}]

# Over sample

This time build an over_df dataframe by replicating 4 times the original Yes samples and 348 * 5 No Samples

The over_df dataframe should have a total of 3480 samples

Shuffle, split the under_df into train and test (80/20) and carry out the same grid search as above.

* Grid scores
* Accuracy
* confusion matrix 
* roc auc score



In [12]:
df = pd.read_csv('../../datasets/Caravan.csv', index_col = False)

# let's replicate the number of yes cases by 6
tmp = df[df.Purchase == 'Yes']
for _ in range(4):
    tmp = tmp.append( df[df.Purchase == 'Yes']  )

df = df.append(tmp).sample(frac=1).reset_index(drop=True)
print(df.Purchase.value_counts())
# we now have 5474 / 2088

y = df.Purchase.factorize()[0]
X = df.drop(['Purchase'], axis=1).values
scale = StandardScaler()
X = scale.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=42)

parameters = {'C': [0.001, 0.01,0.1, 0.5,1]}

clf = GridSearchCV(SVC(kernel='rbf'), parameters, cv=5, scoring='roc_auc')
clf.fit(X_train,y_train)
clf.grid_scores_

y_hat = clf.predict(X_test)
print( accuracy_score(y_hat,y_test))
# 83% accuracy!!!

print(classification_report(y_hat, y_test))
print( confusion_matrix(y_hat,y_test))



No     5474
Yes    2088
Name: Purchase, dtype: int64




0.844018506279
             precision    recall  f1-score   support

          0       0.94      0.85      0.90      1201
          1       0.59      0.81      0.68       312

avg / total       0.87      0.84      0.85      1513

[[1025  176]
 [  60  252]]


# SMOTE

Reload the original dataset, split train/ test

Generate new SMOTE data

    from imblearn.over_sampling import SMOTE
    smo_X, smo_y = smote.fit_sample(X_train, y_train)

GridsearchCV with the same parameters as before on the  smo_X, smo_y sets.

Conclusion?


In [15]:
df = pd.read_csv('../../datasets/Caravan.csv', index_col = False)

df = df.sample(frac=1).reset_index(drop=True)
y = df.Purchase.factorize()[0]
X = df.drop(['Purchase'], axis=1).values
scale = StandardScaler()
X = scale.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=42)

from imblearn.over_sampling import SMOTE

smox, smoy = SMOTE.fit_sample(X_train, y_train)
smox_vis = pca.transform(smox)

parameters = {'C': [0.001, 0.01,0.1, 0.5,1]}

clf = GridSearchCV(SVC(kernel='rbf'), parameters, cv=5, scoring='roc_auc')
clf.fit(smox,smoy)
clf.grid_scores_
y_hat = clf.predict(X_test)




TypeError: fit_sample() missing 1 required positional argument: 'y'