## Imports

In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

## Dataset setup

In [2]:
# Read the whole space-delimited table into memory as a NumPy array.
data = np.loadtxt(fname='../data/original_data.csv', delimiter=' ')

In [6]:
# Shuffle the rows in place. The global RNG is seeded first so that a
# Restart & Run All always produces the same row order (and therefore the
# same downstream splits and scores).
np.random.seed(0)
np.random.shuffle(data)

In [7]:
# Number of rows (samples) loaded.
data.shape[0]

236698

In [44]:
# Full-range slice: data_sub covers every row of `data` (no subsampling here).
data_sub = data[0:]

In [45]:
# Split the table into features (all columns but the last) and labels (the
# last column). Both stay plain ndarrays: np.matrix is discouraged by NumPy
# and is rejected as estimator input by current scikit-learn releases.
X = data_sub[:, :-1]
y = data_sub[:, -1]  # integer column index yields a 1-D label vector directly
print(X.shape)
print(y.shape)

(236698, 14)
(236698,)


In [46]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. `random_state` makes the split
# reproducible across kernel restarts; `stratify=y` keeps the class
# proportions of `y` the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

## ROC plotting support

In [47]:
from sklearn.metrics import roc_curve

def draw_roc(clf, X_train, X_test, Y_train, Y_test, color, label):
    """Fit `clf` and add its ROC curve to the current matplotlib axes.

    The curve is computed from continuous scores rather than hard class
    labels: with predicted labels there is only one usable threshold, so the
    "curve" degenerates to a single operating point joined to the corners.

    Parameters
    ----------
    clf : estimator
        Object with `fit` and at least one of `decision_function`,
        `predict_proba` or `predict`. It is fitted in place.
    X_train, Y_train : training features and labels passed to `clf.fit`.
    X_test, Y_test : held-out features to score and their true labels.
    color, label : forwarded to `plt.plot` for the line colour and legend entry.
    """
    # train classifier
    clf.fit(X_train, Y_train)

    # Prefer a real-valued score so roc_curve can sweep many thresholds.
    if hasattr(clf, 'decision_function'):
        Y_scores = clf.decision_function(X_test)
    elif hasattr(clf, 'predict_proba'):
        # assumes a binary problem with the positive class in column 1
        Y_scores = clf.predict_proba(X_test)[:, 1]
    else:
        # Last resort: hard labels (original behaviour).
        Y_scores = clf.predict(X_test)

    # calculate roc curve
    fpr, tpr, _ = roc_curve(Y_test, Y_scores)
    plt.plot(fpr, tpr, color=color, label=label)

def draw_rocs(clf, clf_name, X_train, X_smote_train, X_test, Y_train, Y_smote_train, Y_test):
    """Plot ROC curves for `clf` trained on the plain and on the SMOTE data.

    Both curves are produced by `draw_roc` on a fresh figure and scored
    against the same `X_test` / `Y_test`. The same `clf` object is passed to
    both calls, so it is refitted for the second curve. The figure is titled
    with `clf_name`, shown, and then saved as ``ROC_<clf_name>.png`` in the
    current working directory.
    """
    figure = plt.figure()

    # (training features, training labels, line colour, legend entry)
    variants = (
        (X_train, Y_train, 'darkorange', 'UNSMOTEd'),
        (X_smote_train, Y_smote_train, 'navy', 'SMOTEd'),
    )
    for features, targets, colour, legend_entry in variants:
        draw_roc(clf, features, X_test, targets, Y_test, colour, legend_entry)

    # Axis cosmetics: a little headroom above TPR = 1 so the curve is not clipped.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title(clf_name)

    plt.show()
    figure.savefig('ROC_%s.png' % clf_name)

## Compare SMOTEd and unSMOTEd ROCs

In [None]:
# Oversample only the training split; the test split stays untouched so both
# ROC curves are scored on the same, original class distribution.
sm = SMOTE(random_state=0)
# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed
# the old name; prefer the new method and fall back for old installations.
resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_smote_train, y_smote_train = resample(X_train, y_train)
# Seed the SVM so the two curves are reproducible between runs.
draw_rocs(LinearSVC(random_state=0), 'Linear SVM', X_train, X_smote_train, X_test, y_train, y_smote_train, y_test)

## Predict using SMOTE+ENN and Linear SVM

In [31]:
from sklearn.metrics import confusion_matrix
# The imblearn pipeline applies the SMOTEENN resampler only while fitting;
# predictions on X_test go straight to the classifier. Both steps are seeded
# so the confusion matrix is reproducible.
# (A non-resampling alternative would be LinearSVC(class_weight='balanced').)
clf = make_pipeline(SMOTEENN(random_state=0), LinearSVC(random_state=0))
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

Pipeline(memory=None,
     steps=[('smoteenn', SMOTEENN(enn=None, k=None, kind_enn=None, kind_smote=None, m=None,
     n_jobs=None, n_neighbors=None, out_step=None, random_state=0,
     ratio='auto', size_ngh=None, smote=None)), ('linearsvc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])