In [1]:
import numpy as np
import pandas as pd
import multivariate_os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn import over_sampling
from numpy.random import multivariate_normal
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import roc_curve, auc
import re
from io import StringIO

In [2]:
data = pd.read_csv("Predataset/out_glass1.csv")
X = data.drop('Label', axis=1)
y = data.Label
pos = data[data.Label == 1]
pos = pos.drop('Label', axis=1)

In [3]:
RANDOM_STATE = 42
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=RANDOM_STATE)
cnt = Counter(y_train)
num_minority = int((cnt[0] - cnt[1]))
print('y_train: {}'.format(Counter(y_train)))
print('y_test: {}'.format(Counter(y_test)))
print(num_minority)

y_train: Counter({0: 84, 1: 44})
y_test: Counter({0: 54, 1: 32})
40


In [4]:
zero_list, zero_mean, pos = multivariate_os.find_zerostd(pos)
df = multivariate_os.mnd_os(zero_list, zero_mean, pos, num_minority)

Searching zero std...
Finished.


# Learning

In [5]:
# Load multivariate data
# df + train data
X_mlpd = df.drop('Label', axis=1)
y_mlpd = df.Label
print('y_mlpd: {}'.format(Counter(y_mlpd)))
X_mlpd = pd.concat([X_mlpd, X_train])
y_mlpd = pd.concat([y_mlpd, y_train])
print('y_mlpd: {}'.format(Counter(y_mlpd)))

y_mlpd: Counter({1: 40})
y_mlpd: Counter({1: 84, 0: 84})


In [6]:
#Apply over-sampling
sm_reg = over_sampling.SMOTE(kind='regular', random_state=RANDOM_STATE)
ada = over_sampling.ADASYN(random_state=RANDOM_STATE)
rand = over_sampling.RandomOverSampler(random_state=RANDOM_STATE)

X_reg, y_reg = sm_reg.fit_sample(X_train, y_train)
X_ada, y_ada = ada.fit_sample(X_train, y_train)
X_rand, y_rand = rand.fit_sample(X_train, y_train)
os_list = [[X_train, y_train], [X_reg, y_reg],
           [X_ada, y_ada], [X_rand, y_rand], [X_mlpd, y_mlpd]]

In [7]:
def report_to_df(report):
    report = re.sub(r" +", " ", report).replace("avg / total", "avg/total").replace("\n ", "\n")
    report_df = pd.read_csv(StringIO("Classes" + report), sep=' ', index_col=0)        
    return(report_df)

In [8]:
svm_clf = []
svm_df = pd.DataFrame(index=[], columns=[])

for i in range(len(os_list)):
    svm_clf.append(svm.SVC(random_state=RANDOM_STATE, probability=True).fit(os_list[i][0], os_list[i][1]))
    
for i in range(len(svm_clf)):
    pred = classification_report_imbalanced(y_test, svm_clf[i].predict(X_test))
    svm_df = svm_df.append(report_to_df(pred))
    #calc auc
    prob = svm_clf[i].predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, prob)
    roc_auc_area = auc(fpr, tpr)
    print('AUC={}'.format(roc_auc_area))

AUC=0.7083333333333334
AUC=0.7378472222222223
AUC=0.7239583333333333
AUC=0.7644675925925927
AUC=0.6984953703703703


In [10]:
#Learning : k-NN
k=3
knn_clf = []
knn_df = pd.DataFrame(index=[], columns=[])

for i in range(len(os_list)):
    knn_clf.append(KNeighborsClassifier(n_neighbors=k).fit(os_list[i][0], os_list[i][1]))
    
for i in range(len(knn_clf)):
    pred = classification_report_imbalanced(y_test, knn_clf[i].predict(X_test))
    knn_df = knn_df.append(report_to_df(pred))
    #calc auc
    prob = knn_clf[i].predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, prob)
    roc_auc_area = auc(fpr, tpr)
    print('AUC={}'.format(roc_auc_area))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.75      0.78      0.56      0.76      0.67      0.46        54
          1       0.60      0.56      0.78      0.58      0.67      0.44        32

avg / total       0.69      0.70      0.64      0.70      0.67      0.45        86

AUC=0.7650462962962963
                   pre       rec       spe        f1       geo       iba       sup

          0       0.79      0.70      0.69      0.75      0.68      0.47        54
          1       0.58      0.69      0.70      0.63      0.68      0.45        32

avg / total       0.71      0.70      0.69      0.70      0.68      0.46        86

AUC=0.8032407407407407
                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      0.59      0.72      0.67      0.63      0.41        54
          1       0.51      0.72      0.59      0.60      0.63      0.39        32

avg / total       0.68      0.64 

In [12]:
svm_df.to_csv('output/svm_glass1.csv')
knn_df.to_csv('output/knn_glass1.csv')