In [1]:
import numpy as np
import pandas as pd
import multivariate_os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn import over_sampling as os
from numpy.random import multivariate_normal
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import roc_curve, auc

In [2]:
data = pd.read_csv("/home/yura/Desktop/out_poker-8_vs_6.csv")
X = data.drop('Label', axis=1)
y = data.Label
pos = data[data.Label == 1]
pos = pos.drop('Label', axis=1)

In [3]:

RANDOM_STATE = 42
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=RANDOM_STATE)
cnt = Counter(y_train)
num_minority = int((cnt[0] - cnt[1]))
print('y_train: {}'.format(Counter(y_train)))
print('y_test: {}'.format(Counter(y_test)))
print(num_minority)

y_train: Counter({0: 873, 1: 13})
y_test: Counter({0: 587, 1: 4})
860


In [4]:
zero_list, zero_mean, pos = multivariate_os.find_zerostd(pos)
df = multivariate_os.mnd_os(zero_list, zero_mean, pos, num_minority)

[]
[]
          S1        C1        S2        C2        S3        C3        S4  \
S1  1.000000  0.010273  1.000000  0.016300  1.000000  0.093249  1.000000   
C1  0.010273  1.000000  0.010273  0.820168  0.010273  0.811093  0.010273   
S2  1.000000  0.010273  1.000000  0.016300  1.000000  0.093249  1.000000   
C2  0.016300  0.820168  0.016300  1.000000  0.016300  0.731053  0.016300   
S3  1.000000  0.010273  1.000000  0.016300  1.000000  0.093249  1.000000   
C3  0.093249  0.811093  0.093249  0.731053  0.093249  1.000000  0.093249   
S4  1.000000  0.010273  1.000000  0.016300  1.000000  0.093249  1.000000   
C4  0.071425  0.788312  0.071425  0.767192  0.071425  0.831592  0.071425   
S5  1.000000  0.010273  1.000000  0.016300  1.000000  0.093249  1.000000   
C5  0.051368  0.586248  0.051368  0.781875  0.051368  0.706733  0.051368   

          C4        S5        C5  
S1  0.071425  1.000000  0.051368  
C1  0.788312  0.010273  0.586248  
S2  0.071425  1.000000  0.051368  
C2  0.767192  0.0

# Learning

In [5]:
# Load multivariate data
# df + traindata
mlpd = pd.read_csv("/home/yura/Desktop/mlpd_train.csv")
X_mlpd = mlpd.drop('Label', axis=1)
y_mlpd = mlpd.Label
print('y_mlpd: {}'.format(Counter(y_mlpd)))
X_mlpd = pd.concat([X_mlpd, X_train])
y_mlpd = pd.concat([y_mlpd, y_train])
print('y_mlpd: {}'.format(Counter(y_mlpd)))

y_mlpd: Counter({1: 860})
y_mlpd: Counter({1: 873, 0: 873})


In [6]:
#Apply over-sampling
sm_reg = os.SMOTE(kind='regular', random_state=RANDOM_STATE)
ada = os.ADASYN(random_state=RANDOM_STATE)
rand = os.RandomOverSampler(random_state=RANDOM_STATE)

X_reg, y_reg = sm_reg.fit_sample(X_train, y_train)
X_ada, y_ada = ada.fit_sample(X_train, y_train)
X_rand, y_rand = rand.fit_sample(X_train, y_train)
os_list = [[X_reg, y_reg], [X_rand, y_rand], [X_mlpd, y_mlpd]]

In [7]:
svm_clf = []
for i in range(len(os_list)):
    svm_clf.append(svm.SVC(random_state=RANDOM_STATE, probability=True).fit(os_list[i][0], os_list[i][1]))
    
for i in range(len(svm_clf)):
    print(classification_report_imbalanced(y_test, svm_clf[i].predict(X_test)))
    #calc auc
    prob = svm_clf[i].predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, prob)
    roc_auc_area = auc(fpr, tpr)
    print('AUC={}'.format(roc_auc_area))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.99      0.00      0.99      0.00      0.00       587
          1       0.00      0.00      0.99      0.00      0.00      0.00         4

avg / total       0.99      0.99      0.01      0.99      0.00      0.00       591

AUC=0.614991482112436
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      1.00      0.00      0.99      0.00      0.00       587
          1       0.00      0.00      1.00      0.00      0.00      0.00         4

avg / total       0.99      0.99      0.01      0.99      0.00      0.00       591

AUC=0.6588586030664395
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.99      0.00      0.99      0.00      0.00       587
          1       0.00      0.00      0.99      0.00      0.00      0.00         4

avg / total       0.99      0.99  

  'precision', 'predicted', average, warn_for)
  warn_for)
  average, warn_for)


In [8]:
#Learning : k-NN
k=3
knn_clf = []
for i in range(len(os_list)):
    knn_clf.append(KNeighborsClassifier(n_neighbors=k).fit(os_list[i][0], os_list[i][1]))

for i in range(len(knn_clf)):
    print(classification_report_imbalanced(y_test, knn_clf[i].predict(X_test)))
    #calc auc
    prob = knn_clf[i].predict_proba(X_test)[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, prob)
    roc_auc_area = auc(fpr, tpr)
    print('AUC={}'.format(roc_auc_area))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.92      0.25      0.96      0.15      0.02       587
          1       0.02      0.25      0.92      0.04      0.15      0.02         4

avg / total       0.99      0.92      0.25      0.95      0.15      0.02       591

AUC=0.5732538330494037
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.97      0.00      0.98      0.00      0.00       587
          1       0.00      0.00      0.97      0.00      0.00      0.00         4

avg / total       0.99      0.96      0.01      0.97      0.00      0.00       591

AUC=0.5909284497444633
                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.96      0.00      0.98      0.00      0.00       587
          1       0.00      0.00      0.96      0.00      0.00      0.00         4

avg / total       0.99      0.96 