Test SMOTE with different classifiers

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [94]:
import os, pickle, itertools
import numpy as np

import smote_variants as sv

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import sklearn.datasets as datasets
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import utils

## Try smote_variants with brain and ovary using disease labels

In [24]:
filepath = '../data/multiclass_brain_ovary.pkl'
with open(filepath, 'rb') as fin:
    data = pickle.load(fin)

In [25]:
data['disease']

array(['84413', '84413', '84413', ..., '80103', '80103', '80103'],
      dtype=object)

In [31]:
disease_num = dict([(y,x+1) for x,y in enumerate(sorted(set(data['disease'])))])

In [32]:
disease_num = np.array([disease_num[x] for x in data['disease']])

In [7]:
oversampler= sv.MulticlassOversampling(sv.polynom_fit_SMOTE())

In [None]:
X_samp, y_samp= oversampler.sample(data['data'], disease_num)

In [54]:
model = DecisionTreeClassifier()

In [83]:
scores = cross_val_score(model, X_samp, y_samp, cv=10, n_jobs=-1, scoring='f1_macro')

In [84]:
scores

array([0.40373992, 0.8841628 , 0.87359003, 0.82316721, 0.85850258,
       0.97351041, 0.95986294, 0.9544626 , 0.89927534, 0.88191745])

In [77]:
x_train, x_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.30, random_state=50)

In [78]:
model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [79]:
y_pred = model.predict(x_test)

In [64]:
metrics.confusion_matrix(y_test, y_pred)

array([[353,   6,  33,   0,   1,   8,   3,   2,   5,  36,   4,   1,   4],
       [ 17, 501,   7,   3,   0,   0,   3,   2,   1,   5,   0,   0,   0],
       [ 45,   6, 462,   4,   2,   8,   5,   4,   6,  19,   9,   2,   2],
       [  5,   3,   3, 508,   7,   1,   0,   0,   0,   2,   0,   0,   0],
       [  0,   0,   0,   4, 514,   0,   0,   1,   0,   1,   0,   0,   0],
       [  9,   2,   6,   0,   0, 494,   0,   4,   6,  16,   1,   1,   1],
       [  1,   0,   2,   0,   0,   0, 487,   3,   6,   8,  10,   6,   1],
       [  2,   0,   2,   1,   0,   0,   8, 469,   4,   9,   3,   7,   0],
       [ 14,   0,   3,   0,   1,   4,   3,   4, 462,  24,   2,   6,   0],
       [ 31,   3,  24,   2,   5,  12,  10,  18,  24, 341,  17,   7,   4],
       [  3,   1,   3,   0,   0,   1,  10,   4,   8,  12, 481,   6,   0],
       [  1,   0,   1,   0,   0,   0,   5,   0,   1,   6,   5, 555,   0],
       [  3,   1,   2,   0,   1,   1,   0,   1,   0,   4,   0,   0, 497]])

In [74]:
metrics.f1_score(y_test, y_pred, average='macro')

0.8956604661032453

In [80]:
metrics.matthews_corrcoef(y_test, y_pred)

0.8912157174654356

In [6]:
sv.get_all_oversamplers_multiclass()

[smote_variants._smote_variants.SMOTE,
 smote_variants._smote_variants.Borderline_SMOTE1,
 smote_variants._smote_variants.Borderline_SMOTE2,
 smote_variants._smote_variants.LLE_SMOTE,
 smote_variants._smote_variants.distance_SMOTE,
 smote_variants._smote_variants.SMMO,
 smote_variants._smote_variants.polynom_fit_SMOTE,
 smote_variants._smote_variants.ADOMS,
 smote_variants._smote_variants.Safe_Level_SMOTE,
 smote_variants._smote_variants.MSMOTE,
 smote_variants._smote_variants.SMOBD,
 smote_variants._smote_variants.TRIM_SMOTE,
 smote_variants._smote_variants.SMOTE_RSB,
 smote_variants._smote_variants.ProWSyn,
 smote_variants._smote_variants.SL_graph_SMOTE,
 smote_variants._smote_variants.NRSBoundary_SMOTE,
 smote_variants._smote_variants.LVQ_SMOTE,
 smote_variants._smote_variants.SOI_CJ,
 smote_variants._smote_variants.ROSE,
 smote_variants._smote_variants.SMOTE_OUT,
 smote_variants._smote_variants.SMOTE_Cosine,
 smote_variants._smote_variants.Selected_SMOTE,
 smote_variants._smote_var

## Data without others, using site as labels

In [86]:
filepath = '../data/multiclass_sites_wo_others.pkl'
with open(filepath, 'rb') as fin:
    data = pickle.load(fin)

In [87]:
num_labels = dict([(y,x+1) for x,y in enumerate(set(data['site']))])
num_labels = np.array([num_labels[x] for x in data['site']])

In [91]:
X_samp, y_samp= oversampler.sample(data['data'], num_labels)

2020-02-11 10:48:09,764:INFO:MulticlassOversampling: Running multiclass oversampling with strategy equalize_1_vs_many_successive
2020-02-11 10:48:09,952:INFO:MulticlassOversampling: Sampling minority class with label: 7
2020-02-11 10:48:10,156:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star', 'random_state': None}")
2020-02-11 10:48:10,986:INFO:MulticlassOversampling: Sampling minority class with label: 5
2020-02-11 10:48:11,283:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 0.3549586776859504, 'topology': 'star', 'random_state': None}")
2020-02-11 10:48:12,160:INFO:MulticlassOversampling: Sampling minority class with label: 2
2020-02-11 10:48:12,523:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 0.2478920741989882, 'topology': 'star', 'random_state': None}")
2020-02-11 10:48:13,646:INFO:MulticlassOversampling: Sampling minority class with label: 3
2020-02-1

In [112]:
X_samp, y_samp = utils.shuffle(X_samp, y_samp)

In [113]:
scores = cross_val_score(model, X_samp, y_samp, cv=5, n_jobs=-1, scoring='f1_macro')

In [114]:
scores

array([0.80970462, 0.80326647, 0.80250945, 0.80353631, 0.80664082])

## Data using disease labels, without "Others" and "None"

In [115]:
filepath = '../data/multiclass_disease_wo_others_none.pkl'
with open(filepath, 'rb') as fin:
    data = pickle.load(fin)

In [116]:
num_labels = dict([(y,x+1) for x,y in enumerate(set(data['disease']))])
num_labels = np.array([num_labels[x] for x in data['disease']])

In [117]:
X_samp, y_samp= oversampler.sample(data['data'], num_labels)

2020-02-11 12:19:58,095:INFO:MulticlassOversampling: Running multiclass oversampling with strategy equalize_1_vs_many_successive
2020-02-11 12:19:58,270:INFO:MulticlassOversampling: Sampling minority class with label: 7
2020-02-11 12:19:58,489:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star', 'random_state': None}")
2020-02-11 12:19:59,422:INFO:MulticlassOversampling: Sampling minority class with label: 3
2020-02-11 12:19:59,716:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 0.41018431740081224, 'topology': 'star', 'random_state': None}")
2020-02-11 12:20:00,754:INFO:MulticlassOversampling: Sampling minority class with label: 20
2020-02-11 12:20:01,141:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 0.261827186237456, 'topology': 'star', 'random_state': None}")
2020-02-11 12:20:02,386:INFO:MulticlassOversampling: Sampling minority class with label: 34
2020-02

2020-02-11 12:23:44,260:INFO:MulticlassOversampling: Sampling minority class with label: 26
2020-02-11 12:23:49,077:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 0.030945245624782687, 'topology': 'star', 'random_state': None}")
2020-02-11 12:23:59,386:INFO:MulticlassOversampling: Sampling minority class with label: 11
2020-02-11 12:24:04,434:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 0.030022798060559356, 'topology': 'star', 'random_state': None}")


In [118]:
X_samp, y_samp = utils.shuffle(X_samp, y_samp)

In [119]:
scores = cross_val_score(model, X_samp, y_samp, cv=5, n_jobs=-1, scoring='f1_macro')

In [120]:
scores

array([0.93921257, 0.93762157, 0.93861318, 0.9375658 , 0.93672241])

## Data with independent test, site as label

In [126]:
filepath = '../data/multiclass_independent_data_site.pkl'
with open(filepath, 'rb') as fin:
    data = pickle.load(fin)

In [127]:
num_labels = dict([(y,x+1) for x,y in enumerate(set(data['site_train']))])
labels_train = np.array([num_labels[x] for x in data['site_train']])
labels_test = np.array([num_labels[x] for x in data['site_test']])

In [128]:
X_samp, y_samp= oversampler.sample(data['data_train'], labels_train)

2020-02-11 16:58:07,113:INFO:MulticlassOversampling: Running multiclass oversampling with strategy equalize_1_vs_many_successive
2020-02-11 16:58:07,303:INFO:MulticlassOversampling: Sampling minority class with label: 7
2020-02-11 16:58:07,514:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 1.0, 'topology': 'star', 'random_state': None}")
2020-02-11 16:58:08,468:INFO:MulticlassOversampling: Sampling minority class with label: 5
2020-02-11 16:58:08,774:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 0.3547786512205213, 'topology': 'star', 'random_state': None}")
2020-02-11 16:58:09,646:INFO:MulticlassOversampling: Sampling minority class with label: 2
2020-02-11 16:58:10,029:INFO:polynom_fit_SMOTE: Running sampling via ('polynom_fit_SMOTE', "{'proportion': 0.2477993488484264, 'topology': 'star', 'random_state': None}")
2020-02-11 16:58:11,210:INFO:MulticlassOversampling: Sampling minority class with label: 3
2020-02-1

In [129]:
X_samp, y_samp = utils.shuffle(X_samp, y_samp)

In [130]:
scores = cross_val_score(model, X_samp, y_samp, cv=5, n_jobs=-1, scoring='f1_macro')

In [131]:
scores

array([0.80927943, 0.81967044, 0.81258139, 0.81608486, 0.8187457 ])

In [132]:
model.fit(X_sampmp, y_samp)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [133]:
y_predict = model.predict(data['data_test'])

In [135]:
metrics.f1_score(labels_test, y_predict, average=None)

  'recall', 'true', average, warn_for)


array([0.        , 0.        , 0.08      , 0.10526316, 0.        ,
       0.30769231, 0.        , 0.0661157 , 0.        , 0.        ])

In [136]:
metrics.multilabel_confusion_matrix(labels_test, y_predict)

array([[[422,  24],
        [  0,   0]],

       [[417,  29],
        [  0,   0]],

       [[422,  23],
        [  0,   1]],

       [[392,  10],
        [ 41,   3]],

       [[370,  76],
        [  0,   0]],

       [[ 50,   1],
        [323,  72]],

       [[388,  58],
        [  0,   0]],

       [[329, 111],
        [  2,   4]],

       [[425,  21],
        [  0,   0]],

       [[433,  13],
        [  0,   0]]])

In [139]:
metrics.accuracy_score(labels_test, y_predict)

0.17937219730941703