In [1]:
from sklearn import datasets, model_selection, linear_model, metrics, preprocessing, svm, neighbors, tree
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
% matplotlib inline
import math

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
def get_prc_rcl(conf_matrix):
    prcs = ''
    rcls = ''
    f_meas = ''
    for i in range(conf_matrix.shape[0]):
        tmp_prc = round(conf_matrix[i][i] * 1.0 / sum(conf_matrix[i]), 3)
        tmp_rcl = round(conf_matrix[i][i] * 1.0 / sum(conf_matrix[:, i]), 3)
        if math.isnan(tmp_prc):
            tmp_prc = 0
        if math.isnan(tmp_rcl):
            tmp_rcl = 0    
        if tmp_prc == 0 and tmp_rcl == 0:
            tmp_f_meas = 0
        else:
            tmp_f_meas = round(2 * (tmp_prc * tmp_rcl/(tmp_prc + tmp_rcl)), 3)
        prcs += str(tmp_prc) 
        rcls += str(tmp_rcl) 
        f_meas += str(tmp_f_meas)
        if i < conf_matrix.shape[0] - 1:
            prcs += ' / '
            rcls += ' / '
            f_meas += ' / '
    return prcs, rcls, f_meas

get_prc_rcl(np.array([[172, 40], [18, 339]]))

('0.811 / 0.95', '0.905 / 0.894', '0.855 / 0.921')

In [4]:
def train_model(clf_name, data, target, labels=[0, 1], random_state=0, cv=5, normalize=False):
    cv = model_selection.KFold(n_splits=cv,random_state=random_state)
    accs = []
    rcls = []
    prcs = []
    conf_matr = np.zeros((np.unique(target).size, np.unique(target).size))
    for train_inds, test_inds in cv.split(data):
        if clf_name == 'log':
            clf = linear_model.SGDClassifier(random_state=random_state, n_iter=10000)
        if clf_name == 'svm':
            clf = svm.SVC(kernel='linear', random_state=random_state)
        if clf_name == '5nn':
            clf = neighbors.KNeighborsClassifier(n_neighbors=5)
        if clf_name == 'tree':
            clf = tree.DecisionTreeClassifier(max_depth=10, random_state=random_state)
        train_data = data[train_inds]
        train_labels = target[train_inds]
        test_data = data[test_inds]
        test_labels = target[test_inds]
        if normalize:
            normalizer = preprocessing.Normalizer().fit(train_data)
            train_data = normalizer.transform(train_data)
            test_data = normalizer.transform(test_data)
        clf.fit(train_data, train_labels)
        accs.append(clf.score(test_data, test_labels))
        conf_matr += metrics.confusion_matrix(y_true=test_labels, y_pred=clf.predict(test_data), labels=labels)
    mean_acc = np.array(accs).mean()
    prcs, rcls, f_meas = get_prc_rcl(conf_matrix=conf_matr)  
    return mean_acc, prcs, rcls, f_meas, conf_matr

In [5]:
def test_dataset(data, target, name, labels=[0, 1], normalize=False):
    classifiers = ['log', 'svm', '5nn', 'tree']
    columns = ['dataset', 'algorithm', 'accuracy', 'precision', 'recall', 'F-measure']
    df = pd.DataFrame(columns=columns)    
    for clf in classifiers:
        acc, prcs, rcls, f_meas, conf_matr = train_model(clf_name=clf, data=data, target=target, normalize=normalize, labels=labels)
        row = [name, clf, round(acc, 3), prcs, rcls, f_meas]
        df.loc[classifiers.index(clf)] = row
    return df        

In [6]:
bc = datasets.load_breast_cancer()
test_dataset(bc.data, bc.target, name = 'bc', labels=[0, 1], normalize=True)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,bc,log,0.909,0.811 / 0.966,0.935 / 0.896,0.869 / 0.93
1,bc,svm,0.814,0.505 / 0.997,0.991 / 0.772,0.669 / 0.87
2,bc,5nn,0.912,0.849 / 0.95,0.909 / 0.914,0.878 / 0.932
3,bc,tree,0.924,0.896 / 0.941,0.9 / 0.939,0.898 / 0.94


In [7]:
iris = datasets.load_iris()
test_dataset(iris.data, iris.target, name = 'iris', labels = [0, 1, 2], normalize=False)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,iris,log,0.927,1.0 / 0.92 / 0.86,1.0 / 0.868 / 0.915,1.0 / 0.893 / 0.887
1,iris,svm,0.947,1.0 / 0.92 / 0.92,1.0 / 0.92 / 0.92,1.0 / 0.92 / 0.92
2,iris,5nn,0.913,1.0 / 0.9 / 0.84,1.0 / 0.849 / 0.894,1.0 / 0.874 / 0.866
3,iris,tree,0.907,1.0 / 0.88 / 0.84,1.0 / 0.846 / 0.875,1.0 / 0.863 / 0.857


In [33]:
wine = datasets.load_wine()
test_dataset(wine.data, wine.target, name='wine', labels = [0, 1, 2])

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,wine,log,0.832,0.915 / 0.662 / 0.979,0.73 / 0.94 / 0.87,0.812 / 0.777 / 0.921
1,wine,svm,0.91,0.949 / 0.887 / 0.896,0.903 / 0.926 / 0.896,0.925 / 0.906 / 0.896
2,wine,5nn,0.61,0.898 / 0.634 / 0.229,0.803 / 0.608 / 0.289,0.848 / 0.621 / 0.256
3,wine,tree,0.837,0.949 / 0.732 / 0.854,0.812 / 0.867 / 0.837,0.875 / 0.794 / 0.845


In [9]:
imb_bc = pd.read_csv('imbalanced_breast_cancer.csv')
imb_bc_data = np.array(imb_bc.iloc[:,:-1])
imb_bc_label = np.array(imb_bc.iloc[:,-1])
# print(imb_bc.loc[imb_bc['label'] == 0].shape[0])
test_dataset(imb_bc_data, imb_bc_label, name='imb_bc', labels=[0, 1], normalize=True)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,imb_bc,log,0.974,0.474 / 1.0,1.0 / 0.973,0.643 / 0.986
1,imb_bc,svm,0.95,0.0 / 1.0,0 / 0.949,0 / 0.974
2,imb_bc,5nn,0.979,0.632 / 0.997,0.923 / 0.981,0.75 / 0.989
3,imb_bc,tree,0.968,0.789 / 0.978,0.652 / 0.989,0.714 / 0.983


In [10]:
imb_iris = pd.read_csv('imbalanced_iris.csv')
imb_iris_data = np.array(imb_iris.iloc[:,:-1])
imb_iris_label = np.array(imb_iris.iloc[:,-1])
test_dataset(imb_iris_data, imb_iris_label, name='imb_iris', labels=[0, 1, 2], normalize=False)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,imb_iris,log,0.886,0.0 / 0.94 / 0.92,0 / 0.839 / 0.939,0 / 0.887 / 0.929
1,imb_iris,svm,0.971,1.0 / 0.96 / 0.98,1.0 / 0.98 / 0.961,1.0 / 0.97 / 0.97
2,imb_iris,5nn,0.905,0.4 / 0.92 / 0.94,1.0 / 0.885 / 0.922,0.571 / 0.902 / 0.931
3,imb_iris,tree,0.895,0.6 / 0.92 / 0.9,1.0 / 0.868 / 0.918,0.75 / 0.893 / 0.909


In [31]:
imb_wine = pd.read_csv('imbalanced_wine.csv')
imb_wine_data = np.array(imb_wine.iloc[:,:-1])
imb_wine_label = np.array(imb_wine.iloc[:,-1])
print(imb_wine.loc[imb_wine['label'] == 0].shape[0])
print(imb_wine.loc[imb_wine['label'] == 1].shape[0])
print(imb_wine.loc[imb_wine['label'] == 2].shape[0])
test_dataset(imb_wine_data, imb_wine_label, name='imb_wine', labels=[0, 1, 2], normalize=False)

59
71
7


Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,imb_wine,log,0.89,0.949 / 0.873 / 0.571,0.875 / 0.912 / 0.8,0.91 / 0.892 / 0.666
1,imb_wine,svm,0.948,0.983 / 0.944 / 0.714,0.906 / 0.985 / 1.0,0.943 / 0.964 / 0.833
2,imb_wine,5nn,0.883,0.966 / 0.901 / 0.0,0.905 / 0.877 / 0.0,0.935 / 0.889 / 0
3,imb_wine,tree,0.883,0.915 / 0.873 / 0.714,0.844 / 0.912 / 1.0,0.878 / 0.892 / 0.833


In [13]:
bc_sm = pd.read_csv('smoted_bc.csv')
bc_sm_data = np.array(bc_sm.iloc[:,:-1])
bc_sm_label = np.array(bc_sm.iloc[:,-1])
test_dataset(bc_sm_data, bc_sm_label, name='smote_imb_bc', labels=[0, 1], normalize=True)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,smote_imb_bc,log,0.901,0.848 / 0.955,0.95 / 0.861,0.896 / 0.906
1,smote_imb_bc,svm,0.867,0.776 / 0.961,0.952 / 0.809,0.855 / 0.878
2,smote_imb_bc,5nn,0.909,0.886 / 0.933,0.93 / 0.89,0.907 / 0.911
3,smote_imb_bc,tree,0.923,0.884 / 0.964,0.961 / 0.891,0.921 / 0.926


In [14]:
iris_sm = pd.read_csv('smoted_iris.csv')
iris_sm_data = np.array(iris_sm.iloc[:,:-1])
iris_sm_label = np.array(iris_sm.iloc[:,-1])
test_dataset(iris_sm_data, iris_sm_label, name='smote_imb_iris', labels=[0, 1, 2], normalize=False)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,smote_imb_iris,log,0.955,1.0 / 0.92 / 0.94,1.0 / 0.939 / 0.922,1.0 / 0.929 / 0.931
1,smote_imb_iris,svm,0.974,1.0 / 0.94 / 0.98,1.0 / 0.979 / 0.942,1.0 / 0.959 / 0.961
2,smote_imb_iris,5nn,0.961,1.0 / 0.94 / 0.94,1.0 / 0.94 / 0.94,1.0 / 0.94 / 0.94
3,smote_imb_iris,tree,0.961,1.0 / 0.92 / 0.96,1.0 / 0.958 / 0.923,1.0 / 0.939 / 0.941


In [30]:
wine_sm = pd.read_csv('smoted_wine.csv')
wine_sm_data = np.array(wine_sm.iloc[:,:-1])
wine_sm_label = np.array(wine_sm.iloc[:,-1])
print(wine_sm.loc[wine_sm['label'] == 0].shape[0])
print(wine_sm.loc[wine_sm['label'] == 1].shape[0])
print(wine_sm.loc[wine_sm['label'] == 2].shape[0])
test_dataset(wine_sm_data, wine_sm_label, name='smote_imb_wine', labels=[0, 1, 2], normalize=False)

59
71
77


Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,smote_imb_wine,log,0.391,0.288 / 0.803 / 0.091,0.279 / 0.655 / 0.119,0.283 / 0.721 / 0.103
1,smote_imb_wine,svm,0.593,0.559 / 0.93 / 0.312,0.402 / 0.904 / 0.462,0.468 / 0.917 / 0.372
2,smote_imb_wine,5nn,0.447,0.492 / 0.845 / 0.052,0.322 / 0.8 / 0.095,0.389 / 0.822 / 0.067
3,smote_imb_wine,tree,0.568,0.661 / 0.915 / 0.182,0.429 / 0.833 / 0.368,0.52 / 0.872 / 0.244
