In [13]:
from sklearn import datasets, model_selection, linear_model, metrics, preprocessing, svm, neighbors, tree
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
% matplotlib inline
import math

In [14]:
import warnings
warnings.simplefilter('ignore')

In [15]:
def get_prc_rcl(conf_matrix):
    prcs = ''
    rcls = ''
    f_meas = ''
    for i in range(conf_matrix.shape[0]):
        tmp_prc = round(conf_matrix[i][i] * 1.0 / sum(conf_matrix[i]), 3)
        tmp_rcl = round(conf_matrix[i][i] * 1.0 / sum(conf_matrix[:, i]), 3)
        if math.isnan(tmp_prc):
            tmp_prc = 0
        if math.isnan(tmp_rcl):
            tmp_rcl = 0    
        if tmp_prc == 0 and tmp_rcl == 0:
            tmp_f_meas = 0
        else:
            tmp_f_meas = round(2 * (tmp_prc * tmp_rcl/(tmp_prc + tmp_rcl)), 3)
        prcs += str(tmp_prc) 
        rcls += str(tmp_rcl) 
        f_meas += str(tmp_f_meas)
        if i < conf_matrix.shape[0] - 1:
            prcs += ' / '
            rcls += ' / '
            f_meas += ' / '
    return prcs, rcls, f_meas

In [34]:
def train_model(clf_name, data, target, labels=[0, 1], random_state=15, cv=5, standard=True):
    cv = model_selection.KFold(n_splits=cv,random_state=random_state)
    accs = []
    rcls = []
    prcs = []
    conf_matr = np.zeros((np.unique(target).size, np.unique(target).size))
    for train_inds, test_inds in cv.split(data):
        if clf_name == 'log':
            clf = linear_model.SGDClassifier(random_state=random_state, n_iter=10000)
        if clf_name == 'svm':
            clf = svm.SVC(kernel='linear', random_state=random_state)
        if clf_name == '5nn':
            clf = neighbors.KNeighborsClassifier(n_neighbors=5)
        if clf_name == 'tree':
            clf = tree.DecisionTreeClassifier(max_depth=10, random_state=random_state)
        train_data = data[train_inds]
        train_labels = target[train_inds]
        test_data = data[test_inds]
        test_labels = target[test_inds]
        if standard:
            scaler = preprocessing.MinMaxScaler().fit(train_data)
            train_data = scaler.transform(train_data)
            test_data = scaler.transform(test_data)
        clf.fit(train_data, train_labels)
        accs.append(clf.score(test_data, test_labels))
        conf_matr += metrics.confusion_matrix(y_true=test_labels, y_pred=clf.predict(test_data), labels=labels)
    mean_acc = np.array(accs).mean()
    prcs, rcls, f_meas = get_prc_rcl(conf_matrix=conf_matr)  
    return mean_acc, prcs, rcls, f_meas, conf_matr

In [35]:
def test_dataset(data, target, name, labels=[0, 1], standard=True):
    classifiers = ['log', 'svm', '5nn', 'tree']
    columns = ['dataset', 'algorithm', 'accuracy', 'precision', 'recall', 'F-measure']
    df = pd.DataFrame(columns=columns)    
    for clf in classifiers:
        acc, prcs, rcls, f_meas, conf_matr = train_model(clf_name=clf, data=data, target=target, standard=standard, labels=labels)
        row = [name, clf, round(acc, 3), prcs, rcls, f_meas]
        df.loc[classifiers.index(clf)] = row
    return df        

In [36]:
bc = datasets.load_breast_cancer()
test_dataset(bc.data, bc.target, name = 'bc', labels=[0, 1], standard=True)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,bc,log,0.968,0.943 / 0.983,0.971 / 0.967,0.957 / 0.975
1,bc,svm,0.975,0.943 / 0.994,0.99 / 0.967,0.966 / 0.98
2,bc,5nn,0.961,0.925 / 0.983,0.97 / 0.956,0.947 / 0.969
3,bc,tree,0.905,0.896 / 0.91,0.856 / 0.937,0.876 / 0.923


In [37]:
iris = datasets.load_iris()
test_dataset(iris.data, iris.target, name = 'iris', labels = [0, 1, 2], standard=True)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,iris,log,0.847,0.98 / 0.8 / 0.76,1.0 / 0.755 / 0.792,0.99 / 0.777 / 0.776
1,iris,svm,0.807,1.0 / 0.7 / 0.72,1.0 / 0.714 / 0.706,1.0 / 0.707 / 0.713
2,iris,5nn,0.913,1.0 / 0.9 / 0.84,1.0 / 0.849 / 0.894,1.0 / 0.874 / 0.866
3,iris,tree,0.907,1.0 / 0.92 / 0.8,1.0 / 0.821 / 0.909,1.0 / 0.868 / 0.851


In [38]:
imb_bc = pd.read_csv('./imbalanced_datasets/imbalanced_breast_cancer.csv')
imb_bc_data = np.array(imb_bc.iloc[:,:-1])
imb_bc_label = np.array(imb_bc.iloc[:,-1])
# print(imb_bc.loc[imb_bc['label'] == 0].shape[0])
test_dataset(imb_bc_data, imb_bc_label, name='imb_bc', labels=[0, 1], standard=True)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,imb_bc,log,0.992,0.895 / 0.997,0.944 / 0.994,0.919 / 0.995
1,imb_bc,svm,0.987,0.737 / 1.0,1.0 / 0.986,0.849 / 0.993
2,imb_bc,5nn,0.979,0.579 / 1.0,1.0 / 0.978,0.733 / 0.989
3,imb_bc,tree,0.981,0.842 / 0.989,0.8 / 0.992,0.82 / 0.99


In [39]:
imb_iris = pd.read_csv('./imbalanced_datasets/imbalanced_iris.csv')
imb_iris_data = np.array(imb_iris.iloc[:,:-1])
imb_iris_label = np.array(imb_iris.iloc[:,-1])
test_dataset(imb_iris_data, imb_iris_label, name='imb_iris', labels=[0, 1, 2], standard=True)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,imb_iris,log,0.905,0.0 / 0.94 / 0.96,0 / 0.87 / 0.941,0 / 0.904 / 0.95
1,imb_iris,svm,0.933,0.8 / 0.96 / 0.92,1.0 / 0.906 / 0.958,0.889 / 0.932 / 0.939
2,imb_iris,5nn,0.895,0.4 / 0.94 / 0.9,1.0 / 0.855 / 0.938,0.571 / 0.895 / 0.919
3,imb_iris,tree,0.905,1.0 / 0.92 / 0.88,1.0 / 0.885 / 0.917,1.0 / 0.902 / 0.898


In [40]:
bc_sm = pd.read_csv('./smote_datasets/smote_bc.csv')
bc_sm_data = np.array(bc_sm.iloc[:,:-1])
bc_sm_label = np.array(bc_sm.iloc[:,-1])
test_dataset(bc_sm_data, bc_sm_label, name='smote_imb_bc', labels=[0, 1], standard=True)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,smote_imb_bc,log,0.964,0.944 / 0.983,0.982 / 0.949,0.963 / 0.966
1,smote_imb_bc,svm,0.953,0.915 / 0.989,0.987 / 0.924,0.95 / 0.955
2,smote_imb_bc,5nn,0.948,0.918 / 0.978,0.975 / 0.926,0.946 / 0.951
3,smote_imb_bc,tree,0.956,0.991 / 0.922,0.924 / 0.991,0.956 / 0.955


In [41]:
iris_sm = pd.read_csv('./smote_datasets/smote_iris.csv')
iris_sm_data = np.array(iris_sm.iloc[:,:-1])
iris_sm_label = np.array(iris_sm.iloc[:,-1])
test_dataset(iris_sm_data, iris_sm_label, name='smote_imb_iris', labels=[0, 1, 2], standard=True)

Unnamed: 0,dataset,algorithm,accuracy,precision,recall,F-measure
0,smote_imb_iris,log,0.913,1.0 / 0.88 / 0.86,0.98 / 0.863 / 0.896,0.99 / 0.871 / 0.878
1,smote_imb_iris,svm,0.947,1.0 / 0.98 / 0.86,1.0 / 0.875 / 0.977,1.0 / 0.925 / 0.915
2,smote_imb_iris,5nn,0.96,1.0 / 0.96 / 0.92,1.0 / 0.923 / 0.958,1.0 / 0.941 / 0.939
3,smote_imb_iris,tree,0.947,1.0 / 0.9 / 0.94,1.0 / 0.938 / 0.904,1.0 / 0.919 / 0.922
