In [1]:
from sklearn import datasets, model_selection, linear_model, metrics, preprocessing, svm, neighbors, tree
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
% matplotlib inline

In [2]:
def train_model(clf_name, data, target, labels=[0, 1], random_state=5, cv=5, imbalanced_label=0, normalize=False):
    cv = model_selection.ShuffleSplit(n_splits=cv,random_state=random_state, test_size=1.0 / cv)
    accs = []
    rcls = []
    prcs = []
    conf_matr = np.zeros((np.unique(target).size, np.unique(target).size))
    for train_inds, test_inds in cv.split(data):
        if clf_name == 'log':
            clf = linear_model.LogisticRegression(random_state=random_state) 
        if clf_name == 'svm_lin':
            clf = svm.SVC(kernel = 'linear', random_state=random_state)
        if clf_name == '5nn':
            clf = neighbors.KNeighborsClassifier(n_neighbors=5)
        if clf_name == 'tree':
            clf = tree.DecisionTreeClassifier(max_depth=10, random_state=random_state)
        train_data = data[train_inds]
        train_labels = target[train_inds]
        test_data = data[test_inds]
        test_labels = target[test_inds]
        if normalize:
            normalizer = preprocessing.Normalizer().fit(train_data)
            train_data = normalizer.transform(train_data)
            test_data = normalizer.transform(test_data)
        clf.fit(train_data, train_labels)
        accs.append(clf.score(test_data, test_labels))
        conf_matr += metrics.confusion_matrix(y_true=test_labels, y_pred=clf.predict(test_data), labels=labels)
        rcls.append(metrics.recall_score(y_true=test_labels, y_pred=clf.predict(test_data), pos_label=imbalanced_label, labels=labels, average='weighted'))
        prcs.append(metrics.precision_score(y_true=test_labels, y_pred=clf.predict(test_data), pos_label=imbalanced_label, labels=labels, average='weighted'))  
    mean_acc = np.array(accs).mean()
    mean_prc = np.array(prcs).mean()
    mean_rcl = np.array(rcls).mean()  
    return mean_acc, mean_rcl, mean_prc, conf_matr

In [3]:
def test_dataset(data, target, labels=[0, 1], normalize=False):
    classifiers = ['log', 'svm_lin','5nn', 'tree']
    columns = ['classifier', 'algorithm', 'accuracy', 'precision', 'recall']
    df = pd.DataFrame(columns=columns)    
    for clf in classifiers:
        acc, prc, rcl, conf_matr = train_model(clf_name=clf, data=data, target=target, normalize=normalize, labels=labels)
        row = ['bc', clf, round(acc, 3), round(prc, 3), round(rcl, 3)]
        df.loc[classifiers.index(clf)] = row
    return df        

In [4]:
bc = datasets.load_breast_cancer()
test_dataset(bc.data, bc.target, labels=[0, 1], normalize=True)



Unnamed: 0,classifier,algorithm,accuracy,precision,recall
0,bc,log,0.76,0.76,0.828
1,bc,svm_lin,0.825,0.825,0.864
2,bc,5nn,0.932,0.932,0.933
3,bc,tree,0.96,0.96,0.96


In [5]:
iris = datasets.load_iris()
test_dataset(iris.data, iris.target, labels = [0, 1, 2], normalize=False)

Unnamed: 0,classifier,algorithm,accuracy,precision,recall
0,bc,log,0.88,0.88,0.906
1,bc,svm_lin,0.96,0.96,0.965
2,bc,5nn,0.947,0.947,0.957
3,bc,tree,0.927,0.927,0.948


In [6]:
wine = datasets.load_wine()
test_dataset(wine.data, wine.target, labels = [0, 1, 2], normalize=False)

Unnamed: 0,classifier,algorithm,accuracy,precision,recall
0,bc,log,0.944,0.944,0.949
1,bc,svm_lin,0.933,0.933,0.941
2,bc,5nn,0.7,0.7,0.714
3,bc,tree,0.85,0.85,0.872
