In [1]:
from sklearn import datasets, model_selection, linear_model, metrics, preprocessing
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
% matplotlib inline

In [8]:
def train_model(clf_name, data, labels, random_state=10, cv=4, normalize=False):
    cv = model_selection.ShuffleSplit(n_splits=cv,random_state=random_state)
    accs = []
    rcls = []
    prcs = []
    conf_matr = np.zeros((np.unique(labels).size, np.unique(labels).size))
    for train_inds, test_inds in cv.split(data):
        if clf_name == 'log':
            clf = linear_model.SGDClassifier(loss='log', random_state=random_state)    
        train_data = data[train_inds]
        train_labels = labels[train_inds]
        test_data = data[test_inds]
        test_labels = labels[test_inds]
        if normalize:
            normalizer = preprocessing.Normalizer().fit(train_data)
            train_data = normalizer.transform(train_data)
            test_data = normalizer.transform(test_data)
        clf.fit(train_data, train_labels)
        accs.append(clf.score(test_data, test_labels))
        conf_matr += metrics.confusion_matrix(y_true=test_labels, y_pred=clf.predict(test_data))
        rcls.append(metrics.recall_score(y_true=test_labels, y_pred=clf.predict(test_data), pos_label=0))
        prcs.append(metrics.precision_score(y_true=test_labels, y_pred=clf.predict(test_data), pos_label=0))  
    mean_acc = np.array(accs).mean()
    mean_prc = np.array(prcs).mean()
    mean_rcl = np.array(rcls).mean()  
    return mean_acc, mean_rcl, mean_prc, conf_matr

In [12]:
bc = datasets.load_breast_cancer()
acc, prc, rcl, conf_matr = train_model('log', bc.data, bc.target, normalize=True)
print 'Accuracy:', acc
print 'Precision:', prc
print 'Recall:', rcl
print 'Confusion matrix: ' 
print conf_matr

Accuracy: 0.8771929824561404
Precision: 0.7784679089026916
Recall: 0.9200396825396826
Confusion matrix: 
[[ 70.  20.]
 [  8. 130.]]


In [14]:
imb_bc = pd.read_csv('./imbalanced_datasets/imbalanced_breast_cancer.csv')
imb_bc_labels = np.array(imb_bc.label)
imb_bc_data = np.array(imb_bc.drop('label', axis=1))        
        
acc, prc, rcl, conf_matr = train_model('log', imb_bc_data, imb_bc_labels, normalize=True)
print 'Accuracy:', acc
print 'Precision:', prc
print 'Recall:', rcl
print 'Confusion matrix: ' 
print conf_matr

Accuracy: 0.9605263157894737
Precision: 0.125
Recall: 0.25
Confusion matrix: 
[[  1.   6.]
 [  0. 145.]]
