In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Fix the seed of NumPy's legacy global random number generator.
np.random.seed(42)

In [2]:
def get_cm(y_test, y_pred):
    """Return the confusion matrix normalised per true class.

    Each row of the result corresponds to one true label and holds the
    fraction of that label's samples that was assigned to each predicted
    label, so every non-empty row sums to 1 (up to rounding).

    Parameters
    ----------
    y_test : array-like
        True labels; passed straight to ``confusion_matrix``.
    y_pred : array-like
        Predicted labels; passed straight to ``confusion_matrix``.

    Returns
    -------
    numpy.ndarray
        Float matrix rounded to 4 decimals. A row whose true label has no
        samples is returned as zeros rather than NaN.
    """
    counts = np.asarray(confusion_matrix(y_test, y_pred), dtype=float)

    # keepdims=True keeps the totals as a column vector, so row i is divided
    # by its own total. A flat vector would broadcast along the columns and
    # divide entry (i, j) by the total of row j, which is only correct when
    # every class has the same number of samples.
    row_totals = counts.sum(axis=1, keepdims=True)

    # Skip the division for empty rows; they keep the zeros from `out`.
    cm = np.divide(counts, row_totals,
                   out=np.zeros_like(counts), where=row_totals != 0)

    # np.round is used because the np.round_ alias was removed in NumPy 2.0.
    return np.round(cm, decimals=4)

In [3]:
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load files that come from a trusted source.

# Training sample: features as loaded, labels flattened to a 1-D array.
X_train = pd.read_pickle('../pickled_data-UCI/sample_1/df_sample_data.p')
y_train = pd.read_pickle('../pickled_data-UCI/sample_1/df_sample_labels.p').values.ravel()

# Validation split used as the test set.
# NOTE(review): y_test is kept exactly as loaded (not flattened like
# y_train) -- confirm that downstream consumers expect that.
X_test = pd.read_pickle('../pickled_data-UCI/madelon_valid_data.p')
y_test = pd.read_pickle('../pickled_data-UCI/madelon_valid_labels.p')

In [4]:
# Baseline pipelines: every model sees standardised features, and
# random_state is pinned to 42 on each estimator that is given one.

# C is the inverse regularisation strength in scikit-learn, so C=1e9 leaves
# the logistic regression close to unregularised.
logreg_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(C=1e9, random_state=42)),
])

dtc_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('dtc', DecisionTreeClassifier(random_state=42)),
])

knc_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knc', KNeighborsClassifier(n_neighbors=20)),
])

# Linear-kernel SVM, also with a very large C.
svc_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC(C=1e9, kernel='linear', random_state=42)),
])

In [5]:
# Column-oriented accumulator for the benchmark table: one empty list per
# column; the numeric prefixes in the labels spell out the column order.
dict_benchmark = {
    column: []
    for column in ('1) model', '2) train_score', '3) test_score', '4) confusion_matrix')
}

In [6]:
# Models to benchmark, paired with the label shown in the results table.
benchmark_models = [
    ('logistic regression', logreg_pipe),
    ('decision tree classifier', dtc_pipe),
    ('kneighbors classifier', knc_pipe),
    ('support vector classifier', svc_pipe),
]

# Empty the columns in place first, so that re-running this cell rebuilds
# the table instead of appending a second copy of every row.
for column_values in dict_benchmark.values():
    column_values.clear()

for model_name, my_pipe in benchmark_models:
    # Fit on the training sample, then score on both splits.
    my_pipe.fit(X_train, y_train)
    train_score = my_pipe.score(X_train, y_train)
    test_score = my_pipe.score(X_test, y_test)

    # Row-normalised confusion matrix on the test split.
    y_pred = my_pipe.predict(X_test)
    cm = get_cm(y_test, y_pred)

    dict_benchmark['1) model'].append(model_name)
    dict_benchmark['2) train_score'].append(train_score)
    dict_benchmark['3) test_score'].append(test_score)
    dict_benchmark['4) confusion_matrix'].append(cm)

# Last expression of the cell: rendered as the benchmark table.
pd.DataFrame(dict_benchmark)

Unnamed: 0,1) model,2) train_score,3) test_score,4) confusion_matrix
0,logistic regression,1.0,0.568333,"[[0.57, 0.43], [0.4333, 0.5667]]"
1,decision tree classifier,1.0,0.701667,"[[0.7067, 0.2933], [0.3033, 0.6967]]"
2,kneighbors classifier,0.642,0.595,"[[0.6, 0.4], [0.41, 0.59]]"
3,support vector classifier,1.0,0.565,"[[0.57, 0.43], [0.44, 0.56]]"
