In [3]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
import pandas as pd

In [4]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one;
# several cells below rely on this to show more than one table.
InteractiveShell.ast_node_interactivity = "all"
# Render floats in pandas output with three digits of precision.
pd.set_option('display.precision', 3)

In [5]:
from graphviz import Digraph
from time import time
from datetime import timedelta

In [50]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [None]:
# Load the processed features and labels; the first CSV column becomes the row index.
X = pd.read_csv('../data/processed/X_train', index_col=0, header=0)
y = pd.read_csv('../data/processed/y_train', index_col=0, header=0)

# Hold out 20% of the rows, stratifying on the "target" column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y[["target"]],
)

# Sanity check: shapes of the four pieces, then the label means of each part.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(f"Test stratification is: {y_test.mean()}")
print(f"Train stratification is: {y_train.mean()}")

In [None]:
# Flatten the label frames into 1-D arrays
# (presumably they hold only the target column -- verify against the CSV).
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Metrics + Baseline
## Playing with Metrics from ML07 lab

In [None]:
# Scoreboard of cross-validated metrics: one row per model, filled in by later cells.
results_df = pd.DataFrame(
    columns=['F-score (cv)', 'Precision (cv)', 'Recall (cv)', 'Accuracy (cv)'],
    index=['Logistic Regression'],
)

In [None]:
def confusion(true, pred):
    """
    Build a square, labelled confusion matrix as a DataFrame.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, aligned with ``true`` by position.

    Returns
    -------
    pandas.DataFrame
        Counts with rows named 'target' (true labels) and columns named
        'predicted'. Both axes carry the same labels: the union of the labels
        seen in ``true`` and ``pred``. A class that was never predicted
        therefore shows up as an all-zero column instead of raising KeyError,
        and a class that was predicted but never occurs in ``true`` shows up
        as an all-zero row instead of being dropped.
    """
    pred = pd.Series(pred, name='predicted')
    true = pd.Series(true, name='target')

    # Positional alignment: discard whatever index the inputs carried.
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))

    # crosstab only emits labels that actually occur on each axis, so a model
    # that predicts a single class yields a one-column table. Re-index both
    # axes on the full label set to keep the matrix square.
    labels = cm.index.union(cm.columns)
    cm = cm.reindex(index=labels, columns=labels, fill_value=0)
    # The union drops the axis names; put them back.
    return cm.rename_axis(index='target', columns='predicted')

In [None]:
def graphMLP(vars,layers,intercepts):
    """
    Draw the weights of a multilayer perceptron as a left-to-right graphviz digraph.

    vars       -- names of the input features; they are the source nodes of layer 0
    layers     -- per-layer weight matrices indexed [source unit, target unit]
                  (the notebook passes MLPClassifier.coefs_)
    intercepts -- per-layer bias vectors, one entry per target unit
                  (the notebook passes MLPClassifier.intercepts_)

    Returns the Digraph. Unit j of layer i is node 'LiNj'; every edge is
    labelled with its weight, and each layer gets a double-circled bias node.
    """
    graph = Digraph('')
    graph.attr(rankdir='LR')

    for depth, weights in enumerate(layers):
        # Work out where this layer's incoming edges start and what its bias node is called.
        if depth == 0:
            sources = vars
            bias_node = 'ILI'
        else:
            sources = ['L%dN%d' % (depth - 1, unit)
                       for unit in range(layers[depth - 1].shape[1])]
            bias_node = 'L%dI' % (depth - 1)

        # One weighted edge from every source into every unit of this layer.
        for target in range(weights.shape[1]):
            target_node = 'L%dN%d' % (depth, target)
            for source, source_node in enumerate(sources):
                graph.edge(source_node, target_node, label=str(weights[source, target]))

        # Bias node plus one edge per unit carrying that unit's intercept.
        graph.node(bias_node, shape='doublecircle')
        bias = intercepts[depth]
        for target in range(bias.shape[0]):
            graph.edge(bias_node, 'L%dN%d' % (depth, target), label=str(bias[target]))

    return graph


In [None]:
# Baseline: logistic regression with default hyper-parameters, fitted on the full training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
scores = cross_val_score(logreg, X_train, y_train, cv=5)
y_pred_lr = logreg.predict(X_train)

# 5-fold cross-validation on four metrics; the resulting frame has one row per fold.
cross_val_results = pd.DataFrame(
    cross_validate(
        logreg,
        X_train,
        y_train,
        cv=5,
        scoring=['f1_macro', 'precision_macro', 'recall_macro', 'accuracy'],
    )
)

# Average over the folds and store the four means in this model's scoreboard row.
results_df.loc['Logistic Regression', :] = (
    cross_val_results[['test_f1_macro', 'test_precision_macro', 'test_recall_macro', 'test_accuracy']]
    .mean()
    .to_numpy()
)

# Train-set confusion matrix, then the scoreboard so far.
confusion(y_train, y_pred_lr)
results_df

In [None]:
# Second baseline: a small random forest (50 trees, depth capped at 4).
rf = RandomForestClassifier(n_estimators=50, random_state=42,max_depth=4)
rf.fit(X_train, y_train)
scores = cross_val_score(rf, X_train, y_train, cv=5)
# Keep the forest's predictions under their own name: this cell used to write
# them into `y_pred_lr`, overwriting the logistic-regression predictions.
y_pred_rf = rf.predict(X_train)

# 5-fold cross-validation on four metrics; one row per fold.
cross_val_results = pd.DataFrame(cross_validate(rf , X_train, y_train, cv = 5,
                                                scoring = [ 'f1_macro', 'precision_macro', 'recall_macro', 'accuracy'] ))

# Average over the folds and store the four means in this model's scoreboard row.
results_df.loc['Random Forest',:] = cross_val_results[['test_f1_macro',
       'test_precision_macro', 'test_recall_macro','test_accuracy']].mean().values

# Train-set confusion matrix, then the scoreboard so far.
confusion(y_train,y_pred_rf)
results_df

# Neural Network Model

In [None]:
# Smallest possible network: one hidden layer with a single logistic unit, no L2 penalty.
model_nnet = MLPClassifier(
    hidden_layer_sizes=[1],
    alpha=0,
    activation='logistic',
    max_iter=200,
    solver='lbfgs',
    random_state=42,
)
model_nnet.fit(X_train, y_train);

y_pred = model_nnet.predict(X_train)

print('Confusion matrix of the TRAIN dataset')
confusion(y_train, y_pred)

# 5-fold cross-validation on four metrics; one row per fold.
cross_val_results = pd.DataFrame(
    cross_validate(
        model_nnet,
        X_train,
        y_train,
        cv=5,
        scoring=['f1_macro', 'precision_macro', 'recall_macro', 'accuracy'],
    )
)

# Average over the folds and store the four means in this model's scoreboard row.
results_df.loc['MLP[1]', :] = (
    cross_val_results[['test_f1_macro', 'test_precision_macro', 'test_recall_macro', 'test_accuracy']]
    .mean()
    .to_numpy()
)

results_df.sort_values(by='F-score (cv)', ascending=False)

In [None]:
# Inspect the fitted network: per-layer weight matrices, then per-layer bias vectors.
# Both values are shown only because ast_node_interactivity is set to "all".
model_nnet.coefs_
model_nnet.intercepts_

In [None]:
# Hand-built train-set confusion table for the current `y_pred`.
# Unlike confusion(), the columns are left in crosstab order; no re-indexing
# by the row labels is applied here.
true = pd.Series(y_train, name='target')
pred = pd.Series(y_pred, name='predicted')

cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
cm

In [None]:
# Deeper network: three hidden layers of two logistic units each, no L2 penalty.
model_nnet = MLPClassifier(hidden_layer_sizes=[2,2,2],
                           alpha=0,
                           activation='logistic',
                           max_iter=1000,
                           solver='lbfgs',random_state=42)
model_nnet.fit(X_train,y_train);
y_pred = model_nnet.predict(X_train)

# 5-fold cross-validation on four metrics; one row per fold.
cross_val_results = pd.DataFrame(cross_validate(model_nnet , X_train, y_train, cv = 5,
                                                scoring = [ 'f1_macro', 'precision_macro', 'recall_macro', 'accuracy'] ))

# The row label now matches the architecture fitted above (it used to read 'MLP[2,2]').
results_df.loc['MLP[2,2,2]',:] = cross_val_results[['test_f1_macro',
       'test_precision_macro', 'test_recall_macro','test_accuracy']].mean().values

results_df.sort_values(by='F-score (cv)', ascending=False)

print('Confusion matrix of the TRAIN dataset')
true = pd.Series(y_train, name='target')
pred = pd.Series(y_pred, name='predicted')
cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))

# NOTE: this model has been observed to predict a single class for every row.
# crosstab then has one column only, and the former `cm[cm.index]` failed with
# KeyError on the missing label. Re-indexing both axes on the union of labels
# keeps the matrix square and shows the never-predicted class as a zero column.
# TODO: find out why the predictions collapse -- possibly the very narrow
# logistic layers saturate; verify before drawing conclusions from this row.
labels = cm.index.union(cm.columns)
cm = (cm.reindex(index=labels, columns=labels, fill_value=0)
        .rename_axis(index='target', columns='predicted'))
cm

In [None]:
# Feature names are taken from X, the frame the model's inputs were derived from.
# The `dataset` frame this call used to reference is never created in this
# notebook, so the cell failed with NameError on a fresh kernel.
graphMLP(X.columns, model_nnet.coefs_, model_nnet.intercepts_)

In [None]:
# Same three-layer architecture, now with a small L2 penalty (alpha=0.001).
model_nnet = MLPClassifier(hidden_layer_sizes=[2,2,2],
                           alpha=0.001,
                           activation='logistic',
                           max_iter=1000,
                           solver='lbfgs',random_state=42)
model_nnet.fit(X_train,y_train);
# Re-predict with the model fitted above; without this line the confusion
# matrix below would show the predictions of the previously fitted network.
y_pred = model_nnet.predict(X_train)

print('Confusion matrix of the TRAIN dataset')
confusion(y_train,y_pred)

# 5-fold cross-validation on four metrics; one row per fold.
cross_val_results = pd.DataFrame(cross_validate(model_nnet , X_train, y_train, cv = 5,
                                                scoring = [ 'f1_macro', 'precision_macro', 'recall_macro', 'accuracy'] ))

# Average over the folds and store the four means in this model's scoreboard row.
results_df.loc['MLP[2,2,2]-alpha=0.001',:] = cross_val_results[['test_f1_macro',
       'test_precision_macro', 'test_recall_macro','test_accuracy']].mean().values

results_df.sort_values(by='F-score (cv)', ascending=False)

In [None]:
# Feature names are taken from X, the frame the model's inputs were derived from.
# The `dataset` frame this call used to reference is never created in this
# notebook, so the cell failed with NameError on a fresh kernel.
graphMLP(X.columns, model_nnet.coefs_, model_nnet.intercepts_)

# Tuning Parameters
A grid search over the hidden-layer sizes and the L2 penalty `alpha`, scored with 10-fold cross-validation.

In [None]:
# Candidate architectures: widths 2, 4, 6, 8 with one, two or three hidden layers.
# Single-layer candidates are plain ints, deeper ones are lists such as [4, 4].
sizes = [
    width if depth == 1 else [width] * depth
    for depth in (1, 2, 3)
    for width in range(2, 10, 2)
]

len(sizes), sizes


# Candidate L2 penalties (the `alpha` values of the grid).
decays = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.5, 1]
len(decays), decays[:10]

In [None]:
init_time = time()

# Template estimator; `alpha` and `hidden_layer_sizes` are overridden by the grid.
model_nnet = MLPClassifier(
    alpha=0,
    activation='logistic',
    max_iter=500,
    solver='lbfgs',
    random_state=42,
)

# 10-fold search over every (architecture, penalty) pair. All four metrics are
# recorded; the best macro-F1 configuration is refitted on the whole training split.
trc = GridSearchCV(
    model_nnet,
    {'hidden_layer_sizes': sizes, 'alpha': decays},
    scoring=['f1_macro', 'precision_macro', 'recall_macro', 'accuracy'],
    cv=10,
    refit='f1_macro',
    return_train_score=True,
)

model_10CV = trc.fit(X_train, y_train)

# Wall-clock time the search took.
print(timedelta(seconds=(time() - init_time)))

In [None]:
# Top five grid points by mean cross-validated macro-F1, with their spread across folds.
(
    pd.DataFrame(model_10CV.cv_results_)[[
        'param_alpha',
        'param_hidden_layer_sizes',
        'mean_test_f1_macro',
        'std_test_f1_macro',
        'mean_test_precision_macro',
        'std_test_precision_macro',
        'mean_test_accuracy',
        'std_test_accuracy',
    ]]
    .sort_values(by='mean_test_f1_macro', ascending=False)
    .head()
)

In [None]:
# Winning hyper-parameters and their mean cross-validated score on the refit
# metric ('f1_macro'). Both values are shown only because
# ast_node_interactivity is set to "all".
model_10CV.best_params_
model_10CV.best_score_

In [None]:
# Re-score the tuned model with the same 5-fold protocol used for every other scoreboard row.
cross_val_results = pd.DataFrame(cross_validate(model_10CV.best_estimator_ , X_train, y_train, cv = 5, scoring = [ 'f1_macro', 'precision_macro', 'recall_macro', 'accuracy'] ))

# Build the row label from what the search actually selected instead of a
# hard-coded 'MLP[2,2]-alpha=0.00001', which is wrong whenever another grid
# point wins. The suffix keeps the row distinct from the hand-tuned models above.
best_params = model_10CV.best_params_
best_sizes = best_params['hidden_layer_sizes']
if not isinstance(best_sizes, (list, tuple)):
    # Single-layer candidates are stored in the grid as a bare int.
    best_sizes = [best_sizes]
best_label = 'MLP[{}]-alpha={} (grid search)'.format(
    ','.join(str(size) for size in best_sizes), best_params['alpha'])

# Average over the folds and store the four means in the tuned model's row.
results_df.loc[best_label,:] = cross_val_results[['test_f1_macro',
       'test_precision_macro', 'test_recall_macro','test_accuracy']].mean().values

results_df.sort_values(by='F-score (cv)', ascending=False)

In [None]:
# Final check on the held-out split, using the grid search's refitted best model.
y_pred = model_10CV.predict(X_test)

confusion(true=y_test, pred=y_pred)

print(classification_report(y_true=y_test, y_pred=y_pred))