In [None]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from keras.wrappers.scikit_learn import KerasClassifier
import matplotlib.pyplot as plt
from sklearn.utils.fixes import signature
from sklearn.svm import SVC  
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, average_precision_score
import seaborn as sns
from keras.models import Sequential
from keras.layers import Dense, Dropout
import shap
from keras.metrics import binary_accuracy
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
from keras.callbacks import EarlyStopping
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Labels are read first, then the feature matrix.
# Other input files referenced by earlier revisions of this cell:
#   features: '../clean_data/x_no_index.csv', '../clean_data/x_with_lace.csv'
#   labels:   '../clean_data/y_equal_yes.csv'
annotations = pd.read_csv('../clean_data/y_more_no_df_clean.csv')
comments = pd.read_csv('../clean_data/x_with_lacefeatures.csv').drop(
    ['Unnamed: 0', 'Unnamed: 0.1'], axis='columns')

# Hold out 10% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    comments, annotations, test_size=0.1)

print(comments.shape, annotations.shape)
print(comments.columns.tolist())

In [None]:
def plot_pr_curve(y_test, y_score):
    """Plot a step-style precision-recall curve for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_score : array-like
        Scores for the positive class, as accepted by
        ``precision_recall_curve`` / ``average_precision_score``.

    The average precision shown in the title is computed from the arguments,
    so the plot never depends on a notebook-level ``average_precision``
    variable left behind by whichever model was evaluated last.
    """
    # Standard-library signature inspection; avoids the private sklearn shim.
    from inspect import signature as _signature

    precision, recall, _ = precision_recall_curve(y_test, y_score)
    average_precision = average_precision_score(y_test, y_score)

    # In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
    step_kwargs = ({'step': 'post'}
                   if 'step' in _signature(plt.fill_between).parameters
                   else {})
    plt.step(recall, precision, color='b', alpha=0.2,
             where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
              average_precision))
    plt.show()
    plt.close()

In [None]:
def create_confusion_matrix(name, y_test, y_pred):
    """Draw, save and display a confusion-matrix heatmap.

    Parameters
    ----------
    name : str
        Model name; used in the plot title and as the file name
        (``figures/<name>.png``).
    y_test : array-like
        True labels.
    y_pred : array-like supporting ``.round()``
        Predicted labels or scores; rounded before being compared with
        ``y_test``.

    The title shows ``name`` followed by the accuracy of the rounded
    predictions.
    """
    import os

    # Round once and reuse for both the accuracy and the matrix.
    y_pred_labels = y_pred.round()
    score = accuracy_score(y_test, y_pred_labels)
    cm = confusion_matrix(y_test, y_pred_labels)

    fig2 = plt.figure(figsize=(9, 9))
    sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = str(name) + ': ' + str(score)
    plt.title(all_sample_title, size=15)

    # Make sure the output directory exists, and save before showing so the
    # file is written regardless of how the backend treats shown figures.
    os.makedirs('figures', exist_ok=True)
    fig2.savefig('figures/' + str(name) + '.png')
    plt.show()
    # Release the figure; otherwise every call leaves one open in pyplot.
    plt.close(fig2)

In [None]:
# def create_model():
#     model = Sequential()
#     model.add(Dense(2000,input_dim=67,activation='relu', name='dense_1'))
# #     model.add(Dropout(0.8, name='dropout_1'))
#     model.add(Dense(1,activation='relu', name='dense_2'))
# #     model.add(Dropout(0.2, name='dropout_2'))
# #     model.add(Dense(1,activation='relu', name='dense_3'))
# #     model.add(Dropout(0.2, name='dropout_2'))
# #     model.add(Dense(1,activation='relu', name='dense_3'))
# #     model.add(Dense(1,activation='relu', name='dense_4'))
# #     model.add(Dense(1,activation='relu', name='dense_5'))
#     #need to optimize beta_1, beta_2, epsilon, decay, amsgrad
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model
def create_model(input_dim=70, hidden_units=32):
    """Build and compile a single-hidden-layer binary classifier.

    Parameters
    ----------
    input_dim : int, default 70
        Number of input features expected by the first layer.
    hidden_units : int, default 32
        Width of the ReLU hidden layer.

    Returns
    -------
    A compiled Keras ``Sequential`` model with one ReLU hidden layer
    (``dense_1``) and a single sigmoid output unit (``dense_5``), using
    binary cross-entropy loss, the Adam optimizer and the accuracy metric.

    Calling ``create_model()`` with no arguments builds the same network as
    before (70 inputs, 32 hidden units).
    """
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, activation='relu', name='dense_1'))
    model.add(Dense(1, activation='sigmoid', name='dense_5'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

from contextlib import redirect_stdout

from IPython.display import SVG
from keras.utils import plot_model
from keras.utils.vis_utils import model_to_dot

MODEL_NAME = 'Neural Network 1'

# Stop training when the training loss has not improved by at least 1e-4
# for 8 consecutive epochs.
earlystop = EarlyStopping(monitor='loss', min_delta=0.0001, patience=8,
                          verbose=1, mode='auto')
callbacks_list = [earlystop]

# Standardise the features, then train the Keras network.
clf = KerasClassifier(build_fn=create_model, epochs=50, batch_size=32, verbose=1,
                      callbacks=callbacks_list)
scaler = StandardScaler()
pipeline = Pipeline([('preprocess', scaler), ('clf', clf)])
# NOTE: Pipeline.fit returns the fitted pipeline itself, not a Keras History.
history = pipeline.fit(x_train, y_train)

# Use the network that was actually trained inside the pipeline for the
# summary and architecture plot, rather than a separate untrained instance.
model = pipeline.named_steps['clf'].model

# Keep continuous positive-class probabilities as the score (needed for
# average precision and for ROC/PR curves); threshold them for hard labels.
y_score_nn = pipeline.predict_proba(x_test)[:, 1]
y_pred_nn = (y_score_nn > 0.5).astype(int)

average_precision = average_precision_score(y_test, y_score_nn)
acc_score_nn = sklearn.metrics.accuracy_score(y_test, y_pred_nn)
create_confusion_matrix(MODEL_NAME, y_test, y_pred_nn)
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()
print(classification_report(y_test, y_pred_nn))
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))
print('Accuracy score: {0:0.2f}'.format(
      acc_score_nn))

# Write the textual summary next to the architecture plot, both named after
# MODEL_NAME (the path was previously the literal string 'MODEL_NAME.txt').
with open('NN_figures/' + MODEL_NAME + '.txt', 'w') as f:
    with redirect_stdout(f):
        model.summary()
plot_model(model, to_file='NN_figures/' + MODEL_NAME + '.png')
SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
# Prefer the standalone joblib package; sklearn.externals.joblib is a vendored
# copy that only exists in older scikit-learn releases, so fall back to it
# only when the standalone package is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Saving a model
joblib.dump(pipeline, 'nn_even_labels_model.sav')

In [None]:
def f_wrapper(X):
    """Return the notebook-level ``pipeline``'s predictions for X, flattened to 1-D."""
    predictions = pipeline.predict(X)
    return predictions.flatten()
# Summarise the training data with k-means (k=20) and use the result as the
# background data set for the kernel explainer.
X_train_summary = shap.kmeans(x_train, 20)
explainer = shap.KernelExplainer(f_wrapper, X_train_summary)
# x_train_sample is not used below; it is kept so the sequence of random
# draws (and therefore x_test_sample) is unchanged.
x_train_sample = x_train.sample(20)
x_test_sample = x_test.sample(20)
shap.initjs()
shap_values = explainer.shap_values(x_test_sample)
# Plot against the same rows the SHAP values were computed for.
shap.summary_plot(shap_values, x_test_sample, plot_type="bar")
# import eli5
# from eli5.sklearn import PermutationImportance
# perm = PermutationImportance(clf, random_state=0).fit(x_train,y_train)
# eli5.show_weights(perm, feature_names = comments.columns.tolist())

In [None]:
rf = RandomForestClassifier(max_depth=3, n_estimators=10)
rf.fit(x_train, y_train)

# Positive-class probabilities are the score; rounded copies are the labels.
y_score_rf = rf.predict_proba(x_test)[:, 1]
y_pred_rf = y_score_rf.round()

# Average precision is a ranking metric, so it is computed from the raw
# probabilities rather than from the rounded labels.
average_precision = average_precision_score(y_test, y_score_rf)
acc_score_rf = sklearn.metrics.accuracy_score(y_test, y_pred_rf)


create_confusion_matrix('Random Forest', y_test, y_pred_rf)
print(classification_report(y_test, y_pred_rf))
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))
print('Accuracy score: {0:0.2f}'.format(
      acc_score_rf))

In [None]:
classifier = SVC(kernel='linear')
classifier.fit(x_train, y_train)

# Hard class labels for accuracy / confusion matrix / classification report.
y_pred_svm = classifier.predict(x_test)
# Continuous decision values (signed distance to the separating hyperplane)
# for ranking metrics such as average precision and ROC/PR curves.
y_score_svm = classifier.decision_function(x_test)

average_precision = average_precision_score(y_test, y_score_svm)
acc_score_svm = sklearn.metrics.accuracy_score(y_test, y_pred_svm)

create_confusion_matrix('SVM', y_test, y_pred_svm)
print(classification_report(y_test, y_pred_svm))
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))
print('Accuracy score: {0:0.2f}'.format(
      acc_score_svm))

In [None]:
logisticRegr = LogisticRegression(penalty='l2', random_state=0)
logisticRegr.fit(x_train, y_train)

# Positive-class probabilities are the score (usable for ranking metrics and
# ROC/PR curves); rounding them gives the hard labels.
y_score_lr = logisticRegr.predict_proba(x_test)[:, 1]
y_pred_lr = y_score_lr.round()

average_precision = average_precision_score(y_test, y_score_lr)
acc_score_lr = sklearn.metrics.accuracy_score(y_test, y_pred_lr)

create_confusion_matrix('Logistic Regression', y_test, y_pred_lr)
print(classification_report(y_test, y_pred_lr))
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))
print('Accuracy score: {0:0.2f}'.format(
      acc_score_lr))

In [None]:
nb = GaussianNB()
nb.fit(x_train, y_train)

# Positive-class probabilities are the score; rounded copies are the labels.
y_score_nb = nb.predict_proba(x_test)[:, 1]
y_pred_nb = y_score_nb.round()

# Average precision is computed from the raw probabilities so the ranking
# information is not thrown away by rounding.
average_precision = average_precision_score(y_test, y_score_nb)
acc_score_nb = sklearn.metrics.accuracy_score(y_test, y_pred_nb)

create_confusion_matrix('Naive Bayes', y_test, y_pred_nb)
print(classification_report(y_test, y_pred_nb))
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))
print('Accuracy score: {0:0.2f}'.format(
      acc_score_nb))

In [None]:
# --- ROC curves for all five models ---------------------------------------
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, y_score_lr)
auc_lr = auc(fpr_lr, tpr_lr)
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_score_rf)
auc_rf = auc(fpr_rf, tpr_rf)
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, y_score_svm)
auc_svm = auc(fpr_svm, tpr_svm)
fpr_nb, tpr_nb, thresholds_nb = roc_curve(y_test, y_score_nb)
auc_nb = auc(fpr_nb, tpr_nb)
fpr_nn, tpr_nn, thresholds_nn = roc_curve(y_test, y_score_nn)
auc_nn = auc(fpr_nn, tpr_nn)

fig = plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.plot(fpr_lr, tpr_lr, label='LR (area = {:.3f})'.format(auc_lr))
plt.plot(fpr_svm, tpr_svm, label='SVM (area = {:.3f})'.format(auc_svm))
plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
plt.plot(fpr_nb, tpr_nb, label='NB (area = {:.3f})'.format(auc_nb))
# The NN legend entry reports the NN's own AUC (it previously showed auc_nb).
plt.plot(fpr_nn, tpr_nn, label='NN (area = {:.3f})'.format(auc_nn))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
fig.savefig('figures/initial_ROC.png')
plt.show()
plt.close(fig)

# --- Precision-recall curves for all five models --------------------------
precision_lr, recall_lr, _ = precision_recall_curve(y_test, y_score_lr)
average_precision_lr = average_precision_score(y_test, y_score_lr)
precision_nb, recall_nb, _ = precision_recall_curve(y_test, y_score_nb)
average_precision_nb = average_precision_score(y_test, y_score_nb)
precision_svm, recall_svm, _ = precision_recall_curve(y_test, y_score_svm)
average_precision_svm = average_precision_score(y_test, y_score_svm)
precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_score_rf)
average_precision_rf = average_precision_score(y_test, y_score_rf)
precision_nn, recall_nn, _ = precision_recall_curve(y_test, y_score_nn)
average_precision_nn = average_precision_score(y_test, y_score_nn)

# Draw on a dedicated figure so that the file saved below contains the PR
# plot (saving `fig` here would write the ROC figure again).
fig_pr = plt.figure(2)
plt.plot(recall_lr, precision_lr, label='LR (area = {:.3f})'.format(average_precision_lr))
plt.plot(recall_svm, precision_svm, label='SVM (area = {:.3f})'.format(average_precision_svm))
plt.plot(recall_rf, precision_rf, label='RF (area = {:.3f})'.format(average_precision_rf))
plt.plot(recall_nb, precision_nb, label='NB (area = {:.3f})'.format(average_precision_nb))
# The NN curve uses the NN's own precision/recall (it previously re-plotted NB).
plt.plot(recall_nn, precision_nn, label='NN (area = {:.3f})'.format(average_precision_nn))
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('PR curve')
plt.legend(loc='best')
fig_pr.savefig('figures/PR_AUC.png')
plt.show()
plt.close(fig_pr)