In [None]:
#setup imports and paths
import os
import sys
from os.path import expanduser


HOME_DIR = expanduser("~")
sys.path.append(HOME_DIR+'/packages')

In [None]:
#load custom Midas tools
from Midas import Midas_helper
helper = Midas_helper()

In [None]:
#change to class data folder
helper.cd_main_data()
os.listdir()

In [None]:
import pandas as pd
#load main Midas labelled data table
df = pd.read_csv('midas_labeled_data_Q12018.csv')

In [None]:
df.head()

In [None]:
df.columns

# Deep Neural Network

In [None]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
 
def clean_note(text):
    """Return ``text`` with HTML tags blanked out and all double quotes removed.

    Each ``<...>`` tag is replaced by a single space; backslash-escaped
    quotes (``\\"``) and plain double quotes are deleted.
    """
    without_tags = re.sub('<[^<]+?>', ' ', text)
    # Escaped quotes go first so that no orphaned backslash is left behind
    # once the plain quotes are stripped.
    return without_tags.replace('\\"', '').replace('"', '')
 
# df = pd.read_csv('labeledTrainData.tsv', sep='\t', quoting=3)
# Assign the filled column back to the frame: fillna(inplace=True) on a
# column selection is chained assignment, which pandas does not reliably
# write through to df (and newer versions never do).
df['midas_final_unstructured'] = df['midas_final_unstructured'].fillna('No Score')
# df.dropna(inplace=True)
df['cleaned_note_unstructured'] = df['cleaned_note_unstructured'].apply(clean_note)
# Integer class id per row, numbered in order of first appearance of each label.
df['category_id'] = df['midas_final_unstructured'].factorize()[0]

 

In [None]:
# train test split
# Split 80/20 with a fixed seed, then renumber every part from 0.
# train_test_split keeps the original (shuffled) row labels, so without the
# reset a lookup such as X_test[idx] / y_test[idx] is by label rather than by
# row and does not line up with the rows of the vectorized matrices.
# (Important for LIME to work.)
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df['cleaned_note_unstructured'],
                                 df['category_id'], test_size=0.2, random_state=2019)
)

In [None]:
# Lookup table with one row per distinct label, indexed by its category_id.
midas_final_unstructured_id_df = (
    df[['midas_final_unstructured', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
    .set_index('category_id')
)

# Plain {category_id: label} mapping used to decode model outputs.
class_dict = midas_final_unstructured_id_df['midas_final_unstructured'].to_dict()
class_dict

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Word count of the longest training note; also reused below as the vocabulary cap.
MAX_SEQ_LENGTH = max(len(note.split()) for note in X_train)
vectorizer = CountVectorizer(stop_words=stopwords.words('english'),
                             lowercase=True, min_df=3, max_df=0.9, max_features=MAX_SEQ_LENGTH)
# Learn the vocabulary from the training notes only...
X_train_onehot = vectorizer.fit_transform(X_train)
# ...and merely project the test notes onto it. Calling fit_transform here
# would refit the vocabulary on the test set, so the columns the models are
# trained on would no longer match what `vectorizer` produces afterwards.
X_test_onehot = vectorizer.transform(X_test)
MAX_SEQ_LENGTH

# DNN - model 1

In [None]:
from keras.models import Sequential
from keras.layers import Dense
 
# Model 1: one hidden ReLU layer on top of the bag-of-words counts.
model = Sequential()
 
# Hidden width reuses MAX_SEQ_LENGTH; the input width is the vectorizer's vocabulary size.
model.add(Dense(units=MAX_SEQ_LENGTH, activation='relu', input_dim=len(vectorizer.get_feature_names())))
# One softmax unit per class. NOTE(review): 6 is hard-coded -- confirm it equals len(class_dict).
model.add(Dense(units=6, activation='softmax'))
# The sparse loss takes the integer category_id labels as-is (no one-hot targets needed).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Fit on all but the last 100 training rows and use those 100 rows as the
# validation set. This is the only module-level assignment to `history`.
history = model.fit(X_train_onehot[:-100], y_train[:-100], 
          epochs=2, batch_size=128, verbose=1, 
          validation_data=(X_train_onehot[-100:], y_train[-100:]))
 

In [None]:
scores = model.evaluate(vectorizer.transform(X_test), y_test, verbose=1)
print("Accuracy:", scores[1])  

# DNN - model 2

In [None]:
from keras.layers import Dropout
# Model 2: same input layer as model 1, followed by a funnel of dense ReLU
# layers (2048 -> 64) and no dropout.
# NOTE(review): no cell visible in this notebook fits this model before
# `model` is rebound by the next architecture.
model = Sequential()
 
model.add(Dense(units=MAX_SEQ_LENGTH, activation='relu', input_dim=len(vectorizer.get_feature_names())))
model.add(Dense(units=int(2048), activation='relu'))
model.add(Dense(units=int(1024), activation='relu'))
model.add(Dense(units=int(512), activation='relu'))
model.add(Dense(units=int(256), activation='relu'))
model.add(Dense(units=int(128), activation='relu'))
model.add(Dense(units=int(64), activation='relu'))
# One softmax unit per class (6 is hard-coded).
model.add(Dense(units=6, activation='softmax'))
# The sparse loss takes the integer category_id labels as-is (no one-hot targets needed).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:

def dnn_train(model, epoch):
    """Fit ``model`` on the bag-of-words training data and report test accuracy.

    The last 100 training rows are held out as the validation set. After
    fitting, the model is evaluated on the vectorized ``X_test`` and the
    accuracy is printed.

    Parameters
    ----------
    model : compiled Keras model
        Must have been compiled with an accuracy metric, since ``scores[1]`` is printed.
    epoch : int
        Number of training epochs.

    Returns
    -------
    The Keras ``History`` object from ``model.fit``. Assign it
    (``history = dnn_train(model, 6)``) to plot the curves of this run;
    previously it was discarded when the function returned.
    """
    history = model.fit(X_train_onehot[:-100], y_train[:-100],
                        epochs=epoch, batch_size=128, verbose=1,
                        validation_data=(X_train_onehot[-100:], y_train[-100:]))
    scores = model.evaluate(vectorizer.transform(X_test), y_test, verbose=1)
    print("Accuracy:", scores[1])
    return history

# DNN - model 3

In [None]:
from keras.layers import Dropout
# Model 3: the dense funnel (2048 -> 64) with Dropout(0.2) inserted after the
# 2048-, 512- and 128-unit layers.
model = Sequential()
 
model.add(Dense(units=MAX_SEQ_LENGTH, activation='relu', input_dim=len(vectorizer.get_feature_names())))
model.add(Dense(units=int(2048), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=int(1024), activation='relu'))
model.add(Dense(units=int(512), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=int(256), activation='relu'))
model.add(Dense(units=int(128), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=int(64), activation='relu'))
# One softmax unit per class (6 is hard-coded).
model.add(Dense(units=6, activation='softmax'))
# The sparse loss takes the integer category_id labels as-is (no one-hot targets needed).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
dnn_train(model, 3)

# DNN - model 4

In [None]:
from keras.layers import Dropout
# Model 4: a fresh, untrained instance of the same dropout architecture as
# model 3 (the layer stack below is identical); only the epoch count passed to
# dnn_train in the next cell differs.
model = Sequential()
 
model.add(Dense(units=MAX_SEQ_LENGTH, activation='relu', input_dim=len(vectorizer.get_feature_names())))
model.add(Dense(units=int(2048), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=int(1024), activation='relu'))
model.add(Dense(units=int(512), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=int(256), activation='relu'))
model.add(Dense(units=int(128), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=int(64), activation='relu'))
# One softmax unit per class (6 is hard-coded).
model.add(Dense(units=6, activation='softmax'))
# The sparse loss takes the integer category_id labels as-is (no one-hot targets needed).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
dnn_train(model, 6)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Older matplotlib ships this style as 'seaborn'; newer releases renamed it to 'seaborn-v0_8'.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
# Keras records accuracy as 'acc'/'val_acc' in older releases and as
# 'accuracy'/'val_accuracy' in newer ones; use whichever this run logged.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
# NOTE: `history` is whatever model.fit result was last assigned to that name
# at notebook level, which is not necessarily the most recently trained model.
# plot loss during training
plt.subplot(211)
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
# The val_* series come from the rows passed as validation_data, not from the test split.
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
# plot accuracy during training
plt.subplot(212)
plt.title('Accuracy')
plt.plot(history.history[acc_key], label='train')
plt.plot(history.history['val_' + acc_key], label='validation')
plt.legend()
plt.show()

In [None]:
# model metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, cohen_kappa_score, roc_auc_score, confusion_matrix
def get_metrics(model, testX, testy):
    """Print and return classification metrics for ``model`` on a labelled set.

    Parameters
    ----------
    model : fitted Keras classifier whose ``predict`` returns one row of
        class probabilities per sample.
    testX : feature matrix accepted by ``model.predict``.
    testy : true integer class ids, one per row of ``testX``.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall``, ``f1``, ``kappa`` and
        ``confusion_matrix``. The same values are printed.
    """
    # One forward pass only: the predicted class is the arg-max of the
    # probabilities, which is what the removed `predict_classes` did for
    # multi-class softmax outputs.
    yhat_probs = model.predict(testX, verbose=0)
    yhat_classes = yhat_probs.argmax(axis=1)

    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(testy, yhat_classes)
    print('Accuracy: %f' % accuracy)
    # Micro-averaging pools every decision before computing the score, so for
    # single-label multi-class data these three equal the accuracy.
    # precision tp / (tp + fp)
    precision = precision_score(testy, yhat_classes, average='micro')
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(testy, yhat_classes, average='micro')
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(testy, yhat_classes, average='micro')
    print('F1 score: %f' % f1)

    # kappa
    kappa = cohen_kappa_score(testy, yhat_classes)
    print('Cohens kappa: %f' % kappa)
    # confusion matrix (rows: true class, columns: predicted class)
    matrix = confusion_matrix(testy, yhat_classes)
    print(matrix)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'kappa': kappa,
        'confusion_matrix': matrix,
    }

# Prepare text for prediction

In [None]:
cd class_tables/

In [None]:
#process for prediction
pred_df = pd.read_csv('midas_labeled_data_Q12018_NaN_only.csv', nrows=1000)
# Keep only the note text and drop missing notes on the frame itself.
# dropna(inplace=True) on a column selection does not remove rows from
# pred_df, which leaves NaN values for the vectorizer to choke on.
# The index is renumbered so row labels equal row positions in the
# prediction array.
pred_df = pred_df[['cleaned_note_unstructured']].dropna().reset_index(drop=True)
type(pred_df)

In [None]:
def prediction_results(scores, df, class_dict):
    """Return a copy of ``df`` with a ``midas_score`` column of predicted labels.

    ``scores`` holds one row of class scores per row of ``df``; the position
    of the highest score in each row is looked up in ``class_dict`` to get the
    label. ``df`` itself is not modified.

    Side effect: raises pandas' ``display.max_colwidth`` option to 2000.
    """
    labelled = df.copy(deep=True)
    labelled['midas_score'] = [class_dict[row.argmax()] for row in scores]
    pd.options.display.max_colwidth = 2000
    return labelled

In [None]:
# Select the text column by name: squeeze() would collapse a one-row frame to
# a bare string, which the vectorizer would then treat as a sequence of
# one-character documents.
prediction = model.predict(vectorizer.transform(pred_df['cleaned_note_unstructured']))

In [None]:
prediction

In [None]:
# Attach the predicted label to every unlabelled note.
pred_results = prediction_results(prediction, pred_df, class_dict)
pred_results

In [None]:
from lime import lime_text
from sklearn.pipeline import make_pipeline
c = make_pipeline(vectorizer, model)

In [None]:
# class_dict maps category_id -> label. Build a list ordered by id so that
# class_names[i] is the label of class i. The original `class_dict.values`
# (no call) stored the bound method itself, which cannot be indexed.
class_names = [class_dict[class_id] for class_id in sorted(class_dict)]
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# Positional lookup: X_test may still carry the shuffled labels of the original frame.
X_test.iloc[1]

In [None]:
X_test_onehot[0]

In [None]:
idx = 1
# Take the note by position so it is the same row that y_test.iloc[idx] labels.
note = X_test.iloc[idx]
exp = explainer.explain_instance(note, c.predict_proba, num_features=6, labels=[0, 1])
print('Document id: %d' % idx)
# The predicted class is the arg-max over the class probabilities; casting the
# first probability to int (as before) always printed class 0 or 1.
predicted_id = int(model.predict(vectorizer.transform([note])).argmax(axis=1)[0])
print('Predicted class =', class_dict[predicted_id])
print('True class: %s' % class_dict[y_test.iloc[idx]])

In [None]:
# Only labels 0 and 1 were requested from explain_instance, so only those can
# be listed here (label=17 was never computed and raises).
print ('Explanation for class %s' % class_dict[0])
print ('\n'.join(map(str, exp.as_list(label=0))))
print ()
print ('Explanation for class %s' % class_dict[1])
print ('\n'.join(map(str, exp.as_list(label=1))))

In [None]:
class_dict

In [None]:
# Score the model on the labelled test split. pred_df holds different,
# unlabelled notes, so comparing its predictions with y_test is meaningless.
get_metrics(model, vectorizer.transform(X_test), y_test)

In [None]:
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

def explain_the_model(idx, label=(0,)):
    """Explain the current model's prediction for one test note with LIME.

    Prints the note, the predicted and true class, and the weighted features
    for every requested class, then renders the explanation in the notebook.

    Parameters
    ----------
    idx : int
        Row position of the note in ``X_test`` / ``y_test``.
    label : sequence of int, optional
        Class ids to explain. Defaults to ``(0,)``.

    Returns
    -------
    The LIME explanation object.
    """
    class_names = midas_final_unstructured_id_df['midas_final_unstructured'].values.tolist()
    print('Class names: {}'.format(class_names))
    explainer = LimeTextExplainer(class_names=class_names)
    c = make_pipeline(vectorizer, model)

    # Text, prediction and true label all refer to the same test row.
    note = X_test.iloc[idx]
    print(note)
    labels = list(label)
    exp = explainer.explain_instance(note, c.predict_proba, num_features=6, labels=labels)
    print('Document id: %d' % idx)
    # Predicted class = arg-max over the class probabilities for this note.
    predicted_id = int(model.predict(vectorizer.transform([note])).argmax(axis=1)[0])
    print('Predicted class =', class_names[predicted_id])
    print('True class: %s' % class_names[y_test.iloc[idx]])

    # Explanations exist only for the labels that were requested above.
    for position, class_id in enumerate(labels):
        if position:
            print ()
        print ('Explanation for class %s' % class_names[class_id])
        print ('\n'.join(map(str, exp.as_list(label=class_id))))
    exp.show_in_notebook(text=False)
    return exp

In [None]:
from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline

class_names = midas_final_unstructured_id_df['midas_final_unstructured'].values.tolist()
c = make_pipeline(vectorizer, model)
print('Class names: {}'.format(class_names))
explainer = LimeTextExplainer(class_names=class_names)
# `label` was never defined at notebook level; the cell prints explanations
# for classes 0 and 1, so request exactly those.
labels = [0, 1]
# Text, prediction and true label all refer to the same test row.
note = X_test.iloc[idx]
exp = explainer.explain_instance(note, c.predict_proba, num_features=6, labels=labels)
print('Document id: %d' % idx)
# Predicted class = arg-max over the class probabilities for this note.
predicted_id = int(model.predict(vectorizer.transform([note])).argmax(axis=1)[0])
print('Predicted class =', class_names[predicted_id])
print('True class: %s' % class_names[y_test.iloc[idx]])
print ('Explanation for class %s' % class_names[0])
print ('\n'.join(map(str, exp.as_list(label=0))))
print ()
print ('Explanation for class %s' % class_names[1])
print ('\n'.join(map(str, exp.as_list(label=1))))
exp.show_in_notebook(text=False)

In [None]:
X_train

In [None]:
pred_df['cleaned_note_unstructured'][40]

In [None]:
idx = 40#row number of unlabelled data
# exp = explain_the_model(idx, [0, 1])
# `text` is an on/off flag for showing the explained document, so pass True
# rather than looking up a note (X_test[idx] is a label lookup and can raise).
# NOTE: `exp` is the most recently computed explanation; with the line above
# commented out it is not an explanation of row 40.
exp.show_in_notebook(text=True, labels=(0,))

In [None]:
from keras.models import load_model

# Creates a HDF5 file 'my_model.h5'
model.save('DNN_0.853.h5') # 250 gigs

# Deletes the existing model
# del model  

# Returns a compiled model identical to the previous one
# model = load_model('my_model.h5')

# Convolutional Neural Network

In [None]:

word2idx = {word: idx for idx, word in enumerate(vectorizer.get_feature_names())}
tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()
 
def to_sequence(tokenizer, preprocessor, index, text):
    """Turn ``text`` into the list of vocabulary ids of its known tokens.

    ``text`` is passed through ``preprocessor`` and then ``tokenizer``; tokens
    missing from ``index`` are skipped and the original token order is kept.
    """
    token_ids = []
    for token in tokenizer(preprocessor(text)):
        if token in index:
            token_ids.append(index[token])
    return token_ids
 
print(to_sequence(tokenize, preprocess, word2idx, "This is an important test!"))  # [2269, 4453]
X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in X_train]
print(X_train_sequences[0])

In [None]:

# Compute the max length of a text (in known-vocabulary tokens).
# NOTE: this rebinds MAX_SEQ_LENGTH, which the dense models above used with a
# different value (the whitespace word count of the longest raw note).
MAX_SEQ_LENGTH = len(max(X_train_sequences, key=len))
print("MAX_SEQ_LENGTH=", MAX_SEQ_LENGTH)
 
from keras.preprocessing.sequence import pad_sequences
# Real word ids run from 0 to N_FEATURES - 1, so N_FEATURES is free to serve as
# the padding id; the Embedding layers below are sized N_FEATURES + 1 to hold it.
N_FEATURES = len(vectorizer.get_feature_names())
# Overwrites the list of variable-length id lists with the padded result.
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGTH, value=N_FEATURES)
print(X_train_sequences[0])

In [None]:

from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding
 
# CNN 1: trainable 64-d embedding -> one Conv1D/MaxPooling1D stage -> dense classifier.
model = Sequential()
# Vocabulary size + 1: the extra row is the embedding of the padding id.
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    64,  # Embedding size
                    input_length=MAX_SEQ_LENGTH))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(units=64, activation='relu'))
# One softmax unit per class (6 is hard-coded).
model.add(Dense(units=6, activation='softmax'))

# The sparse loss takes the integer category_id labels as-is (no one-hot targets needed).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

In [None]:

model.fit(X_train_sequences[:-100], y_train[:-100], 
          epochs=3, batch_size=512, verbose=1,
          validation_data=(X_train_sequences[-100:], y_train[-100:]))

In [None]:
X_test_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in X_test]
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGTH, value=N_FEATURES)

In [None]:
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1]) 

In [None]:
# model.save('CNN_0.664.h5')

# CNN Model 2

In [None]:

from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding
 
# CNN 2: trainable 64-d embedding -> three Conv1D stages (256, 128, 64 filters;
# max-pooling after the first two) -> dense classifier.
model = Sequential()
# Vocabulary size + 1: the extra row is the embedding of the padding id.
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    64,  # Embedding size
                    input_length=MAX_SEQ_LENGTH))
model.add(Conv1D(256, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(64, 5, activation='relu'))
model.add(Flatten())
model.add(Dense(units=64, activation='relu'))
# One softmax unit per class (6 is hard-coded).
model.add(Dense(units=6, activation='softmax'))

# The sparse loss takes the integer category_id labels as-is (no one-hot targets needed).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

In [None]:

model.fit(X_train_sequences[:-100], y_train[:-100], 
          epochs=3, batch_size=512, verbose=1,
          validation_data=(X_train_sequences[-100:], y_train[-100:]))

In [None]:
X_test_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in X_test]
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGTH, value=N_FEATURES)

In [None]:
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1]) 

# LSTM network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
 
# LSTM: trainable 64-d embedding -> single 64-unit LSTM -> softmax classifier.
model = Sequential()
# Vocabulary size + 1: the extra row is the embedding of the padding id.
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    64,  # Embedding size
                    input_length=MAX_SEQ_LENGTH))
model.add(LSTM(64))
# One softmax unit per class (6 is hard-coded).
model.add(Dense(units=6, activation='softmax'))

# The sparse loss takes the integer category_id labels as-is (no one-hot targets needed).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

In [None]:
model.fit(X_train_sequences[:-100], y_train[:-100], 
          epochs=2, batch_size=128, verbose=1, 
          validation_data=(X_train_sequences[-100:], y_train[-100:]))

In [None]:
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1]) 
 

In [None]:
# model.save('LSTM_0.723.h5')

In [None]:
!python -m spacy download en

# spaCy Transfer Learning

In [None]:
import spacy
import numpy as np
nlp = spacy.load('en') #python -m spacy download en

# Vector width of the loaded spaCy model.
EMBEDDINGS_LEN = len(nlp.vocab['apple'].vector)
print("EMBEDDINGS_LEN=", EMBEDDINGS_LEN)  # 300

# Row i holds the spaCy vector of vocabulary word i. The extra final row and
# the rows of words without a vector stay all-zero.
embeddings_index = np.zeros((len(vectorizer.get_feature_names()) + 1, EMBEDDINGS_LEN))
# The loop variable is deliberately not called `idx`, so the notebook-level
# `idx` used by the explanation cells is not overwritten.
for word, word_idx in word2idx.items():
    lexeme = nlp.vocab[word]
    # Explicit check instead of a bare `except: pass`, which would also have
    # hidden genuine errors such as a shape mismatch.
    if lexeme.has_vector:
        embeddings_index[word_idx] = lexeme.vector
 

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
 
# LSTM on frozen pre-trained vectors: the embedding layer is initialised from
# embeddings_index and excluded from training.
model = Sequential()
# Vocabulary size + 1: the extra row is the embedding of the padding id.
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    EMBEDDINGS_LEN,  # Embedding size
                    weights=[embeddings_index],
                    input_length=MAX_SEQ_LENGTH,
                    trainable=False))
model.add(LSTM(300))
# One softmax unit per class (6 is hard-coded).
model.add(Dense(units=6, activation='softmax'))

# The sparse loss takes the integer category_id labels as-is (no one-hot targets needed).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

In [None]:
model.fit(X_train_sequences[:-100], y_train[:-100], 
          epochs=1, batch_size=128, verbose=1, 
          validation_data=(X_train_sequences[-100:], y_train[-100:]))
 
scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Accuracy:", scores[1])  
 

# GloVe Transfer learning

In [None]:
helper.cd_main_data()
!wget -O glove.6B.zip http://nlp.stanford.edu/data/glove.6B.zip
helper.unzip('glove.6B.zip')

In [None]:
ls

In [None]:
import numpy as np
 
GLOVE_PATH = HOME_DIR+'/main_data/glove.6B/glove.6B.50d.txt'
GLOVE_VECTOR_LENGTH = 50
 
def read_glove_vectors(path, length):
    """Load GloVe vectors from a text file into a ``{token: vector}`` dict.

    Each line is expected to hold a token followed by ``length``
    whitespace-separated numbers.

    Parameters
    ----------
    path : str
        Path of the GloVe ``.txt`` file.
    length : int
        Number of vector components expected on every line.

    Returns
    -------
    dict
        Maps each token to a float32 numpy array of ``length`` values.

    Raises
    ------
    AssertionError
        If a line does not contain exactly ``length + 1`` fields.
    """
    embeddings = {}
    # Read explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which fails or garbles non-ASCII tokens on some systems.
    with open(path, encoding='utf-8') as glove_f:
        for line_no, line in enumerate(glove_f, start=1):
            chunks = line.split()
            assert len(chunks) == length + 1, (
                'line %d of %s has %d fields, expected %d'
                % (line_no, path, len(chunks), length + 1))
            embeddings[chunks[0]] = np.array(chunks[1:], dtype='float32')

    return embeddings
 
GLOVE_INDEX = read_glove_vectors(GLOVE_PATH, GLOVE_VECTOR_LENGTH)
 
# Init the embeddings layer with GloVe embeddings: row i holds the GloVe vector
# of vocabulary word i. The extra final row and the rows of words missing from
# GloVe stay all-zero.
embeddings_index = np.zeros((len(vectorizer.get_feature_names()) + 1, GLOVE_VECTOR_LENGTH))
# The loop variable is deliberately not called `idx`, so the notebook-level
# `idx` used by the explanation cells is not overwritten.
for word, word_idx in word2idx.items():
    # Explicit lookup instead of a bare `except: pass`, which would also have
    # hidden genuine errors such as a vector-length mismatch.
    glove_vector = GLOVE_INDEX.get(word)
    if glove_vector is not None:
        embeddings_index[word_idx] = glove_vector
 

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
 
# LSTM on frozen GloVe vectors: the embedding layer is initialised from
# embeddings_index and excluded from training.
model = Sequential()
# Vocabulary size + 1: the extra row is the embedding of the padding id.
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    GLOVE_VECTOR_LENGTH,  # Embedding size
                    weights=[embeddings_index],
                    input_length=MAX_SEQ_LENGTH,
                    trainable=False))
model.add(LSTM(128))
# One softmax unit per class (6 is hard-coded).
model.add(Dense(units=6, activation='softmax'))

# The sparse loss takes the integer category_id labels as-is (no one-hot targets needed).
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

# Fit on all but the last 100 training rows; those 100 rows are the validation set.
model.fit(X_train_sequences[:-100], y_train[:-100],
          epochs=3, batch_size=128, verbose=1,
          validation_data=(X_train_sequences[-100:], y_train[-100:]))

scores = model.evaluate(X_test_sequences, y_test, verbose=1)
print("Test Accuracy:", scores[1])

In [None]:
# model.save('GloVe_0.671.h5')

In [None]:
#the model input shape should be the length of the longest text in the data
def prepare_text_for_pred(df):
    """Convert notes into padded id sequences matching the training input width.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        A single-column frame (or a Series) of note texts.

    Returns
    -------
    Array from ``pad_sequences`` with one row per note, padded with
    ``N_FEATURES`` to the length of the longest training sequence.
    """
    # Squeeze the column axis only: a plain squeeze() turns a one-row frame
    # into a bare string, which would then be iterated character by character.
    texts = df.squeeze(axis=1) if isinstance(df, pd.DataFrame) else df
    X_pred_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in texts]
    # Local name, so the notebook-level MAX_SEQ_LENGTH is not shadowed.
    train_width = len(max(X_train_sequences, key=len))
    return pad_sequences(X_pred_sequences, maxlen=train_width, value=N_FEATURES)
    

In [None]:
pred_df = pd.read_csv('midas_labeled_data_Q12018_grade1_only.csv', nrows=10)
# Select the note column first and only then drop missing values. A
# frame-wide dropna would also discard rows whose note is present but which
# have a NaN in some unrelated column.
pred_df = pred_df[['cleaned_note_unstructured']].dropna().reset_index(drop=True)
type(pred_df)

In [None]:
X_pred_sequences = prepare_text_for_pred(pred_df)

In [None]:
scores = model.predict(X_pred_sequences)

In [None]:
print(scores)

In [None]:
def prediction_results(scores, df, class_dict):
    """Label every row of ``df`` with the class that has the highest score.

    NOTE: this redefines the function of the same name declared earlier in
    the notebook, with the same behaviour.

    ``scores`` is iterated row by row; the arg-max position of each row is
    translated through ``class_dict``. A deep copy of ``df`` with the extra
    ``midas_score`` column is returned, and pandas' ``display.max_colwidth``
    option is set to 2000 as a side effect.
    """
    winners = [row.argmax() for row in scores]
    df_prediction = df.copy(deep=True)
    df_prediction['midas_score'] = list(map(class_dict.__getitem__, winners))
    pd.options.display.max_colwidth = 2000
    return df_prediction

In [None]:
ls

In [None]:
# df_prediction only exists inside prediction_results; build it here before inspecting it.
df_prediction = prediction_results(scores, pred_df, class_dict)
df_prediction.shape

In [None]:
# arg_max is local to prediction_results; recompute the predicted class ids from the scores.
len(scores.argmax(axis=1))
