In [None]:
import train_embeddings
import matplotlib.pyplot as plt
import sentence_features
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc, precision_recall_curve, average_precision_score
import pickle
from random import shuffle

import numpy as np
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.preprocessing import sequence

# Pin NumPy's global random generator so NumPy-driven randomness repeats when
# the notebook is run top to bottom on a fresh kernel. Only NumPy is seeded
# here; other generators (stdlib `random`, the Keras backend) are not.
np.random.seed(seed=1)

In [None]:
# --- Network architecture ---
sz = 3                            # kernel size shared by every convolution layer
embedding_dim = 250               # declared width of each embedding vector
num_convs = [64, 128, 256, 512]   # filter count for each convolutional stage
hidden_dims = [2048, 1024, 512]   # unit count for each fully connected layer

# --- Training ---
num_epochs = 7
batch_size = 64

# --- Preprocessing ---
sequence_length = 200             # declared number of positions per input sequence

In [None]:
# Load Data
# fastText model handed to the report vectorizer below.
fastText_path = "../data/models/fastText/fastText_sw_cbow120.bin"
ft = train_embeddings.load_fastText_model(fastText_path)

# NOTE(security): unpickling can execute arbitrary code, so only point this at
# files produced by this project.
mapped_reports_path = "../data/processed/processed_reports/preprocessed_findings_replace_sw"
with open(mapped_reports_path, 'rb') as reports_file:  # handle is closed once loading finishes
    mapped_reports = pickle.load(reports_file)

# Shuffle in place through NumPy's global generator, which the first cell seeds,
# so the train/test split below is the same on every top-to-bottom run. The
# stdlib `random.shuffle` used before was never seeded.
np.random.shuffle(mapped_reports)

# Vectorize the reports and split them into model inputs and labels.
# NOTE(review): the trailing None is presumably a pass-through final step —
# confirm against the installed scikit-learn version.
pipeline = make_pipeline(train_embeddings.FastTextReportVectorizer(ft, granularity="word", pad_len=230), sentence_features.LabelSeparator(), None)
data, labels = pipeline.transform(mapped_reports)

# First 85% of the shuffled samples are used for training, the rest for testing.
split_point = int(0.85 * len(data))

trainingX = np.array(data[:split_point])
trainingY = np.array(labels[:split_point])
# Distinct label values and how often each occurs in the training split.
print(np.unique(trainingY, return_counts=True))

testingX = np.array(data[split_point:])
testingY = np.array(labels[split_point:])
# Same label breakdown for the testing split.
print(np.unique(testingY, return_counts=True))

In [None]:
# Sanity-check the dimensions of both splits before building the model.
print("Training X shape: " + str(trainingX.shape))
print("Training Y shape: " + str(trainingY.shape))
print("Testing X shape: " + str(testingX.shape))
# Report the label array here; this line previously printed testingX.shape again.
print("Testing Y shape: " + str(testingY.shape))

In [None]:
# Build and compile the 1-D convolutional binary classifier.

# Take the per-sample input shape from the array that is actually fed to the
# network, so the model cannot drift from the vectorizer settings.
# NOTE(review): the data-loading cell pads with pad_len=230 while
# sequence_length is 200 — confirm which value is intended. When the declared
# hyperparameters already match the data, this is identical to using them.
declared_shape = (sequence_length, embedding_dim)
input_shape = tuple(trainingX.shape[1:])
if input_shape != declared_shape:
    print("Warning: data shape " + str(input_shape) + " differs from declared shape " + str(declared_shape) + "; building the model for the data shape.")

model_input = Input(shape=input_shape)

# Initial convolution applied directly to the embedded sequence.
z = Convolution1D(filters=64, kernel_size=sz, padding="valid", activation="relu", strides=1)(model_input)

# One stage per entry in num_convs: two convolutions with that many filters,
# followed by max pooling with pool size 2.
for nc in num_convs:
    z = Convolution1D(filters=nc, kernel_size=sz, padding="valid", activation="relu", strides=1)(z)
    z = Convolution1D(filters=nc, kernel_size=sz, padding="valid", activation="relu", strides=1)(z)
    z = MaxPooling1D(pool_size=2)(z)

# Collapse the feature maps into one vector per sample for the dense layers.
z = Flatten()(z)

# Fully connected head: each dense layer is followed by dropout at rate 0.3.
for hd in hidden_dims:
    z = Dense(hd, activation="relu")(z)
    z = Dropout(0.3)(z)
# Single sigmoid unit, paired with binary cross-entropy below.
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.summary()
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train for the configured number of epochs. The testing split is passed as
# validation data, so it is evaluated alongside training.
model.fit(
    x=trainingX,
    y=trainingY,
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=1,
    validation_data=(testingX, testingY),
)

In [None]:
# Score the testing split; the model ends in a single sigmoid unit, so each
# prediction is a value between 0 and 1.
predictions = model.predict(x=testingX)

In [None]:
# ROC analysis of the testing-split predictions, with label 1 as the positive class.
fpr, tpr, thresholds = roc_curve(testingY, predictions, pos_label=1)
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()

# Model curve, with the area under it shown in the legend.
ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Write the trained model to disk as an .h5 file.
model.save(filepath='../data/processed/vdcnn_120.h5')