In [None]:
%run ./../data/load-tokenized-dataset.ipynb
%run ./../utils/_callbacks.ipynb
%run ./../word2vec/_load-w2v-model.ipynb
%matplotlib inline

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, SpatialDropout1D, Masking, Embedding, LSTM
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# --- Tunable constants -------------------------------------------------------
ITERS = 5           # number of epochs passed to model.fit, also the x-range of the plots
SAMPLE_LEN = 1024   # every document is truncated / padded to this many token ids
TOP_WORDS = None    # forwarded as keep_n to dictionary.filter_extremes below
RANDOM_SEED = 0     # shared by TensorFlow, train_test_split and the padding-row initializer

# --- One-off side effects ----------------------------------------------------
tf.random.set_seed(RANDOM_SEED)
# Stricter (op-level) determinism is intentionally left switched off:
# tf.config.experimental.enable_op_determinism()

# Mutates `dictionary` (loaded by the %run'd dataset notebook) in place.
# NOTE(review): presumably a gensim Dictionary; with these thresholds and
# keep_n=None the call should not drop any token - confirm against gensim docs.
dictionary.filter_extremes(keep_n=TOP_WORDS, no_above=1, no_below=1)

In [None]:
# Token -> row index lookup of the word2vec vocabulary; used to encode documents.
key2index = w2v_model.wv.key_to_index

# Matrix used to initialise the Embedding layer further down.
# NOTE(review): `syn1neg` is presumably gensim's negative-sampling *output*
# layer, not the input word vectors (`w2v_model.wv.vectors`) - confirm this
# choice is intentional.
w2v_weights = w2v_model.syn1neg

In [None]:
# Build the padded id matrix X and the label vector y, then split them into
# train / validation / test tensors.
corpus, labels = df[[proc_doc_col, label_col]].T.values

# Padding uses the id one past the last dictionary entry; the model cell reserves
# the matching extra embedding row. It must never collide with a real token id.
pad_index = len(dictionary)
assert len(key2index) <= pad_index, (
    f"padding id {pad_index} collides with a word2vec token id "
    f"(word2vec vocabulary has {len(key2index)} entries)"
)

# Encode every document as word2vec row indices, silently skipping
# out-of-vocabulary tokens, and cut to at most SAMPLE_LEN ids.
encoded_docs = [[key2index[token] for token in doc if token in key2index][:SAMPLE_LEN]
                for doc in corpus]

# Drop documents that contain no known token. This has to happen BEFORE padding:
# once padded, every row has length SAMPLE_LEN and a length check can no longer
# tell an empty document from a real one.
has_tokens = np.array([len(doc) > 0 for doc in encoded_docs], dtype=bool)
assert has_tokens.any(), "no document contains a token known to the word2vec model"

X = pad_sequences([doc for doc in encoded_docs if doc],
                  maxlen=SAMPLE_LEN, padding='post', truncating='post', value=pad_index)
y = labels[has_tokens].astype(int)
assert len(X) == len(y)

# 75 % / 25 % train+val / test, then 70 % / 30 % train / val of the remainder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=RANDOM_SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.3, random_state=RANDOM_SEED)

X_train, X_val, X_test = tf.constant(X_train), tf.constant(X_val), tf.constant(X_test)
y_train, y_val, y_test = tf.constant(y_train), tf.constant(y_val), tf.constant(y_test)

In [None]:
class PadMaskedEmbedding(Embedding):
    """Embedding layer that emits a Keras mask hiding one designated padding id.

    The stock ``Embedding`` can only mask id 0 (``mask_zero=True``), and a
    ``Masking`` layer placed in front of it does not help: on 2-D integer input
    it reduces over the time axis, and ``Embedding`` emits no mask of its own
    unless ``mask_zero`` is set. This subclass masks ``pad_index`` instead, so
    downstream recurrent layers skip padded time steps.

    Args:
        pad_index: integer id used for padding in the input sequences.
        **kwargs: forwarded unchanged to ``tf.keras.layers.Embedding``.
    """

    def __init__(self, pad_index, **kwargs):
        super().__init__(**kwargs)
        self.pad_index = pad_index
        # Embedding switches mask support off when mask_zero is False; turn it
        # back on so Keras calls compute_mask and propagates the result.
        self.supports_masking = True

    def compute_mask(self, inputs, mask=None):
        """Return a boolean (batch, timesteps) mask, False on padding ids."""
        return tf.not_equal(inputs, self.pad_index)

    def get_config(self):
        """Include ``pad_index`` so the layer can be re-created from its config."""
        return {**super().get_config(), "pad_index": self.pad_index}


# Size the embedding from the pretrained matrix instead of a hard-coded 512, so
# the layer always matches the word2vec model that was loaded.
vocab_size, embedding_dim = w2v_weights.shape

# The data cell pads with len(dictionary); that id needs its own embedding row
# directly after the pretrained rows.
pad_index = len(dictionary)
assert vocab_size == pad_index, (
    f"word2vec matrix has {vocab_size} rows but the dictionary has {pad_index} "
    "entries; the padding row would not line up with the padding id"
)

# Pretrained rows followed by one randomly initialised row for the padding id.
pad_vector = tf.keras.initializers.random_uniform(seed=RANDOM_SEED)(shape=[embedding_dim]).numpy()
embedding_matrix = np.vstack([w2v_weights, pad_vector])

model = Sequential([
    Input(shape=(None, ), dtype=tf.int32),
    PadMaskedEmbedding(pad_index=pad_index, input_dim=pad_index + 1, output_dim=embedding_dim,
                       trainable=True, weights=[embedding_matrix]),
    SpatialDropout1D(0.2),
    LSTM(512, activation=tf.nn.tanh, recurrent_activation=tf.nn.sigmoid),
    Dense(128, activation=tf.nn.swish),
    Dropout(0.2),
    Dense(1, activation=tf.nn.sigmoid)
])
model.summary()

In [None]:
# Compile and train, then collect per-epoch loss / accuracy for the three splits
# into a table with the best and worst epochs highlighted.
model.compile(loss='binary_crossentropy', metrics=['binary_accuracy'], optimizer=RMSprop(1e-3))

# NOTE(review): KerasEpochCallback comes from the %run'd _callbacks notebook;
# presumably it calls end_func(*end_args, **end_kwargs) after every epoch and
# appends the return value to `end_results` - confirm there.
test_hist = KerasEpochCallback(end_func=model.evaluate, end_args=(X_test, y_test), end_kwargs={'verbose': False})
train_hist = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                       batch_size=16, epochs=ITERS, shuffle=True, callbacks=[test_hist])

# One row per epoch, columns = (loss, accuracy).
history = train_hist.history
train_metrics = np.stack([history['loss'], history['binary_accuracy']], axis=1)
val_metrics = np.stack([history['val_loss'], history['val_binary_accuracy']], axis=1)
test_metrics = np.array(test_hist.end_results)

loss_cols = ["Training Loss", "Validation Loss", "Test Loss"]
acc_cols = ["Training Accuracy", "Validation Accuracy", "Test Accuracy"]

results = []
for per_split in zip(train_metrics, val_metrics, test_metrics):
    row = {}
    # Insert loss then accuracy for each split so the column order is
    # Training, Validation, Test with loss before accuracy.
    for loss_name, acc_name, (loss, acc) in zip(loss_cols, acc_cols, per_split):
        row[loss_name] = loss
        row[acc_name] = acc
    results.append(row)

results_df = pd.DataFrame(results)
results_df.index += 1  # show epochs as 1..ITERS instead of 0-based

# Green marks the best epoch of a column, red the worst.
(results_df.style
    .highlight_min(subset=acc_cols, color='lightcoral')
    .highlight_max(subset=acc_cols, color='lightgreen')
    .highlight_min(subset=loss_cols, color='lightgreen')
    .highlight_max(subset=loss_cols, color='lightcoral'))

In [None]:
# Per-epoch loss for the three splits (column 0 of each metrics array).
epoch_positions = np.arange(0, ITERS)
ax = plt.gca()
for metrics, colour, name in ((train_metrics, '#1f77b4', "Training"),
                              (val_metrics, '#d62728', "Validation"),
                              (test_metrics, '#2ca02c', "Test")):
    ax.plot(epoch_positions, metrics.T[0], marker='o', c=colour, label=name)
ax.legend()
# Tick positions are 0-based; label them as epochs 1..ITERS.
plt.xticks(epoch_positions, epoch_positions + 1)
plt.title("Loss")
plt.tight_layout()

In [None]:
# Per-epoch accuracy for the three splits (column 1 of each metrics array).
epoch_positions = np.arange(0, ITERS)
ax = plt.gca()
for metrics, colour, name in ((train_metrics, '#1f77b4', "Training"),
                              (val_metrics, '#d62728', "Validation"),
                              (test_metrics, '#2ca02c', "Test")):
    ax.plot(epoch_positions, metrics.T[1], marker='o', c=colour, label=name)
ax.legend()
# Tick positions are 0-based; label them as epochs 1..ITERS.
plt.xticks(epoch_positions, epoch_positions + 1)
plt.title("Accuracy")
plt.tight_layout()

In [None]:
# Confusion matrix of the final model on the held-out test split.
true_preds = model.predict(X_test).flatten()  # sigmoid outputs in [0, 1]
y_preds = true_preds.round()                  # hard 0 / 1 decisions

# from_predictions expects the ground truth FIRST and the predictions second.
# Passing them the other way round transposes the matrix, mislabels the axes and
# makes normalize='true' normalise over predicted instead of true labels.
ConfusionMatrixDisplay.from_predictions(y_true=np.asarray(y_test), y_pred=y_preds,
                                        normalize='true', colorbar=False,
                                        cmap=plt.cm.Blues, display_labels=("reliable", "unreliable"))