In [None]:
%run ./../data/load-dataset.ipynb
%run ./../various/_epoch-callback.ipynb
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, GRU, Bidirectional, Dense, Dropout
from tensorflow.keras.regularizers import l1, l2, l1_l2

In [None]:
# Number of training epochs; also the number of points on the metric plots.
ITERS = 10

# Vocabulary cap forwarded as `keep_n`; None applies no cap.
TOP_WORDS = None
# Assuming `dictionary` is a gensim Dictionary: `no_above` is a *fraction* of the
# documents, not a count, so 1.0 is the way to disable the upper document-frequency
# cut-off. With no_below=1 only `keep_n` can then shrink the vocabulary. (The
# previous value, the maximum collection frequency, only worked by accident and
# raised on an empty dictionary.)
dictionary.filter_extremes(no_below=1, no_above=1.0, keep_n=TOP_WORDS)

# One seed shared by the Keras RNG setup and the scikit-learn splits.
RANDOM_SEED = 0
tf.keras.utils.set_random_seed(RANDOM_SEED)

In [None]:
# Encode every pre-processed document as a list of vocabulary indices.
# doc2idx marks out-of-vocabulary tokens with -1; those are dropped.
# NOTE(review): `df`, `proc_doc_col`, `label_col` and `dictionary` are presumably
# defined by the %run'd load-dataset notebook - confirm.
corpus, labels = df[[proc_doc_col, label_col]].T.values
X = [[idx for idx in dictionary.doc2idx(doc) if idx != -1] for doc in corpus]
y = labels.astype(int)
# Discard documents that ended up empty, keeping samples and labels aligned.
X, y = zip(*[(sample, label) for sample, label in zip(X, y) if len(sample) > 0])

# Truncation length: the median document length rounded up to the next multiple
# of 100 (an exact multiple is kept as is).
median_len = int(np.median([len(doc) for doc in X]).round())
sample_len = ((median_len + 99) // 100) * 100
X = [doc[:sample_len] for doc in X]

# Hold out 25% for testing, then carve the validation set out of the remaining
# training portion only. Splitting the full X, y a second time would put test
# samples into the train/validation sets and invalidate the test metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=RANDOM_SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.3, random_state=RANDOM_SEED)
assert len(X_train) + len(X_val) + len(X_test) == len(X), "splits must partition the dataset"

# Variable-length sequences go in as ragged tensors, labels as dense tensors.
X_train, X_val, X_test = tf.ragged.constant(X_train), tf.ragged.constant(X_val), tf.ragged.constant(X_test)
y_train, y_val, y_test = tf.constant(y_train), tf.constant(y_val), tf.constant(y_test)

In [None]:
# Binary sequence classifier assembled layer by layer:
# ragged token ids -> 100-d L2-regularised embeddings -> LSTM(100) -> one sigmoid unit.
model = Sequential()
model.add(Input(shape=[None], ragged=True, dtype=tf.int32))
model.add(Embedding(input_dim=len(dictionary), output_dim=100, embeddings_regularizer=l2(1e-3)))
model.add(LSTM(100))
model.add(Dense(1, activation=tf.nn.sigmoid))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

# One callback per held-out split, each handed model.evaluate and that split.
# NOTE(review): KerasEpochCallback comes from the %run'd _epoch-callback notebook;
# presumably it calls end_func(*end_args, **end_kwargs) after every epoch and
# collects the returns in `.end_results` - confirm.
val_hist, test_hist = (
    KerasEpochCallback(end_func=model.evaluate, end_args=split, end_kwargs={'verbose': False})
    for split in ((X_val, y_val), (X_test, y_test))
)

model.summary()

In [None]:
# Fit the model; the two callbacks gather validation and test metrics as training runs.
train_hist = model.fit(X_train, y_train, epochs=ITERS, batch_size=16, callbacks=[val_hist, test_hist])

# One (loss, accuracy) row per epoch for each split.
train_metrics = np.array(list(zip(*(train_hist.history[key] for key in ('loss', 'binary_accuracy')))))
val_metrics = np.array(val_hist.end_results)
test_metrics = np.array(test_hist.end_results)

# Column layout of the summary table: loss/accuracy pairs for train, validation, test.
column_order = ['Training Loss', 'Training Accuracy',
                'Validation Loss', 'Validation Accuracy',
                'Test Loss', 'Test Accuracy']
loss_columns = column_order[0::2]
accuracy_columns = column_order[1::2]

results = [
    dict(zip(column_order, (train_loss, train_acc, val_loss, val_acc, test_loss, test_acc)))
    for (train_loss, train_acc), (val_loss, val_acc), (test_loss, test_acc)
    in zip(train_metrics, val_metrics, test_metrics)
]

results_df = pd.DataFrame(results)
results_df.index += 1  # label rows by 1-based epoch number

# Per column: best value in green (highest accuracy / lowest loss), worst in coral.
(
    results_df[column_order].style
    .highlight_min(subset=accuracy_columns, color='lightcoral')
    .highlight_max(subset=accuracy_columns, color='lightgreen')
    .highlight_min(subset=loss_columns, color='lightgreen')
    .highlight_max(subset=loss_columns, color='lightcoral')
)

In [None]:
# Loss per epoch for the three splits (column 0 of each metrics array).
ax = plt.gca()
epoch_positions = np.arange(0, ITERS)
for split_metrics, split_colour, split_label in (
    (train_metrics, '#1f77b4', "Training"),
    (val_metrics, '#d62728', "Validation"),
    (test_metrics, '#2ca02c', "Test"),
):
    ax.plot(epoch_positions, split_metrics.T[0], c=split_colour, label=split_label)
ax.legend()
# Tick labels show 1-based epoch numbers over the 0-based positions.
plt.xticks(epoch_positions, epoch_positions + 1)
plt.title("Loss")
plt.tight_layout()

In [None]:
# Accuracy per epoch for the three splits (column 1 of each metrics array).
ax = plt.gca()
epoch_positions = np.arange(0, ITERS)
for split_metrics, split_colour, split_label in (
    (train_metrics, '#1f77b4', "Training"),
    (val_metrics, '#d62728', "Validation"),
    (test_metrics, '#2ca02c', "Test"),
):
    ax.plot(epoch_positions, split_metrics.T[1], c=split_colour, label=split_label)
ax.legend()
# Tick labels show 1-based epoch numbers over the 0-based positions.
plt.xticks(epoch_positions, epoch_positions + 1)
plt.title("Accuracy")
plt.tight_layout()

In [None]:
# Raw sigmoid outputs for the test documents, flattened to one score per sample.
true_preds = model.predict(X_test).flatten()
# Round the scores to hard 0/1 labels.
y_preds = true_preds.round()
# from_predictions takes (y_true, y_pred) in that order. Passing them swapped
# transposes the matrix and makes normalize='true' normalise over the predictions
# instead of the ground truth, so both are passed by keyword here.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_preds, normalize='true', colorbar=False,
                                        cmap=plt.cm.Blues, display_labels=('reliable', 'unreliable'))