In [None]:
%run ./../data/load-dataset.ipynb
%run ./../utils/_callbacks.ipynb
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from tensorflow import SparseTensor
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD

In [None]:
# --- Tunable settings -------------------------------------------------------
ITERS = 10          # number of training epochs passed to model.fit
BINARY = False      # True -> every term count is replaced by 1 during data preparation
TOP_WORDS = None    # forwarded as keep_n to dictionary.filter_extremes
RANDOM_SEED = 0     # shared by the TensorFlow seed and both train_test_split calls

# --- One-off setup ----------------------------------------------------------
# Seed the random number generators before any model or split is created.
tf.keras.utils.set_random_seed(RANDOM_SEED)

# `dictionary` comes from the %run'ed dataset notebook (presumably a gensim
# Dictionary -- confirm there); this prunes its vocabulary in place.
dictionary.filter_extremes(
    no_below=1,
    no_above=1,
    keep_n=TOP_WORDS,
)

In [None]:
def _bow_to_sparse(samples, num_terms):
    """Pack bag-of-words samples into a 2-D ``SparseTensor``.

    Parameters
    ----------
    samples : sequence of sequences of ``(col, val)`` pairs
        One inner sequence per row of the resulting tensor.
    num_terms : int
        Number of columns of the resulting tensor.

    Returns
    -------
    SparseTensor
        Tensor with ``dense_shape == (len(samples), num_terms)`` holding
        ``val`` at position ``(row, col)`` for every pair in ``samples[row]``.
    """
    # Keyed by (row, col): a repeated column within a row keeps its last value.
    entries = {(row, col): val for row, sample in enumerate(samples) for (col, val) in sample}
    return SparseTensor(indices=list(entries.keys()), values=list(entries.values()),
                        dense_shape=(len(samples), num_terms))


# Documents and labels come from the dataframe loaded by the %run'ed notebook.
corpus, labels = df[[proc_doc_col, label_col]].T.values
X = [dictionary.doc2bow(doc) for doc in corpus]
y = labels.astype(int)

# Drop documents whose bag-of-words is empty (no term left in the dictionary).
non_empty = [(sample, label) for sample, label in zip(X, y) if len(sample) > 0]
if not non_empty:
    raise ValueError("Every document has an empty bag-of-words; nothing to train on.")
X, y = zip(*non_empty)

if BINARY:
    # Presence/absence features: replace every count by 1.
    X = [[(col, 1) for col, val in sample] for sample in X]

# 75/25 train/test split, then 70/30 train/validation split of the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=RANDOM_SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.3, random_state=RANDOM_SEED)

# One sparse matrix per split; columns are indexed by dictionary term id.
X_train = _bow_to_sparse(X_train, len(dictionary))
X_val = _bow_to_sparse(X_val, len(dictionary))
X_test = _bow_to_sparse(X_test, len(dictionary))

y_train, y_val, y_test = tf.constant(y_train), tf.constant(y_val), tf.constant(y_test)

In [None]:
# Small feed-forward binary classifier over sparse bag-of-words rows:
# one hidden ReLU layer with dropout, then a single sigmoid output unit.
model = Sequential([
    # Keras documents `shape` as a tuple (batch dimension excluded); a bare
    # int is only tolerated by some versions, so pass a 1-tuple explicitly.
    Input(shape=(len(dictionary),), sparse=True, dtype=tf.int32),
    Dense(8, activation=tf.nn.relu),
    Dropout(0.5),
    Dense(1, activation=tf.nn.sigmoid)
])
# Plain SGD with learning rate 1e-3.
optimizer = SGD(1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['binary_accuracy'])
model.summary()

In [None]:
# KerasEpochCallback is defined in the %run'ed callbacks notebook; presumably it
# calls `end_func` at the end of each epoch and stores the return values in
# `end_results` -- confirm there.
test_hist = KerasEpochCallback(
    end_func=model.evaluate,
    end_args=(X_test, y_test),
    end_kwargs={'verbose': False},
)
train_hist = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=ITERS,
    batch_size=16,
    shuffle=True,
    callbacks=[test_hist],
)

# One (loss, accuracy) row per epoch for each split.
train_metrics = np.array(list(zip(train_hist.history['loss'],
                                  train_hist.history['binary_accuracy'])))
val_metrics = np.array(list(zip(train_hist.history['val_loss'],
                                train_hist.history['val_binary_accuracy'])))
test_metrics = np.array(test_hist.end_results)

# One record per epoch, combining the three splits side by side.
results = [
    {
        "Training Loss": train_loss,
        "Training Accuracy": train_acc,
        "Validation Loss": val_loss,
        "Validation Accuracy": val_acc,
        "Test Loss": test_loss,
        "Test Accuracy": test_acc,
    }
    for (train_loss, train_acc), (val_loss, val_acc), (test_loss, test_acc)
    in zip(train_metrics, val_metrics, test_metrics)
]

results_df = pd.DataFrame(results)
results_df.index = results_df.index + 1  # label rows by 1-based epoch number

# Green marks the best value in each column, red the worst:
# best = highest accuracy / lowest loss.
(
    results_df.style
    .highlight_min(subset=["Training Accuracy", "Validation Accuracy", "Test Accuracy"], color='lightcoral')
    .highlight_max(subset=["Training Accuracy", "Validation Accuracy", "Test Accuracy"], color='lightgreen')
    .highlight_min(subset=["Training Loss", "Validation Loss", "Test Loss"], color='lightgreen')
    .highlight_max(subset=["Training Loss", "Validation Loss", "Test Loss"], color='lightcoral')
)

In [None]:
# Loss per epoch for the training, validation and test splits
# (column 0 of each metrics array).
epoch_positions = np.arange(0, ITERS)
ax = plt.gca()
for split_label, colour, split_metrics in (
    ("Training", '#1f77b4', train_metrics),
    ("Validation", '#d62728', val_metrics),
    ("Test", '#2ca02c', test_metrics),
):
    ax.plot(epoch_positions, split_metrics.T[0], c=colour, label=split_label)
ax.legend()
# Tick positions are 0-based; show them as 1-based epoch numbers.
ax.set_xticks(epoch_positions)
ax.set_xticklabels(np.arange(1, ITERS + 1))
ax.set_title("Loss")
plt.tight_layout()

In [None]:
# Accuracy per epoch for the training, validation and test splits
# (column 1 of each metrics array).
epoch_positions = np.arange(0, ITERS)
ax = plt.gca()
for split_label, colour, split_metrics in (
    ("Training", '#1f77b4', train_metrics),
    ("Validation", '#d62728', val_metrics),
    ("Test", '#2ca02c', test_metrics),
):
    ax.plot(epoch_positions, split_metrics.T[1], c=colour, label=split_label)
ax.legend()
# Tick positions are 0-based; show them as 1-based epoch numbers.
ax.set_xticks(epoch_positions)
ax.set_xticklabels(np.arange(1, ITERS + 1))
ax.set_title("Accuracy")
plt.tight_layout()

In [None]:
# Raw model outputs for the test split, flattened to one value per sample.
true_preds = model.predict(X_test).flatten()
# Round each output to the nearest integer to obtain hard class predictions.
y_preds = true_preds.round()
# from_predictions takes (y_true, y_pred) in that order; with normalize='true'
# each row (true class) sums to 1, so the diagonal shows per-class recall.
# NOTE(review): display_labels assumes label 0 is "reliable" and 1 is
# "unreliable" -- confirm against the dataset notebook.
ConfusionMatrixDisplay.from_predictions(y_test, y_preds, normalize='true', colorbar=False,
                                        cmap=plt.cm.Blues, display_labels=("reliable", "unreliable"))