In [None]:
%run ./../data/load-dataset.ipynb
%run ./../various/_epoch-callback.ipynb
%matplotlib inline

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from tensorflow import SparseTensor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout

In [None]:
# --- Experiment configuration -------------------------------------------------
# BINARY: when True, every bag-of-words count is replaced by 1 when the feature
# tensors are built (presence/absence features instead of term counts).
BINARY = False
# ITERS: number of epochs handed to model.fit.
ITERS = 10

# TOP_WORDS: vocabulary cap passed to filter_extremes as keep_n.
TOP_WORDS = None
# NOTE(review): `dictionary` is not defined in this notebook; it presumably comes
# from the %run'd load-dataset notebook and looks like a gensim Dictionary - confirm.
# `no_above` is a fraction of the corpus, not an absolute count, so 1.0 is the
# explicit "never drop a token for being too frequent" value. Combined with
# no_below=1, keep_n is the only filter that can actually remove tokens here.
dictionary.filter_extremes(no_below=1, no_above=1.0, keep_n=TOP_WORDS)

# One seed for the whole notebook; it is also reused as random_state for the
# train/test split further down.
RANDOM_SEED = 0
tf.keras.utils.set_random_seed(RANDOM_SEED)

In [None]:
def _bow_to_sparse(samples, vocab_size, binary=False):
    """Pack bag-of-words rows into one 2-D SparseTensor.

    Args:
        samples: sequence of rows, each an iterable of (column, count) pairs.
        vocab_size: number of columns of the resulting tensor.
        binary: when True, store 1 for every listed pair instead of its count.

    Returns:
        A SparseTensor with dense_shape (len(samples), vocab_size), int64
        indices and int32 values.
    """
    # Keyed by (row, col) so a repeated pair keeps only its last value.
    cells = {(row, col): (1 if binary else count)
             for row, sample in enumerate(samples)
             for col, count in sample}
    # reshape(-1, 2) keeps the index matrix rank-2 even when there are no entries.
    indices = np.array(list(cells.keys()), dtype=np.int64).reshape(-1, 2)
    # int32 matches the dtype declared on the model's Input layer.
    # NOTE(review): assumes counts are integers (as produced by doc2bow) - confirm.
    values = np.array(list(cells.values()), dtype=np.int32)
    return SparseTensor(indices=indices, values=values,
                        dense_shape=(len(samples), vocab_size))


# Pull the pre-processed documents and their labels out of the frame loaded by
# an earlier cell (df, proc_doc_col and label_col are defined outside this notebook).
corpus, labels = df[[proc_doc_col, label_col]].T.values
X = [dictionary.doc2bow(doc) for doc in corpus]
y = labels.astype(int)

# Drop documents whose bag-of-words is empty, keeping samples and labels aligned.
non_empty = [(sample, label) for sample, label in zip(X, y) if len(sample) > 0]
if not non_empty:
    raise ValueError("Every document produced an empty bag-of-words; nothing to train on.")
X, y = zip(*non_empty)

train_samples, test_samples, train_labels, test_labels = train_test_split(X, y, random_state=RANDOM_SEED)

# Both splits share the same column space: one column per dictionary entry.
vocab_size = len(dictionary)
train_samples = _bow_to_sparse(train_samples, vocab_size, binary=BINARY)
test_samples = _bow_to_sparse(test_samples, vocab_size, binary=BINARY)

train_labels, test_labels = tf.constant(train_labels), tf.constant(test_labels)

In [None]:
# Single-hidden-layer binary classifier over sparse bag-of-words input.
model = Sequential([
    # Shape is given as a 1-tuple: a bare int is only tolerated by some Keras
    # versions, while a tuple is accepted everywhere.
    Input(shape=(len(dictionary),), sparse=True, dtype=tf.int32),
    Dense(128, activation=tf.nn.relu),
    # NOTE(review): Keras' Dropout rate is the fraction of units *dropped*, so 0.9
    # discards 90% of the hidden activations during training - confirm this is
    # intended and not a keep-probability.
    Dropout(0.9),
    # One sigmoid unit: the output is read as the probability of label 1.
    Dense(1, activation=tf.nn.sigmoid)
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

# Two callbacks that call model.evaluate on the full training set and on the test
# set respectively (KerasEpochCallback is defined in a %run'd notebook;
# presumably end_func fires at the end of every epoch - confirm).
training_set_hist = KerasEpochCallback(end_func=model.evaluate,
                                       end_args=(train_samples, train_labels),
                                       end_kwargs={'verbose': False})
test_set_hist = KerasEpochCallback(end_func=model.evaluate,
                                   end_args=(test_samples, test_labels),
                                   end_kwargs={'verbose': False})
model.summary()

In [None]:
# Fit the model; both evaluation callbacks run alongside training.
training_hist = model.fit(
    train_samples, train_labels,
    epochs=ITERS, batch_size=32,
    callbacks=[training_set_hist, test_set_hist],
)

# One (loss, accuracy) row per epoch from each of the three sources.
fit_history = training_hist.history
training_metrics = np.array(list(zip(fit_history['loss'], fit_history['binary_accuracy'])))
training_set_metrics = np.array(training_set_hist.end_results)
test_set_metrics = np.array(test_set_hist.end_results)

loss_columns = ['Training Loss', 'Training Set Loss', 'Test Set Loss']
accuracy_columns = ['Training Accuracy', 'Training Set Accuracy', 'Test Set Accuracy']
metric_columns = ['Training Loss', 'Training Accuracy',
                  'Training Set Loss', 'Training Set Accuracy',
                  'Test Set Loss', 'Test Set Accuracy']

# Flatten the three per-epoch pairs into a single record per epoch.
results = []
for (fit_loss, fit_acc), (train_loss, train_acc), (test_loss, test_acc) in zip(
        training_metrics, training_set_metrics, test_set_metrics):
    epoch_values = (fit_loss, fit_acc, train_loss, train_acc, test_loss, test_acc)
    results.append(dict(zip(metric_columns, epoch_values)))

results_df = pd.DataFrame(results)
results_df.index += 1  # show epochs as 1-based

# Per column: best epoch in green, worst in red (high accuracy / low loss is best).
(
    results_df[metric_columns].style
    .highlight_min(subset=accuracy_columns, color='lightcoral')
    .highlight_max(subset=accuracy_columns, color='lightgreen')
    .highlight_min(subset=loss_columns, color='lightgreen')
    .highlight_max(subset=loss_columns, color='lightcoral')
)

In [None]:
# Loss per epoch from three sources: model.fit's own history (training_metrics)
# and the two evaluation callbacks (training_set_metrics / test_set_metrics).
# Column 0 of each array is the loss.
loss_curves = (
    ("During Training", '#1f77b4', training_metrics),
    ("On Training Set", '#d62728', training_set_metrics),
    ("On Test Set", '#2ca02c', test_set_metrics),
)

fig, ax = plt.subplots()
for curve_label, curve_colour, metrics in loss_curves:
    losses = metrics.T[0]
    # x positions follow the recorded data rather than the ITERS constant, so the
    # plot still works if ITERS was edited after training.
    ax.plot(np.arange(1, len(losses) + 1), losses, c=curve_colour, label=curve_label)

ax.set_xticks(np.arange(1, len(training_metrics) + 1))
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Loss")
ax.legend()
fig.tight_layout()

In [None]:
# Accuracy per epoch from three sources: model.fit's own history
# (training_metrics) and the two evaluation callbacks (training_set_metrics /
# test_set_metrics). Column 1 of each array is the accuracy.
accuracy_curves = (
    ("During Training", '#1f77b4', training_metrics),
    ("On Training Set", '#d62728', training_set_metrics),
    ("On Test Set", '#2ca02c', test_set_metrics),
)

fig, ax = plt.subplots()
for curve_label, curve_colour, metrics in accuracy_curves:
    accuracies = metrics.T[1]
    # x positions follow the recorded data rather than the ITERS constant, so the
    # plot still works if ITERS was edited after training.
    ax.plot(np.arange(1, len(accuracies) + 1), accuracies, c=curve_colour, label=curve_label)

ax.set_xticks(np.arange(1, len(training_metrics) + 1))
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy")
ax.legend()
fig.tight_layout()

In [None]:
# Raw sigmoid outputs for the test set, flattened to one probability per sample.
true_preds = model.predict(test_samples).flatten()
# Hard 0/1 decisions: strictly above 0.5 counts as class 1.
test_preds = (true_preds > 0.5).astype(int)

# from_predictions expects (y_true, y_pred) in that order; with normalize='true'
# each row (true class) sums to 1, so the order decides what the cells mean.
# labels=[0, 1] pins both classes so the two display labels always line up, even
# if one class never appears in this split.
# NOTE(review): assumes label 0 means 'reliable' and 1 'unreliable' - confirm
# against the dataset's label encoding.
ConfusionMatrixDisplay.from_predictions(np.asarray(test_labels), test_preds,
                                        labels=[0, 1], normalize='true', colorbar=False,
                                        cmap=plt.cm.Blues, display_labels=('reliable', 'unreliable'))