In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow_hub as hub
import keras
import tensorflow as tf
import keras
from keras.metrics import categorical_accuracy
from keras import layers
import tensorflow_datasets as tfds

from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, balanced_accuracy_score
import datetime

In [None]:
# Read the parquet file into `df`; the path is relative to the notebook's
# working directory.
df = pd.read_parquet(path='../data/train_pos.parquet')

In [None]:
# Text-vectorization layer capped at 20,000 vocabulary entries, with its
# vocabulary fitted on the `from_tokens` column.
# NOTE(review): this is adapted before rows with an empty `from_tokens` are
# dropped, and the model defined in this notebook does not reference
# `encoder` -- confirm whether it is still needed.
encoder = tf.keras.layers.TextVectorization(max_tokens=20_000)
encoder.adapt(df['from_tokens'].values)

In [None]:
# Display the size of the vocabulary held by the adapted encoder.
encoder.vocabulary_size()

In [None]:
# Keep only the rows whose `from_tokens` value is not the empty string.
df = df.loc[df['from_tokens'] != '']

In [None]:
# Display (rows, columns) of the frame after the empty-string filter.
df.shape

In [None]:
# Inputs come from the `clean` column, targets from `user_suggestion`.
# NOTE(review): rows are filtered on `from_tokens`, not on `clean` -- confirm
# that `clean` cannot be empty where `from_tokens` is non-empty.
X, y = df['clean'].values, df['user_suggestion'].values

In [None]:
# Hold out 10% of the rows, stratified on the target, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=42,
)

In [None]:
# Classifier stack: a TF-Hub layer that takes raw strings (dtype=tf.string,
# scalar input shape) with a declared output shape of [50], then a 128-unit
# ReLU layer and a single sigmoid output unit.
model = keras.Sequential()
model.add(
    hub.KerasLayer(
        "https://tfhub.dev/google/nnlm-en-dim50/2",
        dtype=tf.string,
        input_shape=[],
        output_shape=[50],
    )
)
model.add(keras.layers.Dense(128, activation='relu'))
# Disabled layers, kept for reference:
# model.add(tf.keras.layers.Dropout(0.4))
# model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Flag the hub layer (first layer of the stack) as trainable.
model.layers[0].trainable = True

In [None]:
# Binary cross-entropy computed on probabilities (from_logits=False; the
# model's last layer uses a sigmoid activation), Adam with learning rate 1e-4,
# and accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

In [None]:
# Stop training when validation accuracy has not improved by at least 0.005
# for 5 consecutive epochs. The monitored quantity is the validation metric
# (not the training 'accuracy'): training accuracy typically keeps rising
# while the model overfits, so it is a poor stopping signal, and the other two
# callbacks below already watch validation metrics.
callback_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    min_delta=0.005,
)

# Multiply the learning rate by 0.80 whenever validation accuracy fails to
# improve by at least 0.01 for 2 consecutive epochs; no cooldown, no lower
# bound on the rate.
learning_drop = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_accuracy",
    factor=0.80,
    patience=2,
    verbose=0,
    mode="auto",
    min_delta=0.01,
    cooldown=0,
    min_lr=0,
)

# Timestamped checkpoint path so each run writes its own .h5 file.
model_dir = "models/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5'

# Save the full model (not only weights) at the end of every epoch in which
# validation loss improves on the best value seen so far.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    model_dir, monitor='val_loss', verbose=0, save_best_only=True,
    save_weights_only=False, mode='auto', save_freq='epoch',
    options=None
)



In [None]:
# weights = dict(1/df.outcome.value_counts())
# Train for 2 epochs with batches of 64, running the LR-reduction,
# early-stopping and checkpoint callbacks after each epoch.
# NOTE(review): the held-out split is passed as validation data, so it also
# drives checkpoint selection -- metrics computed on it afterwards are likely
# optimistic; consider a separate validation split.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=2,
    validation_data=(X_test, y_test),
    callbacks=[learning_drop, callback_stop, checkpoint_cb],
)

In [None]:
# Reload the checkpoint written by this run's ModelCheckpoint callback.
# `model_dir` is the timestamped .h5 path built next to the callbacks, so the
# evaluation that follows scores the model that was just trained instead of a
# file left over from an earlier session (a hard-coded timestamped filename
# is missing or stale after Restart & Run All).
# hub.KerasLayer is passed through `custom_objects` so the saved hub layer can
# be resolved while deserializing.
model_best = keras.models.load_model(
    model_dir,
    custom_objects={'KerasLayer': hub.KerasLayer},
)

In [None]:
# Threshold the model outputs at 0.5 to get boolean predictions, then drop
# singleton axes from the result.
y_pred = np.squeeze(model_best.predict(X_test) > 0.5)

In [None]:
# Dark theme, then an 8x8-inch figure with a single Axes that receives the
# confusion matrix of held-out labels vs. predictions.
plt.style.use('dark_background')
fig, ax = plt.subplots(figsize=(8, 8))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)

In [None]:
# Text summary of the per-class metrics for the held-out predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Write the confusion-matrix figure to disk with a transparent background.
fig.savefig(fname='confusion.png', transparent=True)

In [None]:
# Display the filtered frame.
# NOTE(review): consider df.head() / df.sample(5) before sharing so the saved
# output stays small and does not expose more rows than needed.
df