In [None]:
import sys
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import Callback, EarlyStopping

In [None]:
from dvclive.keras import DVCLiveCallback
from dvclive import Live

# Notebook-level DVCLive tracker. The test metrics are logged through this
# object further down, and it is closed with live.end() in the last cell.
live = Live(save_dvc_exp=True)

In [None]:
import pathlib
 
# Show the directory the kernel runs in; the relative UTILS_DIR / DATA_DIR
# paths used below are resolved against it.
print(pathlib.Path.cwd())

In [None]:
from sorcery import (assigned_names, unpack_keys, unpack_attrs,
                     dict_of, print_args, call_with_name,
                     delegate_to_attr, maybe, select_from)

### PARAMS

In [None]:
# --- Locations --------------------------------------------------------------
UTILS_DIR = "./"          # appended to sys.path so the local `utils` package imports
DATA_DIR = "../data/"     # prefix for the pickled inputs and the saved metrics figure

# --- Sequence length and training budget ------------------------------------
MAX_LEN = 20              # length every tokenised sequence is padded/truncated to
EPOCHS = 100              # upper bound on epochs passed to model.fit

# --- EarlyStopping configuration --------------------------------------------
EARLY_STOPPING_MONITOR = "val_loss"
EARLY_STOPPING_MODE = "auto"
EARLY_STOPPING_MIN_DELTA = 0
EARLY_STOPPING_PATIENCE = 5

# # UI friendly ??
# training_params = dict_of(MAX_LEN, EPOCHS, EARLY_STOPPING_MONITOR, EARLY_STOPPING_MIN_DELTA, EARLY_STOPPING_PATIENCE, EARLY_STOPPING_MODE)

### Added by DataScientists

### Added by DataScientists

In [None]:
# Make the local `utils` package importable. The membership check keeps this
# cell idempotent: re-running it no longer appends a duplicate sys.path entry.
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

from utils.plot_target import plot_target

### Added by DataScientists

## TRAINING

In [None]:
def _load_pickle(file_name):
    """Unpickle ``DATA_DIR + file_name`` and return the stored object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read (the bare ``pickle.load(open(...))`` form left
    every handle open until garbage collection).
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file - only point DATA_DIR at artefacts produced by a trusted pipeline.
    with open(DATA_DIR + file_name, 'rb') as handle:
        return pickle.load(handle)


# Features: full set plus the train / test splits.
X = _load_pickle('X.pckl')
X_train = _load_pickle('X_train.pckl')
X_test = _load_pickle('X_test.pckl')

# Targets, split the same way.
y = _load_pickle('y.pckl')
y_train = _load_pickle('y_train.pckl')
y_test = _load_pickle('y_test.pckl')

In [None]:
# Inspect the training inputs as loaded, before they are tokenised below.
X_train

In [None]:
# Load the pickled `counter` object; its length is used as the vocabulary size.
# The `with` block closes the file handle once the object has been read.
# NOTE(review): pickle.load can execute arbitrary code - trusted files only.
with open(DATA_DIR + 'counter.pckl', 'rb') as counter_file:
    counter = pickle.load(counter_file)

In [None]:
# Vocabulary size: passed to Tokenizer(num_words=...) and used as the Embedding
# input dimension. Presumably `counter` holds one entry per distinct token -
# TODO confirm against the preprocessing step that produced counter.pckl.
num_words = len(counter)

In [None]:
# Build the word index from the training inputs only; the test inputs are
# encoded later with this same fitted tokenizer.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

In [None]:
# Encode the training texts as integer sequences, then bring every sequence to
# MAX_LEN (longer ones are cut at the end via truncating="post").
# The encoded sequences get their own name instead of overwriting X_train:
# rebinding X_train made this cell - and the tokenizer-fitting cell above -
# unsafe to re-run, because the raw texts were gone after the first execution.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=MAX_LEN, truncating="post")

In [None]:
# Encode the test texts with the tokenizer fitted on the training data and pad
# them to MAX_LEN, mirroring the training-side preprocessing.
# X_test is left untouched (the sequences get their own name) so the cell
# produces the same result every time it is executed.
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN, truncating="post")

In [None]:
# Callbacks handed to model.fit:
#   * EarlyStopping, configured entirely from the EARLY_STOPPING_* parameters.
#   * DVCLiveCallback, bound to the notebook-level `live` tracker (created with
#     save_dvc_exp=True). Sharing that instance keeps the per-epoch metrics and
#     the test metrics logged later through `live` in one DVCLive run, instead
#     of letting the callback create a second, independent Live object.
my_callbacks = [
    EarlyStopping(
        monitor=EARLY_STOPPING_MONITOR,
        min_delta=EARLY_STOPPING_MIN_DELTA,
        patience=EARLY_STOPPING_PATIENCE,
        mode=EARLY_STOPPING_MODE,
    ),
    DVCLiveCallback(live=live),
]

In [None]:
# Architecture: token embedding -> single LSTM layer -> one sigmoid unit.
model = Sequential([
    Embedding(num_words, 32, input_length=MAX_LEN),
    LSTM(64, dropout=0.2),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# the only extra metric tracked during training and evaluation.
model.compile(
    optimizer="Adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for at most EPOCHS epochs; the callbacks in my_callbacks may stop the
# run earlier.
# NOTE(review): validation_data is the test split, so anything monitored on
# `val_*` is monitored on the same data later reported as the test score -
# consider a separate validation split.
history = model.fit(
    train_padded,
    y_train,
    epochs=EPOCHS,
    validation_data=(test_padded, y_test),
    callbacks=my_callbacks,
)

In [None]:
# Evaluate on the padded test set; `results` is [loss, accuracy], in the order
# given to model.compile.
results = model.evaluate(test_padded, y_test, verbose=0)
print(f"Test Loss: {results[0]:.5f}")
print(f"Test Accuracy: {results[1] * 100:.2f}%")

In [None]:
# Record the final test-set scores in DVCLive; `results` holds the loss first
# and the accuracy second.
for metric_name, metric_value in zip(("test_loss", "test_accuracy"), results):
    live.log_metric(metric_name, metric_value)

In [None]:
# Replacement for the removed `model.predict_classes(test_padded)`.
# The network ends in a single sigmoid unit, so predict() yields one
# probability per sample. argmax over that single column is always 0 (every
# sample would be labelled class 0), so the probabilities are thresholded at
# 0.5 instead - which is what predict_classes did for sigmoid outputs.
predict_y = model.predict(test_padded)
y_pred = (predict_y > 0.5).astype("int32").ravel()

In [None]:
# Per-class precision / recall / F1 of the predictions on the test set.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Training curves: one panel per metric, train vs. validation per epoch.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
ax = ax.ravel()

for panel, met in zip(ax, ['accuracy', 'loss']):
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title('Model {}'.format(met))
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])

# Save once, after both panels are drawn. The save call used to sit inside the
# loop, writing a half-finished figure first and then overwriting it.
fig.savefig(DATA_DIR + "model_metrics.png")

In [None]:
# Close the notebook-level DVCLive tracker now that all metrics are logged.
live.end()