# Get TWSS data

In [None]:
!mkdir -p /opt/data/
!wget -q https://github.com/tansaku/twss/archive/refs/heads/master.zip -O /opt/data/twss.zip
!unzip -q /opt/data/twss.zip -d /opt/data/

In [None]:
# List the extracted dataset files to confirm the archive was unpacked.
!ls /opt/data/twss-master/data/

fml.txt		     svm_model.pk	  twssSample1.txt     usaquotes.txt.pk
fml.txt.pk	     test.pk		  twssSample2.txt     vocab.pk
README.txt	     tfln.onesent.txt	  twssstories.txt     vocab.txt
sentenceSample1.txt  tfln.onesent.txt.pk  twssstories.txt.pk
sentenceSample2.txt  train.pk		  usaquotes.txt


# Organize the files into 'pos' and 'neg' directories

In [None]:
!mkdir -p /opt/data/pos/
!mkdir -p /opt/data/neg/
!cp /opt/data/twss-master/data/twssstories.txt /opt/data/pos/
!cp /opt/data/twss-master/data/fml.txt /opt/data/neg/
!cp /opt/data/twss-master/data/tfln.onesent.txt /opt/data/neg/
!cp /opt/data/twss-master/data/usaquotes.txt /opt/data/neg/
!ls /opt/data/pos/
!ls /opt/data/neg/

twssstories.txt
fml.txt  tfln.onesent.txt  usaquotes.txt


# Read data as samples and targets

In [None]:
import os
import numpy as np

TWSS_DIR = '/opt/data/'

# In balanced mode, only the first NEG_LINES_PER_FILE lines of each 'neg' file
# are labelled 0; the rest of the file gets the temporary label 2.
NEG_LINES_PER_FILE = 669

labels = []
texts = []

# Keep is_balanced = True for every preprocessing variant except the one that
# trains and tests on the imbalanced data. When True, surplus negatives are
# tagged with label 2 so later cells can leave them out or add them back.
is_balanced = True

for label_type in ['neg', 'pos']:
  dir_name = os.path.join(TWSS_DIR, label_type)
  # Sorted so that the sample order does not depend on the filesystem.
  for fname in sorted(os.listdir(dir_name)):
    if not fname.endswith('.txt'):
      continue
    # The context manager closes the file even if reading raises.
    with open(os.path.join(dir_name, fname), encoding='latin1') as f:
      for i, line in enumerate(f):
        texts.append(line.rstrip())
        if label_type == 'pos':
          labels.append(1)
        elif is_balanced and i >= NEG_LINES_PER_FILE:
          labels.append(2)
        else:
          labels.append(0)

In [None]:
# Sanity check: texts and labels must contain the same number of entries.
print("Total Samples:")
print(f"Balanced: {is_balanced}")
for collection in (texts, labels):
  print(len(collection))

Total Samples:
Balanced: True
19863
19863


# Pick one of the class-balance options below:

# Preprocessing with balanced training data

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 100
MAX_WORDS = 15000
CLASS_BALANCE = 'Balanced'

# Fit the vocabulary on every text, then encode each text as a sequence of
# word indices limited by MAX_WORDS.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

all_data = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
all_labels = np.asarray(labels)
if all_labels.shape[0] != all_data.shape[0]:
  raise ValueError(
      f"labels ({all_labels.shape[0]}) and texts ({all_data.shape[0]}) are out of sync; "
      "re-run the cell that reads the data")

# Keep only samples labelled 0 or 1; label 2 marks the surplus negatives that
# the balanced setup leaves out.
data_indices = np.flatnonzero((all_labels == 0) | (all_labels == 1))
data = all_data[data_indices]
# The selection is stored under its own name: rebinding the global `labels`
# would desynchronise it from `texts` and corrupt any re-run of a
# preprocessing cell.
data_labels = all_labels[data_indices]

# Shuffle data and labels with the same permutation
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
data_labels = data_labels[indices]

# Train-test split
# NOTE: despite its name, TEST_SPLIT is the fraction used for *training*.
TEST_SPLIT = 0.8
N = round(len(data) * TEST_SPLIT)

train_X = data[:N]
train_Y = data_labels[:N]
test_X = data[N:]
test_Y = data_labels[N:]

# Verify the shape of the data
print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)

(3227, 100)
(3227,)
(807, 100)
(807,)


In [None]:
# Report how many negative (0) and positive (1) labels each split holds,
# followed by the size of the tokenizer vocabulary.
for split_name, split_labels in (("training", train_Y), ("test", test_Y)):
  print(f"Neg {split_name} samples count:", np.count_nonzero(split_labels == 0))
  print(f"Pos {split_name} samples count:", np.count_nonzero(split_labels == 1))

print(f"Number of tokens:  {len(word_index)}")

Neg training samples count: 1617
Pos training samples count: 1610
Neg test samples count: 390
Pos test samples count: 417
Number of tokens:  20758


# Preprocessing with imbalanced training data but balanced test data

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 100
MAX_WORDS = 15000
CLASS_BALANCE = 'Only Test Balanced'

# Fit the vocabulary on every text, then encode each text as a sequence of
# word indices limited by MAX_WORDS.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

all_data = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
all_labels = np.asarray(labels)
if all_labels.shape[0] != all_data.shape[0]:
  raise ValueError(
      f"labels ({all_labels.shape[0]}) and texts ({all_data.shape[0]}) are out of sync; "
      "re-run the cell that reads the data")

# Labels 0/1 form the balanced pool that is split into train and test;
# label 2 marks the surplus negatives that are added to training only.
data_indices = np.flatnonzero((all_labels == 0) | (all_labels == 1))
extra_data_indices = np.flatnonzero(all_labels == 2)

data = all_data[data_indices]
extra_data = all_data[extra_data_indices]
print("Extra Data Length:", extra_data.shape)

# The selection is stored under its own name: rebinding the global `labels`
# would desynchronise it from `texts` and corrupt any re-run of a
# preprocessing cell.
data_labels = all_labels[data_indices]
# Every surplus sample is a negative, so the temp label '2' becomes '0'.
extra_labels = np.zeros_like(all_labels[extra_data_indices])
print("Extra Data Labels Length:", extra_labels.shape)

# Shuffle data and labels with the same permutation
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
data_labels = data_labels[indices]

# Train-test split
TRAIN_SPLIT = 0.9
N = round(TRAIN_SPLIT * data.shape[0])

train_X = data[:N, :]
train_Y = data_labels[:N]
test_X = data[N:, :]
test_Y = data_labels[N:]

# Optional: pull the last 10% of the training pool as a validation set
# N = round(TRAIN_SPLIT * train_X.shape[0])
# val_X = train_X[N:, :]
# val_Y = train_Y[N:]
# train_X = train_X[:N, :]
# train_Y = train_Y[:N]

# Verify the shape of the balanced pool split
print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)
# print(val_X.shape)
# print(val_Y.shape)

# Add the surplus negatives to the training set only; the test set stays balanced.
train_X = np.concatenate((train_X, extra_data))
train_Y = np.concatenate((train_Y, extra_labels))

print(train_X.shape)
print(train_Y.shape)

Extra Data Length: (15829, 100)
Extra Data Labels Length: (15829,)
(3631, 100)
(3631,)
(403, 100)
(403,)
(19460, 100)
(19460,)


In [None]:
# Class counts per split (0 = negative, 1 = positive), then the vocabulary size.
neg_train, pos_train = (np.count_nonzero(train_Y == cls) for cls in (0, 1))
neg_test, pos_test = (np.count_nonzero(test_Y == cls) for cls in (0, 1))

print(f"Neg training samples count: {neg_train}")
print(f"Pos training samples count: {pos_train}")

print(f"Neg test samples count: {neg_test}")
print(f"Pos test samples count: {pos_test}")

# print("Neg val samples count:", np.count_nonzero(val_Y ==0))
# print("Pos val samples count:", np.count_nonzero(val_Y==1))

print(f"Number of tokens:  {len(word_index)}")

Neg training samples count: 17639
Pos training samples count: 1821
Neg test samples count: 197
Pos test samples count: 206
Number of tokens:  20758


# Preprocessing with imbalanced training data

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 100
MAX_WORDS = 15000
CLASS_BALANCE = 'Imbalanced'

# Fit the vocabulary on every text, then encode each text as a sequence of
# word indices limited by MAX_WORDS.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

data = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
all_labels = np.asarray(labels)
if all_labels.shape[0] != data.shape[0]:
  raise ValueError(
      f"labels ({all_labels.shape[0]}) and texts ({data.shape[0]}) are out of sync; "
      "re-run the cell that reads the data")

# If the data was read with is_balanced = True, the surplus negatives carry the
# temp label 2; fold them back into class 0 so the targets stay binary.
# The result is stored under its own name: rebinding the global `labels`
# would desynchronise it from `texts` and corrupt any re-run of a
# preprocessing cell.
data_labels = np.where(all_labels == 2, 0, all_labels)

# Shuffle data and labels with the same permutation
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
data_labels = data_labels[indices]

# Train-test split
# NOTE: despite its name, TEST_SPLIT is the fraction used for *training*.
TEST_SPLIT = 0.8
N = round(len(data) * TEST_SPLIT)

train_X = data[:N]
train_Y = data_labels[:N]
test_X = data[N:]
test_Y = data_labels[N:]

# Verify the shape of the data
print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)
print(test_Y.shape)

(15890, 100)
(15890,)
(3973, 100)
(3973,)


In [None]:
# Print the negative/positive label counts of both splits and the vocabulary size.
splits = {"training": train_Y, "test": test_Y}
for split_name, split_labels in splits.items():
  for tag, cls in (("Neg", 0), ("Pos", 1)):
    print(f"{tag} {split_name} samples count: {np.count_nonzero(split_labels == cls)}")

print(f"Number of tokens:  {len(word_index)}")

Neg training samples count: 14272
Pos training samples count: 1618
Neg test samples count: 3564
Pos test samples count: 409
Number of tokens:  20758


# Setup GloVe Embeddings

In [None]:
# Get the GloVe embeddings
!mkdir -p /opt/data/
!wget http://nlp.stanford.edu/data/glove.6B.zip -O /opt/data/glove.6B.zip
!unzip /opt/data/glove.6B.zip -d /opt/data/

--2021-05-12 17:58:11--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-05-12 17:58:11--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-05-12 17:58:12--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘/opt/data/glove.6B.zip

In [None]:
# List /opt/data/ to confirm the GloVe files were extracted.
!ls /opt/data/

glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip  pos		 twss.zip
glove.6B.200d.txt  glove.6B.50d.txt   neg	    twss-master


In [None]:
import os

BASE_DIR = '/opt/data/'
EMBEDDING_DIM = 300
# NOTE: despite the name, this is the path of one GloVe text file, not a directory.
GLOVE_DIR = os.path.join(BASE_DIR, f'glove.6B.{EMBEDDING_DIM}d.txt')

# Map each token to its vector. Every line of the file holds a token followed
# by space-separated coefficients.
embeddings_index = {}
# Read as UTF-8 explicitly so parsing does not depend on the machine's locale.
with open(GLOVE_DIR, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# embeddings_index['apple']

# Encode TWSS words into embedding vectors

In [None]:
# Use all the words in the training data (Optional)
# This only resizes the embedding matrix and Embedding layer built afterwards;
# the sequences were already encoded by a Tokenizer created with the previous MAX_WORDS.
# NOTE(review): presumably word_index values start at 1, in which case covering
# every word would need len(word_index) + 1 rows — TODO confirm.
MAX_WORDS = len(word_index)

In [None]:
# Build the (MAX_WORDS, EMBEDDING_DIM) weight table: row i receives the GloVe
# vector of the word whose tokenizer index is i. Words whose index falls outside
# the table, or that have no GloVe entry, are skipped and their rows stay zero.
embedding_matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))
for token, row in word_index.items():
  if row >= MAX_WORDS:
    continue
  glove_vector = embeddings_index.get(token)
  if glove_vector is None:
    continue
  embedding_matrix[row] = glove_vector

embedding_matrix.shape

(15000, 300)

# Define helper functions

In [None]:
def print_results(results, name):
  """Print evaluation metrics under a titled, underlined header.

  Args:
    results: Sequence indexed as [loss, precision, recall, AUC, accuracy].
    name: Label of the evaluated split, used in the header line.
  """
  header = f"{name} Results"
  print(f"\n{header}")
  print('-' * len(header))
  # The loss is not a ratio (it can exceed 1), so print it as a plain number
  # instead of a percentage.
  print(f"Loss: {results[0]:.4f}")
  print(f"Precision: {results[1]:.2%}")
  print(f"Recall: {results[2]:.2%}")
  print(f"AUC: {results[3]:.2%}")
  print(f"Accuracy: {results[4]:.2%}")

# Train the model

In [None]:
# Hyperparameters for the training cell; most of them are also logged to the results sheet.
BATCH_SIZE = 32  # mini-batch size passed to model.fit
EPOCHS = 10  # number of training epochs passed to model.fit
PRETRAINED_GLOVE = True  # load embedding_matrix into the Embedding layer and freeze it
MASKING = True  # passed as mask_zero to the Embedding layer
RNN_TYPE = 'GRU'  # 'None' selects the Flatten + Dense baseline instead of a recurrent layer
NEURAL_UNITS = 128  # units of the recurrent layer
DROPOUT = 0.2  # dropout argument of the recurrent layer
OPTIMIZER = 'Adam'  # recorded in the results sheet; the training cell constructs Adam directly
LEARNING_RATE = 0.001  # learning rate given to the Adam optimizer

In [None]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Dropout
# The AUC metric class is imported under an alias because this cell stores the
# AUC *score* in a variable named AUC (read by the results-sheet cell); without
# the alias the score would shadow the metric class.
from keras.metrics import Precision, Recall, BinaryAccuracy
from keras.metrics import AUC as AUCMetric
from keras.layers import LSTM, GRU, Bidirectional, Masking
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

model = Sequential()

if RNN_TYPE == 'None':
  # Baseline without a recurrent layer. When GloVe weights are loaded below, the
  # embedding width has to match embedding_matrix; otherwise keep the small default.
  baseline_dim = EMBEDDING_DIM if PRETRAINED_GLOVE else 50
  model.add(Embedding(MAX_WORDS, baseline_dim, mask_zero=MASKING, input_length=MAX_LEN))
  model.add(Flatten())
  model.add(Dense(64, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
else:
  # Build the recurrent layer that RNN_TYPE names, so the value logged to the
  # results sheet always describes the model that was actually trained.
  rnn_layers = {'GRU': GRU, 'LSTM': LSTM}
  if RNN_TYPE not in rnn_layers:
    raise ValueError(f"Unsupported RNN_TYPE: {RNN_TYPE!r}")
  model.add(Embedding(MAX_WORDS, EMBEDDING_DIM, mask_zero=MASKING))
  model.add(rnn_layers[RNN_TYPE](NEURAL_UNITS, dropout=DROPOUT))
  # model.add(Bidirectional(GRU(NEURAL_UNITS, dropout=DROPOUT)))
  model.add(Dense(1, activation='sigmoid'))

if PRETRAINED_GLOVE:
  # Load the pretrained vectors and freeze them during training.
  model.layers[0].set_weights([embedding_matrix])
  model.layers[0].trainable = False

model.summary()
# The metric order here fixes the order of the values returned by model.evaluate
# (after the loss), which print_results and the score variables below index into.
model.compile(optimizer=Adam(learning_rate=LEARNING_RATE), loss='binary_crossentropy', metrics=[Precision(), Recall(), AUCMetric(), BinaryAccuracy()])
# Only used by the commented-out fit() variants that pass callbacks=[es].
es = EarlyStopping(monitor='val_loss', mode='auto', verbose=1, patience=3)

# For imbalanced training data but balanced test and validation data
# history = model.fit(train_X, train_Y, epochs=10, batch_size=32, validation_data=(val_X, val_Y))

# Validation split
# history = model.fit(train_X, train_Y, epochs=100, batch_size=32,  callbacks=[es], validation_split=0.1)
# history = model.fit(train_X, train_Y, epochs=10, batch_size=32, validation_split=0.1)

# No validation split
# history = model.fit(train_X, train_Y, epochs=100, batch_size=32,  callbacks=[es])
history = model.fit(train_X, train_Y, epochs=EPOCHS, batch_size=BATCH_SIZE)

# print(model.predict(data[:10,:]))
# print(labels[:10])
results = model.evaluate(test_X, test_Y)
print_results(results, "Test")

# Test scores as percentages, rounded to two decimals, for the results sheet.
PRECISION = round(results[1]*100, 2)
RECALL = round(results[2]*100, 2)
AUC = round(results[3]*100, 2)
ACCURACY = round(results[4]*100, 2)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 50)           750000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)                320064    
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 65        
Total params: 1,070,129
Trainable params: 1,070,129
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Test Results
------------
Loss: 78.46%
Precision: 96.53%
Recall: 81.07%
AUC: 94.73%
Accuracy: 88.83%


In [None]:
# Show which metric series were recorded during training.
print(history.history.keys())

# Save the results in Google Sheets

In [None]:
# Authenticate the Colab user before talking to Google Sheets.
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

# gspread client authorized with the application-default Google credentials.
gc = gspread.authorize(GoogleCredentials.get_application_default())

In [None]:
# Create the results spreadsheet.
# NOTE(review): the call is unconditional; presumably each run creates another
# spreadsheet with the same title — confirm before re-running this cell.
sh = gc.create('TWSS Results')

In [None]:
# Log this run's hyperparameters and test scores as one spreadsheet row.
row_list = [
    MAX_WORDS, EMBEDDING_DIM, PRETRAINED_GLOVE, MASKING, RNN_TYPE,
    NEURAL_UNITS, DROPOUT, CLASS_BALANCE, OPTIMIZER, LEARNING_RATE,
    BATCH_SIZE, PRECISION, RECALL, AUC, ACCURACY,
]
worksheet = gc.open('TWSS Results').sheet1

# The response is kept because the save cell names its files after model_entry['updatedRange'].
model_entry = worksheet.insert_row(row_list, 10)

# Visualize test evaluation

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Score the test split and compute precision/recall pairs for the curve.
preds_Y = model.predict(test_X)
precision, recall, thresholds = precision_recall_curve(test_Y, preds_Y)

# Draw the curve with labelled axes and a legend.
plt.plot(recall, precision, marker='.', label='RNN Word Embedding')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()

# Write the figure to disk, then display it.
plt.savefig("best_model_pr-rc.png", dpi=256, format='png')
plt.show()

In [None]:
import matplotlib.pyplot as plt

hist_keys = list(history.history.keys())

def plot_acc_loss(history):
  """Plot per-epoch accuracy and loss, with validation curves when recorded.

  Args:
    history: Object whose ``history`` attribute maps metric names to lists of
      per-epoch values (as returned by ``model.fit``).

  Raises:
    KeyError: If no accuracy series is present in the history.
  """
  history_dict = history.history

  # Find the accuracy series by name. A positional lookup is fragile: the key
  # order follows the compiled metrics, so a fixed index can silently select a
  # different metric (or not exist at all when no validation data was used).
  acc_key = next(
      (key for key in history_dict if 'acc' in key and not key.startswith('val_')),
      None)
  if acc_key is None:
    raise KeyError(f"no accuracy metric in history; available keys: {list(history_dict)}")

  acc = history_dict[acc_key]
  loss = history_dict['loss']
  # These are None when training ran without validation data.
  val_acc = history_dict.get(f'val_{acc_key}')
  val_loss = history_dict.get('val_loss')

  epochs = range(1, len(acc) + 1)

  plt.plot(epochs, acc, 'bo', label='Accuracy')
  if val_acc is not None:
    plt.plot(epochs, val_acc, 'r', label='Validation Accuracy')
  plt.title('Training and validation accuracy')
  plt.legend()

  # plt.savefig("best_model_acc.png", dpi=144, format='png')
  plt.figure()

  plt.plot(epochs, loss, 'bo', label='Training loss')
  if val_loss is not None:
    plt.plot(epochs, val_loss, 'r', label='Validation loss')
  plt.title('Training and validation loss')
  plt.legend()

  # plt.savefig("best_model_loss.png", dpi=144, format='png')
  plt.show()

plot_acc_loss(history)

In [None]:
# Display the shape of the current test feature matrix.
test_X.shape

# Mount Google Drive and setup paths

In [None]:
# Mount Google Drive at /content/drive; the model and test-data paths below live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pickle
import os

# Saved models and their pickled test splits are kept in sibling folders
# under /content/drive/MyDrive/TWSS.
TWSS_MODELS_DIR, TWSS_TEST_DATA_DIR = (
    os.path.join('/content/drive', 'MyDrive', 'TWSS', leaf)
    for leaf in ('models', 'test_data')
)

# Save the model and test data

In [None]:
# Name both artifacts after the sheet range written for this run, with '!'
# replaced by '-' (this yields stems of the form 'Sheet1-A3:O3').
run_id = model_entry['updatedRange'].replace('!', '-')
TWSS_MODEL_FILENAME = run_id + '.h5'
TWSS_MODEL_PATH = os.path.join(TWSS_MODELS_DIR, TWSS_MODEL_FILENAME)
TWSS_TEST_DATA_FILENAME = run_id + '.pkl'
TWSS_TEST_DATA_PATH = os.path.join(TWSS_TEST_DATA_DIR, TWSS_TEST_DATA_FILENAME)

# Create the target folders on first use so saving does not fail on a fresh Drive.
os.makedirs(TWSS_MODELS_DIR, exist_ok=True)
os.makedirs(TWSS_TEST_DATA_DIR, exist_ok=True)

model.save(TWSS_MODEL_PATH)

# Keep the exact test split next to the model so it can be re-evaluated later.
test_data = {
    'test_X': test_X,
    'test_Y': test_Y
}

with open(TWSS_TEST_DATA_PATH, 'wb') as f:
  pickle.dump(test_data, f)

# Load the model

In [None]:
from keras import models

# Saved runs to compare. Each key is the file stem a run was stored under and
# each value is the label shown for it; insertion order is the iteration order.
best_model_names = {
    "Sheet1-A3:O3": "GloVe w/ GRU (Train/Test Balanced)",
    "Sheet1-A4:O4": "GloVe w/ GRU (Only Test Balanced)",
    "Sheet1-A2:O2": "GloVe w/ GRU (Train/Test Imbalanced)",
    "Sheet1-A6:O6": "No GloVe w/ GRU (Train/Test Imbalanced)",
    "Sheet1-A5:O5": "No GloVe w/o GRU (Train/Test Imbalanced)",
    # 'Sheet1-A7:O7': 'Balanced No Glove RNN',
    # 'Sheet1-A8:O8': 'Balanced No Glove No RNN'
    # 'Sheet1-A9:O9': 'Val/Test No Glove',
    # 'Sheet1-A10:O10': 'Val/Test No Glove No RNN'
}

In [None]:
# Move the Times New Roman font file into the system TrueType font directory.
# NOTE(review): times-new-roman.ttf must already be in the working directory, and
# mv will fail on a re-run once the file has been moved — confirm this is intended.
!mv times-new-roman.ttf /usr/share/fonts/truetype/

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.font_manager as fm

# Font used for the tick labels and axis labels (size 14) and the legend (size 13).
path = '/usr/share/fonts/truetype/times-new-roman.ttf'
fontprop1 = fm.FontProperties(fname=path, size=14)
fontprop2 = fm.FontProperties(fname=path, size=13)

# Draw one precision-recall curve per saved run, each evaluated on the test
# split that was pickled alongside its model.
for model_name, curve_label in best_model_names.items():
  twss_model_path = os.path.join(TWSS_MODELS_DIR, f'{model_name}.h5')
  twss_model = models.load_model(twss_model_path)
  twss_test_data_path = os.path.join(TWSS_TEST_DATA_DIR, f'{model_name}.pkl')

  # NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
  with open(twss_test_data_path, 'rb') as f:
    test_data = pickle.load(f)

  preds_Y = twss_model.predict(test_data['test_X'])
  precision, recall, thresholds = precision_recall_curve(test_data['test_Y'], preds_Y)
  plt.plot(recall, precision, linestyle='solid', label=curve_label)

# plt.title(f'Precision-Recall Curve')

plt.grid(linestyle='--', color='lightgray')
# Precision and recall both span [0, 1]. np.arange(0, 1, 0.1) stops at 0.9 and
# drops the 1.0 tick, so generate the 11 ticks 0.0, 0.1, ..., 1.0 explicitly.
axis_ticks = np.linspace(0, 1, 11)
plt.xticks(axis_ticks, fontproperties=fontprop1)
plt.yticks(axis_ticks, fontproperties=fontprop1)
# axis labels
plt.xlabel('Recall', fontproperties=fontprop1)
plt.ylabel('Precision', fontproperties=fontprop1)
# show the legend
plt.legend(prop=fontprop2)
plt.savefig("precision_recall.png", dpi=256, format='png')
# show the plot
plt.show()

# Test the model

In [None]:
# Reload the run saved as 'Sheet1-A3:O3.h5' and make it the active `model`
# for the cells below.
twss_model_path = os.path.join(TWSS_MODELS_DIR, 'Sheet1-A3:O3.h5')
model = models.load_model(twss_model_path)

In [None]:
from collections import OrderedDict

# Hand-written probe sentences mapped to their expected label
# (1 = positive class, 0 = negative class); used as inputs and targets for
# model.evaluate / model.predict below.
sample_twss = OrderedDict({
    "Wow! I cannot believe it. This is much bigger than I thought it would be.": 1,
    "He’s under a lot of pressure, which builds up until he’s ready to explode. It’s my job to release that pressure.": 1,
    "Please don't make it harder than it has to be.": 1,
    "I was trying all night yesterday, but I couldn't get it in.": 1,
    "I fear the people will not quietly submit to those restraints which are necessary for the peace and security of the community.": 0,
    "I am going to go home tonight and find a place to put this.": 1,
    "I will be completely honest with you, I wanted this so bad I could taste it.": 1,
    "Thank You! This is a true honour and I'm glad that I came.": 1,
    "I love eating apples, bananas and mango.": 0,
    "I love eating bananas.": 1,
    "I like banana trees.": 0,
    "I love reading books.": 0,
    "I spilled water all over the floor.": 0,
    "You really think you can go all day long.": 1,
    "Well, you always left me satisfied and smiling.": 1,
    "Well, you always left me angry.": 0,
    "Why did you get it so big?": 1,
    "Why did you get it so late?": 0,
    "Does the skin look red and swollen?": 1,
    "That thing looks red and swollen": 1,
    "That thing looks long, red and swollen.": 1,
    "You already did me!": 1,
    "I can't stay on top of you 24/7.": 1,
    "You don't need to be banging that hard.": 1,
    "You don't need to bang that hard.": 1,
    "Don't bang on the door that hard.": 0,
    "Long, hard and fast.": 1,
    "Can you grab my banana for a while?": 1,
    "Don’t you think these buns are a little too big for this meat?": 1,
    "No matter what you have heard, size matters.": 1,
    "No matter what you have heard, grade matters.": 0,
    "Do you want to play with my joystick?": 1,
    "Do you want to grab lunch?": 0,
})

# Encode the probe sentences exactly like the training data. The preprocessing
# cells pad with padding='post'; pad_sequences defaults to 'pre', so the
# argument has to be passed explicitly to give the model the layout it was trained on.
sequences = tokenizer.texts_to_sequences(list(sample_twss.keys()))
twss_data = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')

sample_labels = np.array(list(sample_twss.values()))
sample_results = model.evaluate(twss_data, sample_labels)
print_results(sample_results, "Sample Test")

predictions = model.predict(twss_data)

# Print each sentence with its predicted positive-class probability as a percentage.
for sample, prediction in zip(sample_twss, predictions):
  print(f'\n{sample} => {prediction[0]*100:.2f}%')


Sample Test Results
-------------------
Loss: 282.00%
Precision: 44.44%
Recall: 17.39%
AUC: 29.35%
Accuracy: 27.27%

Wow! I cannot believe it. This is much bigger than I thought it would be. => 0.48%

He’s under a lot of pressure, which builds up until he’s ready to explode. It’s my job to release that pressure. => 98.04%

Please don't make it harder than it has to be. => 0.13%

I was trying all night yesterday, but I couldn't get it in. => 1.09%

I fear the people will not quietly submit to those restraints which are necessary for the peace and security of the community. => 99.83%

I am going to go home tonight and find a place to put this. => 92.56%

I will be completely honest with you, I wanted this so bad I could taste it. => 9.88%

Thank You! This is a true honour and I'm glad that I came. => 71.79%

I love eating apples, bananas and mango. => 57.28%

I love eating bananas. => 44.63%

I like banana trees. => 35.74%

I love reading books. => 77.15%

I spilled water all over the f

# Evaluate extra data (Optional)

In [None]:
# Evaluate the current model on the surplus negatives.
# extra_data / extra_labels are only defined by the 'Only Test Balanced'
# preprocessing cell, so that cell must have been run first.
print(f"Extra Data Length: {len(extra_data)}", end='\n\n')
print(model.evaluate(extra_data, extra_labels))

In [None]:
# Save the current model as test_model.h5 in the working directory.
model.save('test_model.h5')