## Tips to tweak

- Data and preprocessing-based approaches
  - More data
  - Adjusting vocabulary size (make sure to consider the overall size of the corpus!)
  - Adjusting sequence length (more or less padding or truncation)
  - Whether to pad or truncate `pre` or `post` (usually less of an effect than the others)
- Model-based approaches
  - Adjust the number of embedding dimensions
  - Changing use of `Flatten` vs. `GlobalAveragePooling1D` (often better)
  - Considering other layers like Dropout
  - Adjusting the number of nodes in intermediate fully-connected layers

## Import

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Functions

In [None]:
def plot_graphs(title, history, string):
  """Plot one training metric next to its validation counterpart.

  Args:
    title: Text shown as the figure title.
    history: Object whose `.history` mapping holds per-epoch values under
      the keys `string` and `'val_' + string`.
    string: Name of the metric to draw (also used as the y-axis label).
  """
  val_key = 'val_' + string

  plt.title(title)
  # Training curve first, validation curve second, so the legend order matches
  for key in (string, val_key):
    plt.plot(history.history[key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()

def predict_review(model, new_sentences, tokenizer, maxlen=100, show_padded_sequence=False, trunc_type='post', padding_type='post'):
  """Encode, pad and score new sentences, printing one line per sentence.

  Args:
    model: Object exposing `predict(padded_sequences)`.
    new_sentences: Sequence of raw review strings; left unmodified.
    tokenizer: Object exposing `encode(text)`.
    maxlen: Length every encoded sentence is padded/truncated to.
    show_padded_sequence: When true, also print each padded sequence.
    trunc_type: Passed to `pad_sequences` as `truncating`.
    padding_type: Passed to `pad_sequences` as `padding`.

  Returns:
    None. Results are printed only.
  """
  # Convert the new reviews to sequences
  encoded_reviews = [tokenizer.encode(review) for review in new_sentences]

  # Pad all sequences for the new reviews
  padded_reviews = pad_sequences(encoded_reviews, maxlen=maxlen, padding=padding_type, truncating=trunc_type)

  scores = model.predict(padded_reviews)

  # The closer the class is to 1, the more positive the review is
  for position, review in enumerate(new_sentences):
    # We can see the padded sequence if desired
    if show_padded_sequence:
      print(padded_reviews[position])

    print(review, scores[position])

def train_model(model, training_sequences, testing_sequences, training_labels, testing_labels, epochs=30):
  """Compile `model`, print its summary, fit it and return the fit result.

  Args:
    model: Object exposing `compile`, `summary` and `fit`.
    training_sequences: Inputs passed to `fit` as the training data.
    testing_sequences: Inputs used as the first element of `validation_data`.
    training_labels: Targets passed to `fit` for the training data.
    testing_labels: Targets used as the second element of `validation_data`.
    epochs: Number of epochs passed to `fit`.

  Returns:
    Whatever `model.fit` returns.
  """
  compile_options = dict(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.compile(**compile_options)
  model.summary()

  # The test split doubles as the validation set during fitting
  held_out = (testing_sequences, testing_labels)
  return model.fit(training_sequences, training_labels, epochs=epochs, validation_data=held_out)

def plot_results(title, history):
  """Draw the accuracy plot and then the loss plot for one training run.

  Args:
    title: Title reused for both figures.
    history: Passed through to `plot_graphs` unchanged.
  """
  for metric in ("accuracy", "loss"):
    plot_graphs(title, history, metric)

def export_files(string, model, vocab_size, tokenizer):
  """Write embedding vectors and their labels as TSV files.

  Two files are created in the current working directory:
  `<string>_vecs.tsv` (one tab-separated embedding vector per line) and
  `<string>_meta.tsv` (the decoded token for the same line). When running
  inside Google Colab the two files are also offered for download.

  Args:
    string: Prefix used for both output file names.
    model: Model whose first layer (`model.layers[0]`) holds the embedding
      weights as the first entry of `get_weights()`.
    vocab_size: Vocabulary size; ids `0 .. vocab_size - 2` are exported.
    tokenizer: Object exposing `decode(list_of_ids)` returning a string.
  """
  # First get the weights of the embedding layer
  embedding_layer = model.layers[0]
  weights = embedding_layer.get_weights()[0]
  print(weights.shape) # shape: (vocab_size, embedding_dim)

  vecs_path = string + '_vecs.tsv'
  meta_path = string + '_meta.tsv'

  # Write out the embedding vectors and metadata; `with` guarantees both
  # files are closed even if decoding or writing fails part-way through.
  with io.open(vecs_path, 'w', encoding='utf-8') as out_v, \
       io.open(meta_path, 'w', encoding='utf-8') as out_m:
    # NOTE(review): the last embedding row (id vocab_size - 1) is not
    # exported; kept as in the original — confirm this is intended.
    for word_num in range(0, vocab_size - 1):
      word = tokenizer.decode([word_num])
      embeddings = weights[word_num]
      out_m.write(word + "\n")
      out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

  # Download the files (only possible when running inside Google Colab)
  try:
    from google.colab import files
  except ImportError:
    print("ImportError")
  else:
    # Download the files that were actually written above, not the bare
    # 'vecs.tsv' / 'meta.tsv' names, which do not exist.
    files.download(vecs_path)
    files.download(meta_path)
    print("Downloaded")

## Tokenize and Encode subwords

Amazon and Yelp reviews, with their related sentiment (1 for positive, 0 for negative).\
https://www.kaggle.com/marklvl/sentiment-labelled-sentences-data-set

downloaded: reviews_sentiment.csv

In [None]:
# Fetch the reviews CSV from Google Drive and save it as /tmp/sentiment.csv.
# NOTE(review): --no-check-certificate turns off TLS certificate verification.
!wget --no-check-certificate -O /tmp/sentiment.csv https://drive.google.com/uc?id=13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P

In [None]:
import os

# Location of the reviews CSV. Prefer the original local copy; when it does
# not exist (another machine, Colab, ...) fall back to the file that the
# wget cell saves to /tmp/sentiment.csv.
# NOTE(review): assumes both files have the same `text` / `sentiment`
# columns — confirm.
csv = 'C:/Users/bruce/Desktop/GitHub/TensorFlow/download/reviews_sentiment.csv'
if not os.path.exists(csv):
  csv = '/tmp/sentiment.csv'

# extract
dataset = pd.read_csv(csv)
sentences = dataset['text'].tolist()
labels = dataset['sentiment'].tolist()

# create a subwords dataset with tokenizer
# NOTE(review): the size passed to build_from_corpus is a target, so
# tokenizer.vocab_size may differ from vocab_size, which the Embedding layers
# later use as their input dimension — confirm encoded ids stay in range.
vocab_size = 1000
max_subword_length = 5
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(sentences, vocab_size, max_subword_length=max_subword_length)

# Check that the tokenizer works appropriately
num = 5
print(sentences[num])
encoded = tokenizer.encode(sentences[num])
print(encoded)
# Separately print out each subword, decoded
for subword_id in encoded:
  print(tokenizer.decode([subword_id]))

# Replace sentence data with encoded subwords
# This is equivalent to `texts_to_sequences` with the `Tokenizer`
sentences = [tokenizer.encode(sentence) for sentence in sentences]
# Check the sentences are appropriately replaced
print(sentences[1])

## Pad and split the data

In [None]:
# Hyperparameters used by the padding step and by the models
embedding_dim = 16
max_length = 50 # 100
trunc_type = 'post'
padding_type = 'post'

# Decide how many sentences go to training and how many to testing
split_ratio = 0.8
num_sentences = len(sentences)
training_size = int(num_sentences * split_ratio)
test_size = num_sentences - training_size
for caption, count in (
    ("number of sentences:", num_sentences),
    ("number of trains:", training_size),
    ("number of tests:", test_size),
):
  print(caption, count)

# Pad (or truncate) every encoded sentence to max_length
sentences = pad_sequences(sentences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# The first training_size rows are for training, the remainder for testing
training_sequences, testing_sequences = sentences[:training_size], sentences[training_size:]

# Labels are converted to numpy arrays for use with the network later
training_labels = np.array(labels[:training_size])
testing_labels = np.array(labels[training_size:])

## Create Models

In [None]:
def _embedding():
  # Input layer shared by all three models
  return tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)

def _bidi_lstm(**lstm_options):
  # Bidirectional wrapper around an LSTM with embedding_dim units
  return tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, **lstm_options))

def _classifier_head():
  # Small dense layer followed by a single sigmoid output unit
  return [
      tf.keras.layers.Dense(6, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid'),
  ]

# Simple Embeddings
model_simple = tf.keras.Sequential([
    _embedding(),
    tf.keras.layers.GlobalAveragePooling1D(),
    *_classifier_head(),
])

# Bidirectional LSTM
model_bidi_lstm = tf.keras.Sequential([
    _embedding(),
    _bidi_lstm(),
    *_classifier_head(),
])

# Multiple Bidirectional LSTM
# The first LSTM returns the full sequence so the second one can consume it
model_multiple_bidi_lstm = tf.keras.Sequential([
    _embedding(),
    _bidi_lstm(return_sequences=True),
    _bidi_lstm(),
    *_classifier_head(),
])


## Predict data

In [None]:
# Hand-written sample reviews; each trained model scores them via
# predict_review in the training cell.
# Some sentences appear twice or differ only in capitalization or one word
# (e.g. "hot" vs. "not").
new_reviews = ["I love this phone", 
                "Everything was cold",
                "Everything was hot exactly as I wanted", 
                "Everything was green", 
                "the host seated us immediately",
                "they gave us free chocolate cake", 
                "we couldn't hear each other talk because of the shouting in the kitchen",
                "lovely",
                "dreadful",
                "stay away",
                "everything was hot exactly as I wanted",
                "everything was not exactly as I wanted",
                "they gave us free chocolate cake",
                "I've never eaten anything so spicy in my life, my throat burned for hours",
                "for a phone that is as expensive as this one I expect it to be much easier to use than this thing is",
                "we left there very full for a low price so I'd say you just can't go wrong at this place",
                "that place does not have quality meals and it isn't a good place to go for dinner",
              ]

## Train the models and Predict

In [None]:
# Train each model in turn, plot its accuracy/loss curves, then score the
# sample reviews with it. `history` ends up holding the last model's result.
for plot_title, candidate in (
    ("Simple Embeddings", model_simple),
    ("Bi-LSTM", model_bidi_lstm),
    ("multi-Bi-LSTM", model_multiple_bidi_lstm),
):
  history = train_model(candidate, training_sequences, testing_sequences, training_labels, testing_labels)
  plot_results(plot_title, history)
  predict_review(candidate, new_reviews, tokenizer, maxlen=max_length)

## Visualize the network

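Head to http://projector.tensorflow.org/ and load the two files exported for one model, then click the "Sphereize" checkbox.\
vectors (`<prefix>_vecs.tsv`, e.g. `simple_embed_vecs.tsv`)\
metadata (`<prefix>_meta.tsv`, e.g. `simple_embed_meta.tsv`)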

In [None]:
# Export the embedding of every model; each call writes `<prefix>_vecs.tsv`
# and `<prefix>_meta.tsv`.
for file_prefix, trained_model in (
    ("simple_embed", model_simple),
    ("bidi_lstm", model_bidi_lstm),
    ("multi_bidi_lstm", model_multiple_bidi_lstm),
):
  export_files(file_prefix, trained_model, vocab_size=vocab_size, tokenizer=tokenizer)