## Import

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Functions

In [None]:
def plot_graphs(history, string):
  """Plot a training metric per epoch, plus its validation counterpart if present.

  Args:
    history: object whose `history` attribute is a dict mapping metric names
      to per-epoch values (e.g. the object returned by `model.fit`).
    string: name of the metric to plot, e.g. "accuracy" or "loss".
  """
  metrics = history.history
  val_key = 'val_' + string
  legend = [string]
  plt.plot(metrics[string])
  # The 'val_*' entry only exists when training was run with validation data;
  # skip that curve instead of failing with a KeyError.
  if val_key in metrics:
    plt.plot(metrics[val_key])
    legend.append(val_key)
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend(legend)
  plt.show()

## Split the Dataset

Amazon and Yelp reviews, with their related sentiment (1 for positive, 0 for negative).\
https://www.kaggle.com/marklvl/sentiment-labelled-sentences-data-set

The next cell downloads the CSV to `/tmp/sentiment.csv`; the loading cell after it reads a local copy named `reviews_sentiment.csv`.

In [None]:
!wget --no-check-certificate -O /tmp/sentiment.csv https://drive.google.com/uc?id=13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P

In [None]:
# Candidate dataset locations, tried in order: the local copy first, then the
# file saved by the wget cell (/tmp/sentiment.csv). The fallback keeps the
# notebook runnable on machines where the local path does not exist.
csv_candidates = [
    'C:/Users/bruce/Desktop/GitHub/TensorFlow/download/reviews_sentiment.csv',
    '/tmp/sentiment.csv',
]

# extract
dataset = None
for csv in csv_candidates:
    try:
        dataset = pd.read_csv(csv)
    except FileNotFoundError:
        continue
    break  # `csv` now holds the path that was actually loaded
if dataset is None:
    raise FileNotFoundError(
        "reviews CSV not found in any of: " + ", ".join(csv_candidates)
    )

# One review text and one 0/1 sentiment label per row
sentences = dataset['text'].tolist()
labels = dataset['sentiment'].tolist()

# Positional 80/20 split: the first 80% of the rows train the model and the
# remaining rows are held out for testing.
# NOTE(review): no shuffling happens before the split - confirm the CSV rows
# are not ordered by source or label.
num_sentences = len(sentences)
training_size = int(num_sentences * 0.8)
test_size = num_sentences - training_size
print("number of sentences:", num_sentences)
print("number of trains:", training_size)
print("number of tests:", test_size)

# Sentences stay plain Python lists (the tokenizer consumes them as text)
training_sentences, testing_sentences = (
    sentences[:training_size],
    sentences[training_size:],
)

# Labels are sliced the same way and converted straight to numpy arrays,
# which is the form the network is fed later
training_labels = np.array(labels[:training_size])
testing_labels = np.array(labels[training_size:])

## Tokenize the data

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the model
vocab_size = 1000        # `num_words` cap passed to the Tokenizer
embedding_dim = 16       # width of each embedding vector in the model
max_length = 100         # every sequence is padded/truncated to this length
trunc_type = 'post'      # cut over-long sequences at the end
padding_type = 'post'    # pad short sequences at the end
oov_tok = "<OOV>"        # token used for out-of-vocabulary words

# Build the vocabulary from the training sentences only
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
print(len(word_index))
print(word_index)

# Both splits go through the same text -> ids -> fixed-length pipeline
_pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# training set
training_sequences = pad_sequences(
    tokenizer.texts_to_sequences(training_sentences), **_pad_kwargs
)

# testing set
testing_sequences = pad_sequences(
    tokenizer.texts_to_sequences(testing_sentences), **_pad_kwargs
)

## Review the data

In [None]:
# Invert the tokenizer vocabulary so token ids can be mapped back to words
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Map a sequence of token ids back to a space-separated string.

    Ids that have no entry in `reverse_word_index` are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

# Compare a decoded (tokenized + padded) review with its original text
print(decode_review(training_sequences[1]))
print(training_sentences[1])

## Create a Model with Embeddings

In [None]:
# Basic sentiment network, assembled layer by layer:
#   token ids -> embedding vectors -> flattened -> small dense layer -> 1 output.
# The embedding layer comes first, and a single sigmoid node is enough because
# the label is either 0 or 1 (negative or positive).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

## Train the Model

In [None]:
# Train on the training split; the test split is used as validation data so
# the history also records the 'val_*' metrics.
epochs = 10
history = model.fit(
    x=training_sequences,
    y=training_labels,
    epochs=epochs,
    validation_data=(testing_sequences, testing_labels),
)

## Visualize the training graph

In [None]:
# One figure per tracked metric: accuracy first, then loss
for metric_name in ("accuracy", "loss"):
  plot_graphs(history, metric_name)

## Visualize the network

Head to http://projector.tensorflow.org/ and load these files, then click the "Sphereize" checkbox.\
vectors (vecs.tsv)\
metadata (meta.tsv)

In [None]:
# The embedding is the first layer of the model; take its first weight array
e = model.layers[0]
layer_weights = e.get_weights()
weights = layer_weights[0]
print(weights.shape)  # shape: (vocab_size, embedding_dim)

In [None]:
# Write out the embedding vectors and metadata, one row per word id, so that
# line k of meta.tsv labels line k of vecs.tsv.
# Ids start at 1, and the upper bound is clamped to the number of known words
# so a vocabulary smaller than vocab_size cannot raise a KeyError.
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

# `with` guarantees both files are closed even if the loop raises
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, last_word_num):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join(str(x) for x in embeddings) + "\n")

In [None]:
# Offer the two TSV files as browser downloads when running inside Colab;
# anywhere else the import fails and the files simply stay on disk.
try:
  from google.colab import files
except ImportError:
  print("ImportError")
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)
  print("Downloaded")

## Prediction

In [None]:
# Use the model to predict the sentiment of unseen reviews
new_reviews = ['I love this phone', 'I hate spaghetti', 
                'Everything was cold',
                'Everything was hot exactly as I wanted', 
                'Everything was green', 
                'the host seated us immediately',
                'they gave us free chocolate cake', 
                'not sure about the wilted flowers on the table',
                'only works when I stand on tippy toes', 
                'does not work when I stand on my head']

# Create the sequences with exactly the same preprocessing as the training
# data: same tokenizer, same length, same padding side and same truncation
# side (truncation was previously left at the library default here).
sample_sequences = tokenizer.texts_to_sequences(new_reviews)
sample_sequences = pad_sequences(sample_sequences, maxlen=max_length,
                                 padding=padding_type, truncating=trunc_type)

# Predict: one sigmoid output per review
classes = model.predict(sample_sequences)

# The closer the class is to 1, the more positive the review is deemed to be
for review, score in zip(new_reviews, classes):
  print(review, score)