<a href="https://colab.research.google.com/github/Zilleplus/MachineLearning/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
import sys

In [None]:
# Fetch the Keras copy of the IMDB dataset (reviews already encoded as word ids)
# together with its word -> id mapping.
imdb = keras.datasets.imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data()
word_index = imdb.get_word_index()

In [None]:
# Inspect the container types returned by the loader.
for loaded in (X_train, word_index):
    print(type(loaded))

<class 'numpy.ndarray'>
<class 'dict'>


In [None]:
list(word_index)[:10]  # first ten keys; word_index maps each word to its integer id

['fawn',
 'tsukino',
 'nunnery',
 'sonja',
 'vani',
 'woods',
 'spiders',
 'hanging',
 'woody',
 'trawling']

In [None]:
# Shift every id up by 3 so that ids 0-2 stay free for the special tokens
# (0 = padding, 1 = start-of-sequence, 2 = unknown word), a convention that is
# only vaguely mentioned in the docs.
id_to_word = dict((id_ + 3, word) for word, id_ in word_index.items())
print(type(id_to_word))

<class 'dict'>


In [None]:
# Register the special tokens under ids 0, 1 and 2.
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

In [None]:
# Decode the first ten token ids of the first training review back into text.
" ".join(id_to_word[token_id] for token_id in X_train[0][:10])

'<sos> this film was just brilliant casting location scenery story'

The Keras version of the dataset is already fully preprocessed, so let's load the raw reviews instead and do the preprocessing by hand.

In [None]:
import tensorflow_datasets as tfds
# Load the IMDB reviews from TensorFlow Datasets as (text, label) pairs,
# together with the dataset metadata.
datasets, info = tfds.load(name="imdb_reviews", with_info=True, as_supervised=True)

In [None]:
def preprocessing(X_batch, y_batch):
  """Turn a batch of raw review strings into a padded tensor of word tokens.

  Args:
    X_batch: string tensor holding one review per element.
    y_batch: labels of the batch; returned unchanged.

  Returns:
    A tuple (tokens, y_batch) where tokens is a dense string tensor in which
    reviews with fewer words than the longest one are padded with b"<pad>".
  """
  X_batch = tf.strings.substr(X_batch, 0, 300) # only keep the start of each review (first 300 units)
  # Replace HTML line breaks (<br>, <br/>, <br />) by a space. The pattern used to
  # read "<bv", which never matched, so "br" ended up in the vocabulary as a word.
  X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
  X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ") # replace everything except letters and apostrophes by a space
  X_batch = tf.strings.split(X_batch) # split each review into words
  return X_batch.to_tensor(default_value=b"<pad>"), y_batch

Count how often each word occurs and keep only the most frequently used words; rarely used words carry very little signal for this task.

In [None]:
# Count how often every token occurs in the preprocessed training reviews.
from collections import Counter
vocabulary = Counter()
for token_batch, label_batch in datasets["train"].batch(16).map(preprocessing):
  for review_tokens in token_batch.numpy():
    vocabulary.update(review_tokens)
print("The 3 most common words are:"+str(vocabulary.most_common(3)))

# Keep only the vocab_size most frequent tokens, as rarely used words are not that useful.
vocab_size = 10000
truncated_vocabulary = [token for token, count in vocabulary.most_common(vocab_size)]

The 3 most common words are:[(b'<pad>', 176988), (b'the', 61137), (b'a', 38564)]


In [None]:
 # Build the vocabulary lookup table: each word in truncated_vocabulary maps to
 # its position in that list, any other word falls into one of the
 # out-of-vocabulary (oov) buckets.
 num_oov_buckets = 10000
 words = tf.constant(truncated_vocabulary)
 word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
 vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
 table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets=num_oov_buckets)

In [None]:
# The word "faaaaantastic" is not in the table, so it is mapped to one of the
# oov buckets and gets an id greater than or equal to 10 000.
sample_review = tf.constant([b"This movie was faaaaantastic".split()])
print(table.lookup(sample_review))

tf.Tensor([[   23    13    12 13791]], shape=(1, 4), dtype=int64)


In [None]:
def encode_words(X_batch, y_batch):
  """Replace every word token in X_batch by its id from `table`; y_batch passes through unchanged."""
  return table.lookup(X_batch), y_batch

# Training pipeline: batch the reviews, tokenize them, then encode the tokens as ids.
train_set = (
  datasets["train"]
  .batch(32)
  .map(preprocessing)
  .prefetch(1)
  .map(encode_words)
  .prefetch(1)
)

In [None]:
embed_size = 128

# Baseline classifier: an embedding, two stacked GRU layers and a single sigmoid output unit.
model = keras.models.Sequential()
# The embedding needs one row per vocabulary word plus one per oov bucket.
model.add(keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None]))
model.add(keras.layers.GRU(units=128, return_sequences=True))
model.add(keras.layers.GRU(units=128))
model.add(keras.layers.Dense(units=1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Test pipeline: same tokenization and id encoding as the training set.
test_data = (
    datasets["test"]
    .batch(16)
    .map(preprocessing)
    .map(encode_words)
    .prefetch(1)
)
# Sanity check: predict on the first test batch only and show the probabilities.
for x, y in test_data.take(1):
    tf.print(x.shape)
    y_pred = model.predict(x)
    tf.print(y_pred)

TensorShape([16, 60])
array([[0.99798954],
       [0.93247104],
       [0.00501662],
       [0.01496071],
       [0.84784806],
       [0.8968861 ],
       [0.99513656],
       [0.99939847],
       [0.06936654],
       [0.00429186],
       [0.9842895 ],
       [0.01469716],
       [0.74352026],
       [0.9943054 ],
       [0.57256967],
       [0.02385691]], dtype=float32)


In [None]:
# Evaluate on the whole test pipeline; the model was compiled with metrics=["accuracy"].
results = model.evaluate(test_data)



In [None]:
# Show the values returned by model.evaluate (labelled here as loss and accuracy).
print("test loss, test acc:", results)

test loss, test acc: [0.9961671233177185, 0.7068799734115601]


In [None]:
# Same layer stack as the previous model, built with the functional API so that
# an explicit padding mask can be handed to the recurrent layers.
K = keras.backend
inputs = keras.layers.Input(shape=[None])

# True wherever the token id differs from 0 (the id "<pad>" gets as the most common token).
mask = keras.layers.Lambda(lambda token_ids: K.not_equal(token_ids, 0))(inputs)

z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(units=128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(units=128)(z, mask=mask)
outputs = keras.layers.Dense(units=1, activation="sigmoid")(z)

model = keras.Model(inputs=[inputs], outputs=[outputs])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate the masked model on the same test pipeline as the first model.
results = model.evaluate(test_data)



In [None]:
# Show the values returned by model.evaluate (labelled here as loss and accuracy).
print("test loss, test acc:", results)

test loss, test acc: [1.1266433000564575, 0.7002400159835815]


In [None]:
import tensorflow_hub as hub

In [None]:
# Classifier on top of a TF Hub text module (nnlm-en-dim50); the first layer
# takes string tensors directly.
model = keras.Sequential([
  hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim50/2", input_shape=[], dtype=tf.string),
  keras.layers.Dense(units=128, activation="relu"),
  keras.layers.Dense(units=1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# This model consumes the review text directly, so the training set is only batched,
# without the tokenization / id-encoding steps.
train_set = datasets["train"].batch(32)
train_set = train_set.prefetch(1)
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate on the test reviews, batched the same way as the training set.
test_set = datasets["test"].batch(32).prefetch(1)
results = model.evaluate(test_set)
print("test loss, test acc:", results)

test loss, test acc: [0.5111663341522217, 0.7476800084114075]
