In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

In [2]:
!wget --no-check-certificate \
    -O /tmp/sentiment.csv https://drive.google.com/uc?id=13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P

--2020-06-25 01:19:40--  https://drive.google.com/uc?id=13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P
Resolving drive.google.com (drive.google.com)... 172.217.214.101, 172.217.214.138, 172.217.214.139, ...
Connecting to drive.google.com (drive.google.com)|172.217.214.101|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-08-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/r5o3pb323gf6c41fqrkujt96l9ndmefm/1593047925000/11118900490791463723/*/13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P [following]
--2020-06-25 01:19:41--  https://doc-08-ak-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/r5o3pb323gf6c41fqrkujt96l9ndmefm/1593047925000/11118900490791463723/*/13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P
Resolving doc-08-ak-docs.googleusercontent.com (doc-08-ak-docs.googleusercontent.com)... 74.125.124.132, 2607:f8b0:4001:c14::84
Connecting to doc-08-ak-docs.googleusercontent.com (doc-08-ak-docs.googleusercontent.com)|74

In [3]:
dataset = pd.read_csv('/tmp/sentiment.csv')
dataset

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,0,So there is no way for me to plug it in here i...,0
1,1,Good case Excellent value.,1
2,2,Great for the jawbone.,1
3,3,Tied to charger for conversations lasting more...,0
4,4,The mic is great.,1
...,...,...,...
1987,1987,I think food should have flavor and texture an...,0
1988,1988,Appetite instantly gone.,0
1989,1989,Overall I was not impressed and would not go b...,0
1990,1990,The whole experience was underwhelming and I t...,0


In [4]:
sentences = dataset['text'].tolist()
labels = dataset['sentiment'].tolist()

# Seperate  the training and test sets
training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

# Make labels into numpy arrays for use with the network later
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [5]:
vocab_size = 1000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>" # out of vocab word (a new word that is not in your word "list")


print("The training sentences \n {} \n".format(training_sentences))
# the tokenizer is an obj that can tokenize every work in the input text
tokenizer = Tokenizer(num_words= vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# the words in our word list
word_index = tokenizer.word_index
print("Our word list or I should say map \n {} \n ".format(word_index))


#sequences maintain the order the words appeared in, for example
# saying Hi I'm Mohammed makes sense but Mohammed Hi I'm doesnt, so order is 
# important to the logical sense of the text, so we maintain it with the seq obj
sequences = tokenizer.texts_to_sequences(training_sentences)
print("The sequences with the order maintained \n {}".format(sequences))

The training sentences 

Our word list or I should say map 
 
The sequences with the order maintained 
 [[26, 68, 7, 63, 173, 13, 67, 8, 218, 5, 16, 82, 16, 2, 197, 363, 4, 75, 109, 6, 1], [20, 90, 76, 364], [19, 13, 2, 613], [1, 8, 137, 13, 614, 810, 94, 70, 811, 130, 1, 198], [2, 505, 7, 19], [4, 23, 8, 1, 2, 218, 8, 91, 5, 8, 422, 56, 131, 8, 91, 423, 199], [46, 24, 23, 238, 1, 85, 238, 1, 615, 125, 1, 2, 616, 11, 506, 365, 11, 116, 39, 109, 39], [46, 24, 28, 424, 1, 24, 322, 23, 9], [1, 8, 174, 4, 812, 12, 119], [95, 6, 141, 11, 119, 3, 37], [3, 2, 52, 36, 7, 19], [148, 10, 17, 219, 77, 157, 57, 2, 366, 49, 8, 2, 813, 49], [46, 2, 149, 64, 1, 109, 6, 1, 186, 1, 4, 425, 8, 814, 1, 617, 3, 1, 52, 57, 2, 47], [17, 20, 36, 367], [2, 220, 7, 17, 1, 32, 2, 74, 507, 7, 14, 17, 142, 40, 34], [221, 80, 13, 96, 39, 289, 58, 6, 508, 815, 15], [4, 816, 323, 105, 14, 31, 1], [26, 143, 26, 20], [48, 19], [5, 1, 200, 33, 16, 6, 173, 22, 222, 24, 1, 158, 175, 22, 1, 44, 201], [4, 159, 21, 1, 618, 

In [6]:
# now we need all sequences to be the same length, not that every word is a seq
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences
# the link above has info for the function we will use to make seqs same length

padded = pad_sequences(sequences,maxlen=max_length, padding=padding_type, 
                       truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length, 
                               padding=padding_type, truncating=trunc_type)

In [7]:
# lets check if everything is padded correctly


# def decode_review(text):
#     return ' '.join([reverse_word_index.get(i, '?') for i in text])
# print(decode_review(padded[1]))


print(training_sentences[1])
print(padded[1])

Good case Excellent value.
[ 20  90  76 364   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


In [8]:
# Build a sentiment network
# the output is 0 or 1 for if the review is positve or negative
# the first layer is the embedding layer

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           16000     
_________________________________________________________________
flatten (Flatten)            (None, 1600)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 9606      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 25,613
Trainable params: 25,613
Non-trainable params: 0
_________________________________________________________________


In [9]:
num_epochs = 10
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f988737b6d8>

In [10]:
'''
The code below will download two files for visualizing how your network "sees" the sentiment related to each word. Head to http://projector.tensorflow.org/ and load these files, then click the "Sphereize" checkbox.''
# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

import io

# Write out the embedding vectors and metadata
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()


# Download the files
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')
  '''

'\nThe code below will download two files for visualizing how your network "sees" the sentiment related to each word. Head to http://projector.tensorflow.org/ and load these files, then click the "Sphereize" checkbox.\'\'\n# First get the weights of the embedding layer\ne = model.layers[0]\nweights = e.get_weights()[0]\nprint(weights.shape) # shape: (vocab_size, embedding_dim)\n\nimport io\n\n# Write out the embedding vectors and metadata\nout_v = io.open(\'vecs.tsv\', \'w\', encoding=\'utf-8\')\nout_m = io.open(\'meta.tsv\', \'w\', encoding=\'utf-8\')\nfor word_num in range(1, vocab_size):\n  word = reverse_word_index[word_num]\n  embeddings = weights[word_num]\n  out_m.write(word + "\n")\n  out_v.write(\'\t\'.join([str(x) for x in embeddings]) + "\n")\nout_v.close()\nout_m.close()\n\n\n# Download the files\ntry:\n  from google.colab import files\nexcept ImportError:\n  pass\nelse:\n  files.download(\'vecs.tsv\')\n  files.download(\'meta.tsv\')\n  '

In [11]:
# now we predict!!

fake_reviews = ['I love this phone', 'I hate spaghetti', 
                'Everything was cold',
                'Everything was hot exactly as I wanted', 
                'Everything was green', 
                'the host seated us immediately',
                'they gave us free chocolate cake', 
                'not sure about the wilted flowers on the table',
                'only works when I stand on tippy toes', 
                'does not work when I stand on my head']

# Create the sequences
padding_type='post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)           


classes = model.predict(fakes_padded)


# The closer the class is to 1, the more positive the review is deemed to be
for x in range(len(fake_reviews)):
  print(fake_reviews[x])
  print(classes[x])
  print('\n')

I love this phone
[0.9598759]


I hate spaghetti
[0.08962569]


Everything was cold
[0.28572112]


Everything was hot exactly as I wanted
[0.4693075]


Everything was green
[0.39093253]


the host seated us immediately
[0.46338108]


they gave us free chocolate cake
[0.8241615]


not sure about the wilted flowers on the table
[0.02574179]


only works when I stand on tippy toes
[0.8365216]


does not work when I stand on my head
[0.01609531]




[followed this colab](https://colab.research.google.com/github/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l09c04_nlp_embeddings_and_sentiment.ipynb#scrollTo=g-Q6ALywmWVz)