In [1]:
import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

In [2]:
# _URL = ""

# data = tf.keras.utils.get_file(_URL)

In [3]:
# Read the combined review dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    'data/combined_data.csv',
    index_col=0,
)
# Peek at the first rows as a quick sanity check.
data.head()

Unnamed: 0,text,sentiment
0,So there is no way for me to plug it in here i...,0
1,Good case Excellent value.,1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [4]:
# Review texts as a plain Python list, in row order.
reviews = data['text'].to_list()

In [5]:
# Sentiment labels as a plain Python list, aligned with `reviews`.
review_labels = data['sentiment'].to_list()


In [6]:
## Split into train and test sets

In [7]:
# The first 80% of the reviews (rounded up) are used for training; the rest is held out.
split_index = int(np.ceil(0.8 * len(reviews)))

train_reviews, test_reviews = reviews[:split_index], reviews[split_index:]

# Show the size of each partition.
len(train_reviews), len(test_reviews)

(1594, 398)

In [8]:
# Cut the labels at the same index as the reviews so the two stay aligned.
train_labels, test_labels = review_labels[:split_index], review_labels[split_index:]

# Show the size of each partition.
len(train_labels), len(test_labels)

(1594, 398)

In [9]:
# Convert both label lists to numpy arrays before they are handed to model.fit.
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

## Tokenize the reviews

In [10]:
# Hyper-parameters shared by the tokenizer, the padding step and the embedding layer.
max_length = 100     # every review is padded or truncated to this many tokens
embedding_dim = 16   # length of each learned word vector
vocab_size = 500     # word-index limit given to the tokenizer and the embedding layer

In [11]:
# Tokenizer limited to `vocab_size` word indices; '<OOV>' is the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')

In [12]:
# Build the vocabulary from the training reviews only (the test reviews are not used here).
tokenizer.fit_on_texts(train_reviews)

In [13]:
# Word -> integer index mapping learned by the tokenizer.
# Its size (2834 in the recorded output) exceeds vocab_size: `num_words` does not shrink this dict.
word_index = tokenizer.word_index
print('word_index: ', len(word_index))

word_index:  2834


## Generate Sequence of Reviews

In [14]:
# Encode each training review as a list of integer word indices.
train_sequences = tokenizer.texts_to_sequences(train_reviews)

In [15]:
# train_sequences[0]

In [16]:
# Bring every training sequence to exactly `max_length` tokens:
# shorter ones get zeros appended, longer ones lose their tail.
train_padded_sequences = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
# Inspect the first padded sequence.
train_padded_sequences[0]

array([ 26,  68,   7,  63, 173,  13,  67,   8, 219,   5,  16,  82,  16,
         2, 198, 364,   4,  75, 109,   6,   1,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [17]:
# Width of the padded matrix, i.e. the length of every padded sequence.
train_padded_sequences.shape[1]

100

In [18]:
# Raw text of the first training review, for comparison with its encoded form.
train_reviews[0]

'So there is no way for me to plug it in here in the US unless I go by a converter.'

In [19]:
# Unpadded token indices of the first training review.
train_sequences[0]

[26,
 68,
 7,
 63,
 173,
 13,
 67,
 8,
 219,
 5,
 16,
 82,
 16,
 2,
 198,
 364,
 4,
 75,
 109,
 6,
 1]

In [20]:
# Encode and pad the held-out reviews with the tokenizer fitted on the training set,
# using the same length and padding/truncation settings as the training data.
test_sequences = tokenizer.texts_to_sequences(test_reviews)
test_padded_sequences = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

## Create Model

In [21]:
# Embedding: maps each of the `vocab_size` word indices to an `embedding_dim`-long vector.
l0 = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_length,
)
# Flatten the (max_length, embedding_dim) output into one feature vector per review.
l1 = tf.keras.layers.Flatten()
# Small hidden layer.
l2 = tf.keras.layers.Dense(6, activation='relu')
# Two-way softmax: one probability per sentiment class (0 and 1).
l3 = tf.keras.layers.Dense(2, activation='softmax')

In [22]:
# Stack the layers in order: embedding -> flatten -> hidden dense -> softmax output.
model = tf.keras.Sequential([l0, l1, l2, l3])

# Integer labels with a two-unit softmax output, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [23]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           8000      
_________________________________________________________________
flatten (Flatten)            (None, 1600)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 9606      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 14        
Total params: 17,620
Trainable params: 17,620
Non-trainable params: 0
_________________________________________________________________


## Train Model

In [24]:
# Number of full passes over the training data.
EPOCHS = 12

# Train on the padded training reviews; the held-out split is evaluated after each epoch.
model.fit(
    x=train_padded_sequences,
    y=train_labels,
    epochs=EPOCHS,
    validation_data=(test_padded_sequences, test_labels),
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<tensorflow.python.keras.callbacks.History at 0x7fd3a2980520>

## Visualizing the Network

In [25]:
# Invert the tokenizer vocabulary so an integer index can be mapped back to its word.
reverse_word_index = {index: word for word, index in word_index.items()}


In [26]:
# Take the first weight array of the embedding layer: one row per word index.
# NOTE(review): the name is misspelled ("embeddig"); it is kept because later cells use it.
embeddig_weights = l0.get_weights()[0]
# Shape of the matrix; the recorded output is (500, 16).
embeddig_weights.shape

(500, 16)

In [27]:
import io

# Output files for the embedding export; both are written and closed by a later cell.
out_v = io.open('vec.tsv', mode='w', encoding='utf-8')   # one tab-separated weight vector per line
out_m = io.open('meta.tsv', mode='w', encoding='utf-8')  # the word for each vector, one per line


In [28]:
# Preview: one embedding row rendered as a single tab-delimited line.
'\t'.join(str(component) for component in embeddig_weights[1])

'-0.013115052\t-0.03802334\t-0.029480202\t-0.014087418\t0.0044238367\t0.07983513\t-0.04049583\t0.036401916\t0.010341115\t0.0065676398\t0.089937516\t0.03852202\t0.0013121986\t-0.04855585\t0.03795019\t-0.061724033'

In [29]:
# Write one line per word to meta.tsv and the matching embedding vector to vec.tsv.
# Index 0 is skipped: it is the padding value and has no entry in reverse_word_index.
# The upper bound is capped by the vocabulary size so a corpus with fewer than
# vocab_size words does not raise KeyError.
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

try:
    for word_num in range(1, last_word_num):
        word = reverse_word_index[word_num]
        embedding = embeddig_weights[word_num]

        out_m.write(word + "\n")
        # Write this word's own vector (previously row 1 was written for every word).
        out_v.write('\t'.join(str(x) for x in embedding) + "\n")
finally:
    # Always release the file handles, even if the loop fails part-way.
    out_v.close()
    out_m.close()

## Predicting Sentiments

In [30]:
# Hand-written sample reviews used to try out the trained model.
fake_reviews = [
    'I love chocolate',
    'This restaurant sucks',
    'OMG, what a shake!',
    'Totally recommend this place for grilled sandwitches',
    'I would lose my license instead of standing in long lines',
    'please never come to this fish market, it smells too bad',
]

fake_reviews

['I love chocolate',
 'This restaurant sucks',
 'OMG, what a shake!',
 'Totally recommend this place for grilled sandwitches',
 'I would lose my license instead of standing in long lines',
 'please never come to this fish market, it smells too bad']

In [31]:
# Encode and pad the sample reviews exactly like the training data.
fake_seq = tokenizer.texts_to_sequences(fake_reviews)
fake_pad_seq = pad_sequences(
    fake_seq,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [32]:
# One row of softmax outputs per sample review: [score for class 0, score for class 1].
classes = model.predict(fake_pad_seq)
            

In [33]:
# Show each sample review followed by its predicted class scores.
for review, scores in zip(fake_reviews, classes):
    print(review)
    print(scores)
    print('\n')

I love chocolate
[0.00175381 0.9982462 ]


This restaurant sucks
[0.7365322  0.26346776]


OMG, what a shake!
[0.23255609 0.76744384]


Totally recommend this place for grilled sandwitches
[0.02220836 0.9777916 ]


I would lose my license instead of standing in long lines
[0.16695921 0.8330408 ]


please never come to this fish market, it smells too bad
[0.9921681  0.00783186]


