In [2]:
import numpy as np
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding

In [3]:
reviews = ['nice food', 
            'amazing restaurant',
            'too good',
            'just loved it',
            'will go again!',
            'horrible food',
            'never go there',
            'poor service',
            'poor quality',
            'needs improvement']
sentiment = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

In [4]:
one_hot('amazing restaurant', 500)

[386, 289]

In [5]:
vocab_size = 100
encoded_reviews = [one_hot(d, vocab_size) for d in reviews]
encoded_reviews

[[44, 69],
 [38, 50],
 [79, 3],
 [16, 64, 67],
 [19, 66, 98],
 [92, 69],
 [66, 66, 89],
 [75, 29],
 [75, 50],
 [22, 43]]

In [6]:
max_length = 3
padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')
padded_reviews

array([[44, 69,  0],
       [38, 50,  0],
       [79,  3,  0],
       [16, 64, 67],
       [19, 66, 98],
       [92, 69,  0],
       [66, 66, 89],
       [75, 29,  0],
       [75, 50,  0],
       [22, 43,  0]])

In [7]:
embeded_vector_size = 4

model = Sequential()
model.add(Embedding(vocab_size, embeded_vector_size, input_length=max_length, name='Embedding'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [8]:
x = padded_reviews
y = sentiment

In [9]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Embedding (Embedding)       (None, 3, 4)              400       
                                                                 
 flatten (Flatten)           (None, 12)                0         
                                                                 
 dense (Dense)               (None, 1)                 13        
                                                                 
Total params: 413
Trainable params: 413
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.fit(x, y, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x267c872e3e0>

In [11]:
loss, accuracy = model.evaluate(x, y)
accuracy



1.0

In [12]:
weights = model.get_layer('Embedding').get_weights()[0]

In [13]:
weights.shape

(100, 4)

In [14]:
weights[71]

array([ 0.01801881,  0.01914921, -0.04356462,  0.04951255], dtype=float32)

In [15]:
weights[73]

array([-0.00312037, -0.03276457,  0.00863148, -0.00931708], dtype=float32)

### IMDB Dataset

In [16]:
import pandas as pd

In [17]:
data = pd.read_csv('Train.csv')

In [18]:
data.shape

(40000, 2)

In [19]:
data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [20]:
X = data['text']

In [21]:
y = data['label']

In [22]:
X

0        I grew up (b. 1965) watching and loving the Th...
1        When I put this movie in my DVD player, and sa...
2        Why do people who do not know what a particula...
3        Even though I have great interest in Biblical ...
4        Im a die hard Dads Army fan and nothing will e...
                               ...                        
39995    "Western Union" is something of a forgotten cl...
39996    This movie is an incredible piece of work. It ...
39997    My wife and I watched this movie because we pl...
39998    When I first watched Flatliners, I was amazed....
39999    Why would this film be so good, but only gross...
Name: text, Length: 40000, dtype: object

In [23]:
all_desc = set()

for i in X:
    tokens = i.split()
    all_desc.update(i.split())
all_desc

{"'Clockstoppers',",
 'Transformation.',
 'QURAN',
 'unfeeling',
 'Peace,',
 '/>Picturesque',
 'Cat.',
 'burn....and',
 'reveling',
 'Double-crosses',
 "'Speed",
 'Merritt',
 'Absence',
 'Chevy.<br',
 'Shahid',
 '/>***Attention',
 'undermined<br',
 'bondage-style',
 'Heart").',
 '/>Redford',
 'itself)',
 'Wanting',
 '/>October',
 'Villaronga',
 'picker.',
 'juvenility',
 'injuries',
 'Is',
 'decades.',
 'that...and',
 '47)',
 'ENTHUSIASM',
 'implicit,',
 'Joely',
 'green.<br',
 'naturalism',
 'Waterfall",',
 'off-limits.',
 '"X"',
 'group:',
 'Jared.',
 'Sawyer,',
 'Papa',
 "Apple's",
 'churn',
 'normally)',
 'did-',
 "Wincer's",
 'forgiveness--WTF!!',
 'Ola',
 'failed.<br',
 'set-piece',
 'pounce,',
 'Chance.<br',
 'paychecks.',
 'Manos:',
 'Main)',
 'Cop',
 '/>ThatWasJunk.Blogspot.com',
 '(Makes',
 'recordings",',
 'pens;',
 'I.O.',
 'originates',
 'Redux',
 'REVENGE',
 'comparative',
 '(naturally)',
 'Phillie',
 'LIQUID',
 'herbs,',
 'Porel),',
 'Parminder',
 "`solved'.<br",
 'Hangs

In [27]:
all_desc = np.array(all_desc)

In [28]:
all_desc.shape

(381542,)