In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
from tensorflow.keras.layers import LSTM, Dropout, Dense

In [3]:
%matplotlib inline

## Important Links to Follow Before Start
* ### [One Hot](https://youtu.be/f2fC1JUQ0Bk?list=PL1w8k37X_6L9s6pcqz4rAIEYZtF6zKjUE)
* ### [Word Embedding](https://youtu.be/8h8Z_pKyifM?list=PL1w8k37X_6L9s6pcqz4rAIEYZtF6zKjUE)
* ### [Word2Vec](https://youtu.be/UqRCEmrv1gQ?list=PLVBorYCcu-xWahQ0u2_guKSJ-0fc8VKb2)
* ### [Word2Wec with Gensim](https://youtu.be/UqRCEmrv1gQ?list=PLVBorYCcu-xWahQ0u2_guKSJ-0fc8VKb2)

# One Hot Encoding

In [4]:
from tensorflow.keras.preprocessing.text import one_hot

In [5]:
# Define Documents
doc = [
    'What a great day',       # doc1
    'well done',              # doc2
    'lets go for movie',
    'good morning sam',
    'all good ravi',
    'lets play game',
    'i will be busy tomorrow' # doc7 
]

In [6]:
vocab_size = 10000

In [7]:
encoded_doc = [one_hot(d, vocab_size) for d in doc]

In [8]:
print(encoded_doc)

[[3598, 1205, 6132, 1142], [1822, 629], [7522, 9331, 5059, 5360], [6900, 8488, 1714], [8381, 6900, 3241], [7522, 7015, 7008], [5123, 2649, 2359, 7853, 3760]]


# Embedding

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding



In [10]:
embedding_length = 5     # Each word is a vector of size 5   
max_doc_length = 10      # Each document is of size 10 containing word of size 5

encoded_docs = pad_sequences(encoded_doc, truncating='post', padding='post', maxlen=max_doc_length)

In [11]:
print(encoded_docs)

[[3598 1205 6132 1142    0    0    0    0    0    0]
 [1822  629    0    0    0    0    0    0    0    0]
 [7522 9331 5059 5360    0    0    0    0    0    0]
 [6900 8488 1714    0    0    0    0    0    0    0]
 [8381 6900 3241    0    0    0    0    0    0    0]
 [7522 7015 7008    0    0    0    0    0    0    0]
 [5123 2649 2359 7853 3760    0    0    0    0    0]]


In [12]:
test_model_for_embedding = Sequential()
test_model_for_embedding.add(Embedding(vocab_size, embedding_length, input_length=max_doc_length))
test_model_for_embedding.compile('rmsprop', 'mse')
output = test_model_for_embedding.predict(encoded_docs)
print(output)

[[[ 0.02191532 -0.02286979 -0.01333144 -0.04876316 -0.00194839]
  [ 0.0356927  -0.04041449 -0.00458374  0.03627893 -0.00322413]
  [-0.03345238 -0.04686301 -0.00079614  0.02557181  0.01655671]
  [ 0.02212344  0.0309807   0.02177382 -0.02774094  0.04610695]
  [ 0.04480812 -0.04044288 -0.01718155 -0.04837134 -0.01038669]
  [ 0.04480812 -0.04044288 -0.01718155 -0.04837134 -0.01038669]
  [ 0.04480812 -0.04044288 -0.01718155 -0.04837134 -0.01038669]
  [ 0.04480812 -0.04044288 -0.01718155 -0.04837134 -0.01038669]
  [ 0.04480812 -0.04044288 -0.01718155 -0.04837134 -0.01038669]
  [ 0.04480812 -0.04044288 -0.01718155 -0.04837134 -0.01038669]]

 [[-0.03989177  0.01226841  0.03556999  0.04809595  0.00817827]
  [-0.0361161  -0.02715679  0.01747483 -0.03065367  0.04357379]
  [ 0.04480812 -0.04044288 -0.01718155 -0.04837134 -0.01038669]
  [ 0.04480812 -0.04044288 -0.01718155 -0.04837134 -0.01038669]
  [ 0.04480812 -0.04044288 -0.01718155 -0.04837134 -0.01038669]
  [ 0.04480812 -0.04044288 -0.01718155

# Sentiment Analysis

In [13]:
(x_train, y_train),(x_test, y_test) = imdb.load_data()

In [14]:
x_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [15]:
y_train

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [73]:
x_train = pad_sequences(x_train, truncating='post', padding='post', maxlen=100)
x_test = pad_sequences(x_test, truncating='post', padding='post', maxlen=100)

In [79]:
x_test

array([[   1,  591,  202, ...,    0,    0,    0],
       [   1,   14,   22, ...,  157,    9,    4],
       [   1,  111,  748, ...,   29,   93,   11],
       ...,
       [   1,   13, 1408, ...,    0,    0,    0],
       [   1,   11,  119, ...,    7,  470,    0],
       [   1,    6,   52, ...,   17,  210,    5]])

In [140]:
dic_size = 20000
embed_size = 128

In [141]:
sentiment_model = Sequential()

In [142]:
sentiment_model.add(Embedding(dic_size, embed_size, input_shape = (x_train.shape[1],)))

In [143]:
sentiment_model.add(LSTM(units = 128, activation = 'tanh'))

In [144]:
sentiment_model.add(Dense(units = 1, activation = 'softmax'))

In [145]:
sentiment_model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 100, 128)          2560000   
_________________________________________________________________
lstm_19 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


In [146]:
sentiment_model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [147]:
sentiment_model.fit(x_train, y_train, epochs=3, batch_size=128)

Train on 25000 samples
Epoch 1/3
  128/25000 [..............................] - ETA: 5:05

InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  indices[120,41] = 74421 is not in [0, 20000)
	 [[node sequential_13/embedding_13/embedding_lookup (defined at <ipython-input-147-e627f99887bb>:1) ]]
	 [[VariableShape/_22]]
  (1) Invalid argument:  indices[120,41] = 74421 is not in [0, 20000)
	 [[node sequential_13/embedding_13/embedding_lookup (defined at <ipython-input-147-e627f99887bb>:1) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_distributed_function_50584]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_13/embedding_13/embedding_lookup:
 sequential_13/embedding_13/embedding_lookup/49397 (defined at C:\Users\eds16\Anaconda3\lib\contextlib.py:112)

Input Source operations connected to node sequential_13/embedding_13/embedding_lookup:
 sequential_13/embedding_13/embedding_lookup/49397 (defined at C:\Users\eds16\Anaconda3\lib\contextlib.py:112)

Function call stack:
distributed_function -> distributed_function
