In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
from tensorflow.keras.layers import LSTM, Dropout, Dense

In [8]:
%matplotlib inline

## Important Links to Follow Before Start
* ### [One Hot](https://youtu.be/f2fC1JUQ0Bk?list=PL1w8k37X_6L9s6pcqz4rAIEYZtF6zKjUE)
* ### [Word Embedding](https://youtu.be/8h8Z_pKyifM?list=PL1w8k37X_6L9s6pcqz4rAIEYZtF6zKjUE)
* ### [Word2Vec](https://youtu.be/UqRCEmrv1gQ?list=PLVBorYCcu-xWahQ0u2_guKSJ-0fc8VKb2)
* ### [Word2Wec with Gensim](https://youtu.be/UqRCEmrv1gQ?list=PLVBorYCcu-xWahQ0u2_guKSJ-0fc8VKb2)

# One Hot Encoding

In [9]:
from tensorflow.keras.preprocessing.text import one_hot

In [10]:
# Define Documents
doc = [
    'What a great day',       # doc1
    'well done',              # doc2
    'lets go for movie',
    'good morning sam',
    'all good ravi',
    'lets play game',
    'i will be busy tomorrow' # doc7 
]

In [11]:
vocab_size = 10000

In [12]:
encoded_doc = [one_hot(d, vocab_size) for d in doc]

In [13]:
print(encoded_doc)

[[1598, 6107, 3644, 6673], [6144, 9515], [2709, 5740, 8120, 7549], [3210, 6013, 8091], [5934, 3210, 9387], [2709, 5815, 6166], [3659, 520, 5540, 3069, 1906]]


# Embedding

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding



In [15]:
embedding_length = 5     # Each word is a vector of size 5   
max_doc_length = 10      # Each document is of size 10 containing word of size 5

encoded_docs = pad_sequences(encoded_doc, truncating='post', padding='post', maxlen=max_doc_length)

In [16]:
print(encoded_docs)

[[1598 6107 3644 6673    0    0    0    0    0    0]
 [6144 9515    0    0    0    0    0    0    0    0]
 [2709 5740 8120 7549    0    0    0    0    0    0]
 [3210 6013 8091    0    0    0    0    0    0    0]
 [5934 3210 9387    0    0    0    0    0    0    0]
 [2709 5815 6166    0    0    0    0    0    0    0]
 [3659  520 5540 3069 1906    0    0    0    0    0]]


In [17]:
test_model_for_embedding = Sequential()
test_model_for_embedding.add(Embedding(vocab_size, embedding_length, input_length=max_doc_length))
test_model_for_embedding.compile('rmsprop', 'mse')
output = test_model_for_embedding.predict(encoded_docs)
print(output)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
[[[-0.0407657   0.00082808 -0.03344538 -0.03795104 -0.00358046]
  [ 0.0038512  -0.02229105 -0.03164478 -0.01314639 -0.03597735]
  [ 0.02933874 -0.04330783 -0.0056602  -0.04495932  0.01970948]
  [ 0.02157218 -0.01278889 -0.02660227  0.01819995  0.0268586 ]
  [ 0.04511112  0.03394128 -0.0094348   0.04711225 -0.00564995]
  [ 0.04511112  0.03394128 -0.0094348   0.04711225 -0.00564995]
  [ 0.04511112  0.03394128 -0.0094348   0.04711225 -0.00564995]
  [ 0.04511112  0.03394128 -0.0094348   0.04711225 -0.00564995]
  [ 0.04511112  0.03394128 -0.0094348   0.04711225 -0.00564995]
  [ 0.04511112  0.03394128 -0.0094348   0.04711225 -0.00564995]]

 [[ 0.00950653  0.01752256 -0.01530689  0.00021323 -0.04021711]
  [ 0.01869992 -0.03854641 -0.03478084 -0.00684898 -0.03216634]
  [ 0.04511112  0.03394128 -0.0

# Sentiment Analysis

In [18]:
(x_train, y_train),(x_test, y_test) = imdb.load_data()

In [19]:
x_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [20]:
y_train

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [21]:
x_train = pad_sequences(x_train, truncating='post', padding='post', maxlen=100)
x_test = pad_sequences(x_test, truncating='post', padding='post', maxlen=100)

In [22]:
x_test

array([[   1,  591,  202, ...,    0,    0,    0],
       [   1,   14,   22, ...,  157,    9,    4],
       [   1,  111,  748, ...,   29,   93,   11],
       ...,
       [   1,   13, 1408, ...,    0,    0,    0],
       [   1,   11,  119, ...,    7,  470,    0],
       [   1,    6,   52, ...,   17,  210,    5]])

In [23]:
dic_size = 20000
embed_size = 128

In [24]:
sentiment_model = Sequential()

In [25]:
sentiment_model.add(Embedding(dic_size, embed_size, input_shape = (x_train.shape[1],)))

In [26]:
sentiment_model.add(LSTM(units = 128, activation = 'tanh'))

In [27]:
sentiment_model.add(Dense(units = 1, activation = 'softmax'))

In [28]:
sentiment_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


In [29]:
sentiment_model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [31]:
sentiment_model.fit(x_train, y_train, epochs=10, batch_size=128)

Train on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x21ba6933b08>