In [27]:
# Imports
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

import numpy as np

In [28]:
# Step 1: Load the corpus (user input - hardcode here) for which you want prediction from the model trained in imdb dataset.

# maximum_vocabulary_size - this is the maximum number of words we want to use in our vocabulary. 
#  - If your dataset only has 2,000 unique words, setting 10,000 just allocates extra unused capacity
#  - This is a hyperparameter and you can change it as per your requirement.
# - Common ranges:
# - Small dataset → 5k–10k
# - Medium → 20k–50k
# - Large → 100k+

# max_words_in_sentence = maximum words allowed in a sentence
# - This is also a hyperparameter
# - Most of the sentences below have 4 to 5 words.
# - We will pad the sentence with 0's which for sentences having less than 10 words.

# max_size_feature_matrix_dimension 
# - Sometimes we create a Vector Matrix (text to numerical conversion) using OHE but that has some drawbacks like Sparse matrix (leading to overfitting)
# - In this example we are using a Embedding Vector Matrix to convert the text in corpus to numerical data.
# - The numerical Embedding Vector Matrix is created by the co-relation between (1) Feature Matrix and (2) Words in the corpus/sentences.
# - The Feature matrix is a built in functionality provided by Tensor Flow.

maximum_vocabulary_size=10000
max_words_in_sentence=8
max_size_feature_matrix_dimension = 10
sentences=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good',]
sentences

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [29]:
# Step 2: Create the One Hot Representation for the entire corpus
# - This will convert each sentence into a list of integers, 
# - where each integer represents the index in OHE will it will be
# - e.g. an index=3091 will represent that the 3091st value = '1' and rest will be '0' in the OHE vector for that word.
one_hot_representation=[one_hot(words,maximum_vocabulary_size)for words in sentences]
one_hot_representation

[[3091, 6457, 9616, 6566],
 [3091, 6457, 9616, 6892],
 [3091, 2301, 9616, 2942],
 [6067, 5050, 52, 5127, 7790],
 [6067, 5050, 52, 5127, 3063],
 [6752, 3091, 9904, 9616, 56],
 [1903, 1299, 6815, 5127]]

In [None]:
# Step 3 - embedded_sentences = Pad the sentences (from the corpus) with 0's wherever the number of words is less than the 10 (i.e. max_words_in_sentence)
# pre = pad with 0's at the beginning of the sentence
# post = pad with 0's at the end of the sentence
embedded_sentences=pad_sequences(one_hot_representation,padding='pre',maxlen=max_words_in_sentence)
print(embedded_sentences)

[[   0    0    0    0 3091 6457 9616 6566]
 [   0    0    0    0 3091 6457 9616 6892]
 [   0    0    0    0 3091 2301 9616 2942]
 [   0    0    0 6067 5050   52 5127 7790]
 [   0    0    0 6067 5050   52 5127 3063]
 [   0    0    0 6752 3091 9904 9616   56]
 [   0    0    0    0 1903 1299 6815 5127]]


In [31]:
# Step 4) Create the Simple RNN model
# - Optimizer = adam
# - Loss = mse (mean squared error)
simple_rnn_model=Sequential()
simple_rnn_model.add(Embedding(maximum_vocabulary_size,max_size_feature_matrix_dimension,input_length=max_words_in_sentence))
simple_rnn_model.compile('adam','mse')

simple_rnn_model.summary()



In [32]:
simple_rnn_model.predict(embedded_sentences[0])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


array([[-0.02335816,  0.04684843,  0.03346795, -0.00515372, -0.03907485,
        -0.02256098, -0.02770551,  0.02614475, -0.00350161, -0.01254909],
       [-0.02335816,  0.04684843,  0.03346795, -0.00515372, -0.03907485,
        -0.02256098, -0.02770551,  0.02614475, -0.00350161, -0.01254909],
       [-0.02335816,  0.04684843,  0.03346795, -0.00515372, -0.03907485,
        -0.02256098, -0.02770551,  0.02614475, -0.00350161, -0.01254909],
       [-0.02335816,  0.04684843,  0.03346795, -0.00515372, -0.03907485,
        -0.02256098, -0.02770551,  0.02614475, -0.00350161, -0.01254909],
       [ 0.03415037, -0.03608763, -0.03714902,  0.01151562,  0.02168988,
         0.01019244, -0.02505626,  0.02347218, -0.03451884,  0.03540734],
       [-0.0149675 , -0.04176452, -0.03624142,  0.00864894, -0.0376987 ,
         0.01381013, -0.03250038,  0.01351712,  0.04853537,  0.04407034],
       [ 0.04064978, -0.01172294,  0.00267683,  0.03931511,  0.03503333,
         0.01870123, -0.01798465,  0.04737829