In this section I walk through a simple LSTM model built with Keras, following the tutorial [here](http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/):

In [12]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import theano
# Fix NumPy's global random seed for reproducibility.
numpy.random.seed(7)
# Load the IMDB dataset, keeping only the top `top_words` words of the vocabulary.
# NOTE(review): the remaining words do not appear to be zeroed -- the first review
# printed further down contains many 2s, presumably Keras's default
# out-of-vocabulary index. Confirm against the `imdb.load_data` documentation.
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words)

In [3]:
# Check the dimensions of the training inputs and of their labels.
print(X_train.shape, y_train.shape)

(25000,) (25000,)


In [7]:
# Each of the 25000 entries is one review. Show the first review and its label,
# then the lengths of the first two reviews.
print(X_train[0], y_train[0], sep="\n")
print(*(len(review) for review in X_train[:2]))

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]
1
218 189


In [9]:
# The data is already preprocessed: every review is a sequence of integers,
# and the sequences have different lengths.
# Sum the lengths over both splits, then floor-divide by the number of reviews.
total_len = sum(map(len, X_train)) + sum(map(len, X_test))
print(total_len // (X_train.shape[0] + X_test.shape[0]))  # average

234


In [13]:
# Pad or trim every review so that all of them have exactly
# `max_review_length` entries (the printed sample shows zeros added at the front).
max_review_length = 500
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=max_review_length)
    for split in (X_train, X_test)
)

print(X_train[0])
print(*(len(review) for review in X_train[:2]))

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0  

In [30]:
# All reviews now have the same length (max_review_length), which is the fixed
# `input_length` the Embedding layer below is declared with.
# Create the model.
embedding_vector_length = 32

# We are using a Sequential model here (layers are stacked one after another);
# I will approach a functional model later.
model = Sequential()
# Each review is a sequence of integer word indices, so why is an embedding needed?
# The indices themselves say nothing about how words relate to each other. The
# Embedding layer maps every index to a vector of `embedding_vector_length` values
# so that distances between words can differ: we don't need the distance between
# 'north' and 'south' to be the same as between 'modi' and 'kejriwal'.
# Pre-trained vectors such as GloVe could be used here instead.
# There are other reasons also.
embedding = Embedding(top_words, embedding_vector_length, input_length=max_review_length)
model.add(embedding)
# LSTM of 100 hidden units (more on it later)
lstm = LSTM(100)
model.add(lstm)
# A single sigmoid unit paired with binary cross-entropy: the labels are 0/1.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Functions to inspect the model: each maps the model input to the output of one
# intermediate layer (the embedding, and the LSTM on top of it).
# NOTE(review): these call theano directly, so presumably they only work when
# Keras runs on the Theano backend -- confirm before switching backends.
embedding_out = theano.function([model.input], [embedding.output])
lstm_out = theano.function([model.input], [lstm.output])

In [33]:
# Run the first padded review through the embedding layer only. The review is
# wrapped in a list so the function receives a batch holding a single sample.
e_out = embedding_out([X_train[0]])
print(len(e_out), e_out[0].shape, e_out)

1 (1, 500, 32) [array([[[-0.01909696, -0.00128354, -0.04350078, ...,  0.04515557,
         -0.04617136, -0.03478462],
        [-0.01909696, -0.00128354, -0.04350078, ...,  0.04515557,
         -0.04617136, -0.03478462],
        [-0.01909696, -0.00128354, -0.04350078, ...,  0.04515557,
         -0.04617136, -0.03478462],
        ..., 
        [ 0.02845694,  0.01439202, -0.04001575, ...,  0.04781301,
         -0.04650199, -0.01355803],
        [ 0.01874172,  0.03280069,  0.00071394, ...,  0.01790861,
         -0.00488006, -0.00624097],
        [ 0.04649385,  0.01548138,  0.00673046, ..., -0.0140023 ,
         -0.02662515,  0.03282387]]], dtype=float32)]


In [34]:
# Reading the embedding output printed above:
#   * The input has `input_length = max_review_length` entries
#     (that's 500 = len(X_train[0])).
#   * The embedding layer was created with:
#       first_argument  = input_dim  = top_words = 5000
#                         (vocabulary size: every word index in the input must be below this)
#       second_argument = output_dim = 32
#     More here: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#   * So the output has shape (1, 500, 32): the leading 1 is the number of samples
#     in the batch (we passed a single review; it is not the number of epochs),
#     500 is the number of words per review, and each word is a vector of
#     32 values (output_dim).
# Now run the same review through the embedding followed by the LSTM.
l_out = lstm_out([X_train[0]])
print(l_out[0].shape, l_out)

(1, 100) [array([[ -8.75755120e-03,   2.35159490e-02,   7.24158762e-03,
          5.75434649e-03,   7.76602514e-03,   6.44575851e-03,
         -1.07395388e-02,  -1.99354794e-02,  -1.67083833e-02,
          1.39765628e-02,  -9.82308853e-03,   1.44931360e-03,
          1.96614452e-02,   1.79646432e-03,  -3.28742759e-03,
         -7.29508698e-04,   1.29413130e-02,  -2.22204439e-03,
         -8.85574613e-03,  -7.34617747e-03,  -2.32800539e-03,
         -3.01935384e-03,  -1.22768572e-02,   2.00342480e-02,
         -1.16039836e-03,   1.11591378e-02,   4.85542044e-03,
          1.24953324e-02,   1.13097318e-02,   1.64835844e-02,
         -1.26700439e-02,  -1.70683097e-02,   3.61562381e-03,
         -8.89317039e-03,  -9.64088645e-03,   7.36068236e-03,
          5.49160503e-03,   2.23985426e-02,  -9.45511088e-03,
          7.04189064e-03,  -3.34612802e-02,   2.03254614e-02,
         -2.15605870e-02,  -2.68418971e-03,  -7.75714964e-03,
          5.58061386e-03,  -1.00356992e-02,  -1.92607623e-02

In [37]:
# In LSTM, the first argument is the output dimension.
# So the LSTM output printed above has shape (1, 100): the leading 1 is the number
# of samples in the batch (a single review, not the number of epochs), and 100 is
# the output dimension (output_dim).
# `model.summary()` prints the table itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line after the table.
model.summary()
# The dense layer is easy to understand: one sigmoid unit fed by the 100 LSTM outputs.

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_8 (Embedding)          (None, 500, 32)       160000      embedding_input_8[0][0]          
____________________________________________________________________________________________________
lstm_8 (LSTM)                    (None, 100)           53200       embedding_8[0][0]                
____________________________________________________________________________________________________
dense_8 (Dense)                  (None, 1)             101         lstm_8[0][0]                     
Total params: 213301
____________________________________________________________________________________________________
None


In [38]:
# Train for 3 epochs with batches of 64 reviews.
# The test split is passed as validation data, so the validation metrics
# reported after each epoch are computed on the test set.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    nb_epoch=3,
    batch_size=64,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fed112617b8>

In [39]:
# Caveat: the test split evaluated here is the same data that was passed as
# validation data during training, so it is not a fully held-out test set.
# Index 1 of the result is the accuracy metric requested when compiling the model.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (100 * scores[1]))

Accuracy: 86.61%
