# Text mining with LSTM

Keras examples on Github: https://github.com/keras-team/keras/tree/master/examples

In [3]:
import tensorflow as tf
from tensorflow import keras
# NOTE(review): `keras` above is tf.keras, but the next two imports pull LSTM and
# Bidirectional from the separate standalone `keras` package. The model cells in this
# notebook build their layers through keras.layers.LSTM / keras.layers.Bidirectional,
# so these two names look unused here — confirm before mixing the two packages.
from keras.layers import LSTM
from keras.layers import  Bidirectional
import numpy as np
import pandas as pd

# Record the TensorFlow version this notebook is running against.
print(tf.__version__)

1.12.0


Using TensorFlow backend.


In [10]:
# This cell sits above the cell that builds `tmdb`, so read the CSV here as
# well; otherwise a fresh-kernel "Restart & Run All" stops with a NameError.
tmdb = pd.read_csv('tmdb_5000_movies.csv')

# List the available columns.
tmdb.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

Import TMDB Data

In [0]:
# Creating a dataframe of relevant data
tmdb = pd.read_csv('tmdb_5000_movies.csv')

# Fill missing values per column type: a blanket fillna(0) would put the
# integer 0 into text columns such as 'overview', leaving mixed str/int
# columns that break any later string processing.
text_fill = {col: '' for col in tmdb.columns
             if not pd.api.types.is_numeric_dtype(tmdb[col])}
tmdb = tmdb.fillna(text_fill).fillna(0)

# Independent copy so later edits to `plots` never touch `tmdb`
# (and never trigger SettingWithCopyWarning).
plots = tmdb[['title', 'genres', 'overview']].copy()


In [0]:
# Load the IMDB review dataset. No other cell in this notebook defines
# train_data / train_labels / test_data, so without this the notebook cannot
# run from a fresh kernel. num_words matches vocab_size (10000) used by the
# Embedding layers further down, so every token index is within range.
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))

Training entries: 25000, labels: 25000


Helper function to decode integers to words

In [0]:
# A dictionary mapping words to an integer index.
# The dataset module is reached through `keras` (imported at the top of the
# notebook) so this cell does not depend on an `imdb` name from another cell.
word_index = keras.datasets.imdb.get_word_index()

# The first indices are reserved, so shift every real word up by 3
word_index = {word: idx + 3 for word, idx in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

# Inverse lookup: integer index -> word
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode_review(text, index_to_word=None):
    """Turn a sequence of integer token ids back into a space-separated string.

    Args:
        text: iterable of integer token ids.
        index_to_word: optional mapping from token id to word. When omitted,
            the notebook-level `reverse_word_index` is used.

    Returns:
        The decoded words joined by single spaces; ids missing from the
        mapping are rendered as '?'.
    """
    if index_to_word is None:
        # Default to the lookup table built in this notebook.
        index_to_word = reverse_word_index
    return ' '.join(index_to_word.get(i, '?') for i in text)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


## Data Prep
Padding vectors to be of same length

In [0]:
# Both splits are padded with the same settings: fill with the <PAD> index,
# add the padding at the end, and fix the length at 256.
_pad_options = dict(value=word_index["<PAD>"], padding='post', maxlen=256)

train_data = keras.preprocessing.sequence.pad_sequences(train_data, **_pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **_pad_options)

In [0]:
# Size of the second axis of train_data (the per-review sequence length).
padded_length = train_data.shape[1]
print(padded_length)

256


## Build LSTM Model

In [0]:
# The Embedding's input dimension is the vocabulary size used for the movie
# reviews (10,000 words); each token is mapped to a 32-dimensional vector.
vocab_size = 10000
embedding_dim = 32
dropout_rate = 0.1

model = keras.Sequential()
# mask_zero=True: index 0 is <PAD> and the reviews are post-padded, so without
# a mask the LSTM keeps stepping through the padding and its final state is
# computed after a long run of <PAD> tokens instead of at the end of the review.
model.add(keras.layers.Embedding(vocab_size, embedding_dim,
                                 input_length=train_data.shape[1],
                                 mask_zero=True))
model.add(keras.layers.Dropout(dropout_rate))
model.add(keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit for the binary label.
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 32)           320000    
_________________________________________________________________
dropout (Dropout)            (None, 256, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               82432     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 402,561
Trainable params: 402,561
Non-trainable params: 0
_________________________________________________________________


Compile

In [0]:
# Use the Keras 'adam' identifier rather than tf.train.AdamOptimizer(): the
# tf.train optimizer only exists in TensorFlow 1.x and its state cannot be
# saved together with a Keras model, while the string works on 1.x and 2.x.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

Validation Data

In [0]:
# Hold out the first 10,000 training examples (and their labels) for
# validation; the remainder is what the model is fitted on.
val_size = 10000

x_val, partial_x_train = train_data[:val_size], train_data[val_size:]
y_val, partial_y_train = train_labels[:val_size], train_labels[val_size:]

Training

In [0]:
# Fit on the partial training split for 2 epochs with batches of 32,
# passing the held-out split as validation data.
fit_options = dict(
    epochs=2,
    batch_size=32,
    validation_data=(x_val, y_val),
    verbose=1,
)
history = model.fit(partial_x_train, partial_y_train, **fit_options)

Train on 15000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2


## Bidirectional LSTM model

In [0]:
# The Embedding's input dimension is the vocabulary size used for the movie
# reviews (10,000 words); each token is mapped to a 50-dimensional vector.

vocab_size = 10000
embedding_dim = 50
dropout_rate = 0.1

# NOTE: this rebinds `model`, replacing any model previously held in that name.
model = keras.Sequential()
# mask_zero=True: index 0 is <PAD> and the reviews are post-padded, so the
# recurrent layer should skip the padded timesteps rather than treat them as
# real tokens.
model.add(keras.layers.Embedding(vocab_size, embedding_dim,
                                 input_length=train_data.shape[1],
                                 mask_zero=True))
model.add(keras.layers.Dropout(dropout_rate))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(64)))
model.add(keras.layers.Dropout(dropout_rate))
# Single sigmoid unit for the binary label.
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 256, 50)           500000    
_________________________________________________________________
dropout_8 (Dropout)          (None, 256, 50)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               58880     
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 559,009
Trainable params: 559,009
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Use the Keras 'adam' identifier rather than tf.train.AdamOptimizer(): the
# tf.train optimizer only exists in TensorFlow 1.x and its state cannot be
# saved together with a Keras model, while the string works on 1.x and 2.x.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [0]:
# Same training setup as before: 2 epochs, batches of 32, held-out split
# passed as validation data. Note that `history` is rebound here.
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    batch_size=32,
    epochs=2,
    verbose=1,
    validation_data=(x_val, y_val),
)

Train on 15000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2
