In [0]:
#Importing necesscary libraries
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import  Bidirectional
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import pandas as pd

In [109]:
#Creating a dataframe of relevant data and checking for important columns
df = pd.read_csv('tmdb_5000_movies.csv')
df.head(2).T

Unnamed: 0,0,1
budget,237000000,300000000
genres,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""..."
homepage,http://www.avatarmovie.com/,http://disney.go.com/disneypictures/pirates/
id,19995,285
keywords,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na..."
original_language,en,en
original_title,Avatar,Pirates of the Caribbean: At World's End
overview,"In the 22nd century, a paraplegic Marine is di...","Captain Barbossa, long believed to be dead, ha..."
popularity,150.438,139.083
production_companies,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""..."


In [110]:
#Cleaning the dataset and removing duplicate values
print('{:>28}'.format('entries from dataset:'), df.shape[0])
df = df.drop_duplicates(['original_title'])
print('{:>28}'.format('entries without duplication:'), df.shape[0])
df_clean = df[['genres', 'title', 'overview']].dropna()
print('{:>28}'.format('entries from cleaned data:'), df_clean.shape[0])

       entries from dataset: 4803
entries without duplication: 4801
  entries from cleaned data: 4798


In [0]:
#Split the records for each genre
df_genre = pd.DataFrame(columns = ['genre', 'cgenres','title','overview'])

def dataPrep(row):
    global df_genre
    d = {}
    genres = np.array([g['name'] for g in eval(row['genres'])])
    n = genres.size
    d['title'] = [row['title']]*n
    d['overview'] = [row['overview']]*n
    d['genre'], d['cgenres'] = [], []
    for genre in genres:
        d['genre'].append(genre)
        d['cgenres'].append(genres[genres != genre])
    df_genre = df_genre.append(pd.DataFrame(d), ignore_index=True)
df_clean.apply(dataPrep, axis=1)
df_genre = df_genre[['genre','cgenres','title','overview']]
df_genre = df_genre.infer_objects()

In [112]:
#Number of records
df_genre.shape

(12151, 4)

In [113]:
#Transferring the new clean dataframe to a new dataframe for sequential modelling
df_clean = df_genre
df_clean.head()

Unnamed: 0,genre,cgenres,title,overview
0,Action,"[Adventure, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Adventure,"[Action, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di..."
2,Fantasy,"[Action, Adventure, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di..."
3,Science Fiction,"[Action, Adventure, Fantasy]",Avatar,"In the 22nd century, a paraplegic Marine is di..."
4,Adventure,"[Fantasy, Action]",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."


In [0]:
#The three sets of genres for which plots(overviews) have to be generated 
x = ['Family','Thriller','Comedy']
y = ['Fantasy', 'Documentary', 'Action']
z = ['Family', 'Adventure']

In [115]:
#Selecting the genre
df_clean = df_clean.loc[df_clean['genre'].isin(y)]
print(len(df_clean))
df_clean.head()

1686


Unnamed: 0,genre,cgenres,title,overview
0,Action,"[Adventure, Fantasy, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di..."
2,Fantasy,"[Action, Adventure, Science Fiction]",Avatar,"In the 22nd century, a paraplegic Marine is di..."
5,Fantasy,"[Adventure, Action]",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
6,Action,"[Adventure, Fantasy]",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
7,Action,"[Adventure, Crime]",Spectre,A cryptic message from Bond’s past sends him o...


In [116]:
#Finding the number of plots and total characters
plots = df_clean['overview']

n_messages = len(plots)
n_chars = len(' '.join(map(str, plots)))

print("The number of plots are",n_messages)
print("Their content adds up to a total character count of", n_chars)

The number of plots are 1686
Their content adds up to a total character count of 547182


In [117]:
#Converting all characters to lowercase and creating a long sequence of relevant plots
sample_size = int(len(plots))

plots = plots[:sample_size]
plots = ''.join(map(str, plots)).lower()

plots[:100] # Show first 100 characters

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but '

In [118]:
#Finding the number of unique characters
chars = sorted(list(set(plots)))
print('Count of unique characters (i.e., features):', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

Count of unique characters (i.e., features): 79


In [119]:
#Modelling the sequences
maxlen = 50
step = 1
sentences = []
next_chars = []
for i in range(0, len(plots) - maxlen, step):
    sentences.append(plots[i: i + maxlen])
    next_chars.append(plots[i + maxlen])
print('Number of sequences:', len(sentences), "\n")

print(sentences[:10], "\n")
print(next_chars[:10])

Number of sequences: 545447 

['in the 22nd century, a paraplegic marine is dispat', 'n the 22nd century, a paraplegic marine is dispatc', ' the 22nd century, a paraplegic marine is dispatch', 'the 22nd century, a paraplegic marine is dispatche', 'he 22nd century, a paraplegic marine is dispatched', 'e 22nd century, a paraplegic marine is dispatched ', ' 22nd century, a paraplegic marine is dispatched t', '22nd century, a paraplegic marine is dispatched to', '2nd century, a paraplegic marine is dispatched to ', 'nd century, a paraplegic marine is dispatched to t'] 

['c', 'h', 'e', 'd', ' ', 't', 'o', ' ', 't', 'h']


In [0]:
#Converting the characters for inputs into the LSTM model
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [0]:
#Importing additional libraries
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback, ModelCheckpoint
import random
import sys
import io

In [0]:
#Creating the model
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [0]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def on_epoch_end(epoch, logs):
    # Function invoked for specified epochs. Prints generated text.
    # Using epoch+1 to be consistent with the training epochs printed by Keras
    if epoch+1 == 1 or epoch+1 == 15:
        print()
        print('----- Generating text after Epoch: %d' % epoch)

        start_index = random.randint(0, len(plots) - maxlen - 1)
        for diversity in [0.7]:
            print('----- diversity:', diversity)

            generated = ''
            sentence = plots[start_index: start_index + maxlen]
            generated += sentence
            print('----- Generating with seed: "' + sentence + '"')
            sys.stdout.write(generated)

            for i in range(400):
                x_pred = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_indices[char]] = 1.

                preds = model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                if (i%100==0):
                  print("\n")
                sys.stdout.flush()
            print()
    else:
        print()
        print('----- Not generating text after Epoch: %d' % epoch)

generate_text = LambdaCallback(on_epoch_end=on_epoch_end)

In [125]:
# fitting the model and producing plots for  for ['Fantasy', 'Documentary', 'Action']
    model.fit(x, y,
              batch_size=128,
              epochs=1,
              verbose=2,
              callbacks=[generate_text])

Epoch 1/1
 - 566s - loss: 1.8771

----- Generating text after Epoch: 0
----- diversity: 0.7
----- Generating with seed: "uld have a major impact on global politics. when m"
uld have a major impact on global politics. when mi

ddle of witne has strendst of the class aboue to an atanist gives caro also to soon who and long the

 peciales and all into the mussion an arries comes any a hopen sente box at his capatic deseat a sev

ien to transforms of the childrenus of has bely a set and a from superon was also just in the rooked

 protect he an every the boy and a become for a polace and to fighting his moved hampses of middici


<keras.callbacks.History at 0x7fe129e2eba8>

In [89]:
# fitting the model and producing plots for genres ['Family, 'Adventure']
    model.fit(x, y,
              batch_size=128,
              epochs=1,
              verbose=2,
              callbacks=[generate_text])

Epoch 1/1
 - 431s - loss: 1.8984

----- Generating text after Epoch: 0
----- diversity: 0.7
----- Generating with seed: "he last of his kind, but when word comes that jewe"
he last of his kind, but when word comes that jewel

e and its the chare of a son as believes susron to dony who massion and from traes is the mastered t

o with the forced to live his set to be the that the broodles the masion of fastrol about father bea

t the back in the loss enawes a haspay a trotent complete to return on a timting the faces of a beal

 and sucred years of every realh of a staring exector no merr of a rady wast all dog who death, shi


<keras.callbacks.History at 0x7fe10d623518>

In [107]:
# fitting the model and producing plots for genres ['Family','Thriller','Comedy']
    model.fit(x, y,
              batch_size=128,
              epochs=1,
              verbose=2,
              callbacks=[generate_text])

Epoch 1/1
 - 1061s - loss: 1.7938

----- Generating text after Epoch: 0
----- diversity: 0.7
----- Generating with seed: "cademy award winner geoffrey rush.a man and his fr"
cademy award winner geoffrey rush.a man and his fri

ends between and the character, james of by body pressee the house of the first uncovers a long the 

couranh, and the coust of the dofter of a hoternd with to the gramoure and rome of the mell cea youn

g core and south who trive to ternac could to pretent of the boy of end the chace or all to does tim

e of his top of the city, a storylo, the first with sean billive to the fortunater finds a girlfrie


<keras.callbacks.History at 0x7fdff4269f98>