In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from pickle import dump
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from ipywidgets import SelectionSlider

Using TensorFlow backend.


In [2]:
# Read the IMDB export and discard the stray index column that came with it.
df = pd.read_csv("IMDB_Sept.csv").drop(columns='Unnamed: 0')

# Restrict to popular movies: only rows with at least 100000 IMDB votes survive.
is_popular = df['Votes'] >= 100000
df = df.loc[is_popular]

# Everything listed here is dropped; the shape printed below shows that a
# single column is left afterwards.
unused_columns = [
    'Budget($)', 'Genres', 'IMDB_Score', 'Metascore', 'Production_Company',
    'Rated', 'Release_Date', 'Storyline', 'Summary', 'Votes', 'Director',
    'Runtime', 'Actors',
]
df = df.drop(columns=unused_columns).dropna()
df.shape

(1709, 1)

In [3]:
# Build the training vocabulary: one row per word taken from the movie titles.

# Cut the last 7 characters off every title (the year suffix, according to the
# original notebook comment) and lowercase what is left.
titles = [title[:-7].lower() for title in df.Title]

# Extract runs of ASCII letters only. Digits, punctuation and hyphens are
# already excluded by this pattern, so no separate '-' clean-up is needed.
word_pat = r'[A-Za-z]+'
words = [word for title in titles for word in re.findall(word_pat, title)]

# Keep words with more than 3 characters (i.e. 4 or longer) as a flat 1-D array.
# Flattening with a comprehension avoids wrapping per-title lists of different
# lengths in np.array(), which recent NumPy versions reject with a ValueError.
titles = np.array([word for word in words if len(word) > 3])

train = pd.DataFrame(titles, columns=['Word'])
train.head(5)

Unnamed: 0,Word
0,bend
1,like
2,beckham
3,yojimbo
4,times


In [4]:
# `train` holds one row per extracted word, so the first number is the word count.
train.shape

(2982, 1)

In [5]:
# Store each word's character count, then show the five longest words.
train['word_length'] = train.Word.map(len)
train.sort_values(by='word_length').tail()

Unnamed: 0,Word,word_length
1603,transcendence,13
915,adventureland,13
1365,revolutionary,13
1799,predestination,14
1123,blackkklansman,14


In [6]:
# Split every word into its list of characters.
# The list of lists is assigned directly: wrapping it in np.array() would build
# a ragged array (rejected by recent NumPy versions) or, if every word happened
# to have the same length, a 2-D array that cannot be stored in one column.
train['chars'] = [list(word) for word in train.Word]
train.sort_values(by='word_length').tail()

Unnamed: 0,Word,word_length,chars
1603,transcendence,13,"[t, r, a, n, s, c, e, n, d, e, n, c, e]"
915,adventureland,13,"[a, d, v, e, n, t, u, r, e, l, a, n, d]"
1365,revolutionary,13,"[r, e, v, o, l, u, t, i, o, n, a, r, y]"
1799,predestination,14,"[p, r, e, d, e, s, t, i, n, a, t, i, o, n]"
1123,blackkklansman,14,"[b, l, a, c, k, k, k, l, a, n, s, m, a, n]"


In [7]:
# Turn every word into (prefix, next character) training pairs.
# Each prefix is right-padded with '_' up to the length of the longest word,
# and '_' is also the target that follows a word's final character.
max_length = train.word_length.max()
space = ['_']

X, y = [], []
for word in train.chars:
    terminated = word + space  # new list: the word followed by the end marker

    # `end` is the prefix length; the character at that index is the target.
    for end in range(1, len(terminated)):
        padding = space * (max_length - end)
        X.append(terminated[:end] + padding)
        y.append(terminated[end])

X = np.array(X)
y = np.array(y)
X.shape, y.shape

((18006, 14), (18006,))

In [8]:
# Tabular view of the training pairs: one 'posN' column per position of the
# padded prefix plus the character to predict.
# The column count is read from X instead of being hard-coded, so the cell
# keeps working when the longest word in the corpus changes.
column_names = ['pos' + str(i) for i in range(X.shape[1])]
seq_df = pd.DataFrame(X, columns=column_names)
seq_df['target'] = y
seq_df.head()

Unnamed: 0,pos0,pos1,pos2,pos3,pos4,pos5,pos6,pos7,pos8,pos9,pos10,pos11,pos12,pos13,target
0,b,_,_,_,_,_,_,_,_,_,_,_,_,_,e
1,b,e,_,_,_,_,_,_,_,_,_,_,_,_,n
2,b,e,n,_,_,_,_,_,_,_,_,_,_,_,d
3,b,e,n,d,_,_,_,_,_,_,_,_,_,_,_
4,l,_,_,_,_,_,_,_,_,_,_,_,_,_,i


In [11]:
# Discard prefixes that consist of a single character (second position is
# still the '_' padding).
# NOTE(review): in the cells visible here the model is trained on X / y, not on
# seq_df, so this filter does not change the training data - confirm intent.
has_second_char = seq_df['pos1'].ne('_')
seq_df = seq_df.loc[has_second_char]
seq_df.head()

Unnamed: 0,pos0,pos1,pos2,pos3,pos4,pos5,pos6,pos7,pos8,pos9,pos10,pos11,pos12,pos13,target
1,b,e,_,_,_,_,_,_,_,_,_,_,_,_,n
2,b,e,n,_,_,_,_,_,_,_,_,_,_,_,d
3,b,e,n,d,_,_,_,_,_,_,_,_,_,_,_
5,l,i,_,_,_,_,_,_,_,_,_,_,_,_,k
6,l,i,k,_,_,_,_,_,_,_,_,_,_,_,e


In [12]:
# Rows left after removing the single-character prefixes.
seq_df.shape

(15024, 15)

In [13]:
# Quick look at the word table with its length and character-list columns.
train.head()

Unnamed: 0,Word,word_length,chars
0,bend,4,"[b, e, n, d]"
1,like,4,"[l, i, k, e]"
2,beckham,7,"[b, e, c, k, h, a, m]"
3,yojimbo,7,"[y, o, j, i, m, b, o]"
4,times,5,"[t, i, m, e, s]"


In [14]:
# Same number of rows as before; the extra columns are word_length and chars.
train.shape

(2982, 3)

In [15]:
# Character <-> integer lookup tables: the characters found in the corpus are
# numbered from 1 in sorted order, and 0 is reserved for the '_' marker.
chars = np.hstack(train.chars)
ordered_chars = sorted(set(chars)) # chars in our corpus
indices = np.arange(1, len(ordered_chars) + 1) # indices for encoding

char_to_number = {char: index for char, index in zip(ordered_chars, indices)}
char_to_number['_'] = 0

number_to_char = {index: char for index, char in zip(indices, ordered_chars)}
number_to_char[0] = '_'

char_to_number, number_to_char

({'a': 1,
  'b': 2,
  'c': 3,
  'd': 4,
  'e': 5,
  'f': 6,
  'g': 7,
  'h': 8,
  'i': 9,
  'j': 10,
  'k': 11,
  'l': 12,
  'm': 13,
  'n': 14,
  'o': 15,
  'p': 16,
  'q': 17,
  'r': 18,
  's': 19,
  't': 20,
  'u': 21,
  'v': 22,
  'w': 23,
  'x': 24,
  'y': 25,
  'z': 26,
  '_': 0},
 {1: 'a',
  2: 'b',
  3: 'c',
  4: 'd',
  5: 'e',
  6: 'f',
  7: 'g',
  8: 'h',
  9: 'i',
  10: 'j',
  11: 'k',
  12: 'l',
  13: 'm',
  14: 'n',
  15: 'o',
  16: 'p',
  17: 'q',
  18: 'r',
  19: 's',
  20: 't',
  21: 'u',
  22: 'v',
  23: 'w',
  24: 'x',
  25: 'y',
  26: 'z',
  0: '_'})

In [16]:
# save the mappings
# Context managers guarantee each pickle file is flushed and closed, instead of
# leaving the handles open until garbage collection.
with open('char_models/number_to_char.pkl', 'wb') as mapping_file:
    dump(number_to_char, mapping_file)
with open('char_models/char_to_number.pkl', 'wb') as mapping_file:
    dump(char_to_number, mapping_file)

## Encode chars to numbers

In [17]:
# Replace every character of every padded prefix with its integer code, then
# append a trailing axis of size 1 so the array is 3-D, matching the
# (X_num.shape[1], X_num.shape[2]) input shape the LSTM layer is built with.
X_num = np.array([[char_to_number[ch] for ch in row] for row in X])
X_num = X_num.reshape(X.shape + (1,))
X_num.shape

(18006, 14, 1)

In [18]:
# Integer-encode the target characters and one-hot them.
# num_classes is pinned to the size of the character table so the label width
# always equals the softmax layer's output size, even if the highest-numbered
# character never occurs as a target.
y_num = np.array([char_to_number[i] for i in y])
y_num = to_categorical(y_num, num_classes=len(char_to_number))
y_num.shape

(18006, 27)

In [19]:
# define model
# One LSTM layer reads the padded sequence of character codes; a small dense
# layer with light dropout follows, and the softmax has one output per entry of
# number_to_char (the corpus characters plus the '_' marker).
model = Sequential()
model.add(LSTM(100, input_shape=(X_num.shape[1], X_num.shape[2]), return_sequences=False))
model.add(Dense(60, activation='relu'))
model.add(Dropout(.1))
model.add(Dense(len(number_to_char), activation='softmax'))
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() adds a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               40800     
_________________________________________________________________
dense_1 (Dense)              (None, 60)                6060      
_________________________________________________________________
dropout_1 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 27)                1647      
Total params: 48,507
Trainable params: 48,507
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
# The returned History is kept so the per-epoch loss/accuracy stay available
# after training and the cell no longer ends by echoing the object's repr.
history = model.fit(X_num, y_num, epochs=150, verbose=2)

Epoch 1/150
 - 5s - loss: 2.7377 - accuracy: 0.2109
Epoch 2/150
 - 4s - loss: 2.6657 - accuracy: 0.2295
Epoch 3/150
 - 4s - loss: 2.6363 - accuracy: 0.2329
Epoch 4/150
 - 4s - loss: 2.6196 - accuracy: 0.2390
Epoch 5/150
 - 4s - loss: 2.5969 - accuracy: 0.2399
Epoch 6/150
 - 4s - loss: 2.5787 - accuracy: 0.2415
Epoch 7/150
 - 4s - loss: 2.5648 - accuracy: 0.2436
Epoch 8/150
 - 4s - loss: 2.5497 - accuracy: 0.2433
Epoch 9/150
 - 4s - loss: 2.5307 - accuracy: 0.2468
Epoch 10/150
 - 4s - loss: 2.5148 - accuracy: 0.2489
Epoch 11/150
 - 4s - loss: 2.4942 - accuracy: 0.2522
Epoch 12/150
 - 5s - loss: 2.4765 - accuracy: 0.2556
Epoch 13/150
 - 5s - loss: 2.4503 - accuracy: 0.2569
Epoch 14/150
 - 5s - loss: 2.4337 - accuracy: 0.2604
Epoch 15/150
 - 4s - loss: 2.4028 - accuracy: 0.2661
Epoch 16/150
 - 4s - loss: 2.3764 - accuracy: 0.2709
Epoch 17/150
 - 4s - loss: 2.3441 - accuracy: 0.2751
Epoch 18/150
 - 4s - loss: 2.3278 - accuracy: 0.2814
Epoch 19/150
 - 4s - loss: 2.2968 - accuracy: 0.2834
Ep

<keras.callbacks.callbacks.History at 0x638b6df98>

In [21]:
# save the model to file
# The same path is read back by load_model() in the prediction section below.
model.save('char_models/model150.h5')

# Use model and mappings to make predictions

In [22]:
from pickle import load
from keras.models import load_model
import numpy as np

In [23]:
# load the model
# Reads the file written by model.save() earlier and rebinds `model` to it.
model = load_model('char_models/model150.h5')

In [24]:
# load the mapping
# NOTE: unpickling can execute arbitrary code - only load files that this
# notebook wrote itself. Context managers close the handles right after reading.
with open('char_models/char_to_number.pkl', 'rb') as mapping_file:
    char_to_number = load(mapping_file)
with open('char_models/number_to_char.pkl', 'rb') as mapping_file:
    number_to_char = load(mapping_file)

# Predict word -- one char look ahead

In [102]:
test_word = 'godf' # specify test word

# Number of character positions the model expects. It is read from the model
# itself so this section does not depend on the training array X or on a
# hard-coded length.
seq_len = model.input_shape[1]

# Greedy decoding: append the single most likely next character until the model
# predicts the '_' end marker or the word fills the input window (a longer word
# could not be reshaped into the model's input).
while len(test_word) < seq_len:
    temp = [char_to_number[i] for i in test_word] # convert to numbers
    temp += [0] * (seq_len - len(test_word)) # pad with zeros (the '_' character)
    temp = np.array(temp).reshape(1, seq_len, 1) # reshape to correct input for model
    character_prediction = number_to_char[np.argmax(model.predict(temp))]

    if character_prediction == '_':
        break
    test_word += character_prediction

print(test_word)

godfather


# Predict word -- two char look ahead

- The model produces a probability distribution for the next character based on a given string X.
- We need to decide which character is going to come after X.
- We take the 2 characters with the highest probabilities; let's call them A and B.
- We create 2 new strings with these characters appended, XA and XB.
- We run the model on XA and XB, which gives 2 new probability distributions.
- We then take the top character from each probability distribution (C after XA, D after XB) to create XAC and XBD.
- If the product of the probabilities of A and C is greater than the product of the probabilities of B and D, we choose A as the next character, giving us XA.
- Otherwise (the product for A and C is not greater than the product for B and D), we choose B as the next character, giving us XB.
- The process is repeated until the predicted next character is the padding character `_` (end of the word).

In [93]:
def predict_word(test_word):
    """Complete a partial word using a two-character look-ahead.

    At every step the model's two most likely next characters are each
    appended to the current word and the model is run again on both
    extensions. The candidate whose own probability multiplied by the best
    follow-up probability is larger is chosen (the second candidate wins when
    the products are equal). Decoding stops when the chosen character is the
    '_' marker or when the word fills the model's input window.

    Uses the notebook-level ``model``, ``char_to_number`` and
    ``number_to_char`` objects.

    Parameters
    ----------
    test_word : str
        Beginning of a word; it is lower-cased before being encoded.

    Returns
    -------
    tuple or str
        ``("Predicted word --> ", word)`` for inputs of two or more
        characters. Shorter inputs return only the bare label string
        ``"Predicted word --> "`` (not a tuple); this is kept for backward
        compatibility. If the input contains a character that is missing from
        ``char_to_number``, or is already at least as long as the model's
        input window, the lower-cased input is returned unchanged in the tuple.
    """
    if len(test_word) <= 1:
        return ("Predicted word --> ")

    test_word = test_word.lower()

    # Window length comes from the model itself rather than from the training
    # array X or a hard-coded 14, so the function also works after a fresh
    # load_model() without re-running the data preparation cells.
    seq_len = model.input_shape[1]

    # Characters outside the training alphabet (digits, spaces, ...) cannot be
    # encoded; return the input as-is instead of raising a KeyError mid-typing.
    if any(ch not in char_to_number for ch in test_word):
        return ("Predicted word --> ", test_word)

    # A word that already fills the window cannot be extended, so stop there.
    while len(test_word) < seq_len:
        next_pos = len(test_word)  # first padding slot: where a new char goes

        temp = [char_to_number[ch] for ch in test_word] # convert to numbers
        temp += [0] * (seq_len - next_pos) # pad with zeros (the '_' character)
        temp = np.array(temp).reshape(1, seq_len, 1) # reshape to model input

        # get top char
        pred = model.predict(temp, verbose=0) # prob dist of next characters
        top_num_index = np.argmax(pred)
        top_prob = pred.max()
        top_char = number_to_char[top_num_index]

        # get second top char: blank out the winner and take the new maximum
        pred[0][top_num_index] = 0
        second_top_num_index = np.argmax(pred)
        second_top_prob = pred.max()
        second_top_char = number_to_char[second_top_num_index]

        # Build the two extended words. The candidate is written into the slot
        # AFTER the last typed character; writing to index len-1 would replace
        # the last typed character instead of extending the word.
        test_word1 = temp.copy()
        test_word1[0][next_pos] = top_num_index

        test_word2 = temp.copy()
        test_word2[0][next_pos] = second_top_num_index

        pred_test_word1 = model.predict(test_word1, verbose=0)
        pred_test_word2 = model.predict(test_word2, verbose=0)

        # Compare the joint score of each candidate with its best follow-up.
        if top_prob * pred_test_word1.max() > second_top_prob * pred_test_word2.max():
            next_char = top_char
        else:
            next_char = second_top_char

        if next_char == '_': # end-of-word marker chosen
            break
        test_word += next_char

    return ("Predicted word --> ", test_word)

In [103]:
test_word = 'godf' # specify test word
# predict_word returns a (label, word) tuple for inputs of two or more characters
predicted_word = predict_word(test_word)
predicted_word

('Predicted word --> ', 'godfather')

In [95]:
from ipywidgets import interact
import ipywidgets as widgets

In [101]:
# Free-text box wired to predict_word: the prediction is recomputed on every
# change of the text.
my_testbox_widget = widgets.Text(
    value='',
    placeholder='Type a word from a movie title',
    description='Title:',
    disabled=False
)
print("\nThe text box below enables title prediction of movies")
print("\nType one character at a time and observe the predicted movie title\n")


# The trailing semicolon keeps the cell from echoing the function repr that
# interact() returns.
interact(predict_word,
         test_word = my_testbox_widget);


The text box below enables title priction of movies

Type one character at a time and observe the predicted movie title



interactive(children=(Text(value='', description='Title:', placeholder='Type a word from a movie title'), Outp…

<function __main__.predict_word(test_word)>