In [208]:
import json
import keras
import numpy as np
#import nltk
import re
#from nltk.corpus import stopwords
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
#from keras.optimizers import Adadelta
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping

In [209]:
#Extract list of genres for every record and pre-process it.
def extractGenres(row):
    json_dict_list = json.loads(row.genres)
    genres = [d['name'].lower() for d in json_dict_list if 'name' in d]
    return " ".join(genres)

# Read dataset file and extract features and labels from it.
def readFile(file_path):
    df = pd.read_csv(file_path)
    X = []
    for row in df.itertuples():
        plot = str(row.overview)
        plot_chars = re.sub("[^a-zA-Z]", " ", plot)
        X.append(plot_chars)
    Y = [extractGenres(row).rstrip() for row in df.itertuples()]
    return X,Y

# Tokenize features and labels into one-hot matrics
def tokenize(X, Y, XTokenizer, YTokenizer):
    
    plot_matrix = XTokenizer.texts_to_matrix(X)
    genre_matrix = YTokenizer.texts_to_matrix(Y)
    
    return plot_matrix, genre_matrix

In [210]:
file_path = 'tmdb_5000_movies.csv'
X, Y = readFile(file_path)

XTokenizer = Tokenizer(filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True, split = ' ')
YTokenizer = Tokenizer(filters = ' ', lower = True,)
    
XTokenizer.fit_on_texts(X)
YTokenizer.fit_on_texts(Y)

plot_matrix, genre_matrix = tokenize(X, Y, XTokenizer, YTokenizer)

input_size = plot_matrix.shape[1]
output_size = genre_matrix.shape[1]

print(plot_matrix.shape)
print(genre_matrix.shape)

(4803, 20937)
(4803, 23)


In [211]:
split_ratio = 0.9
split_index = int(plot_matrix.shape[0] * split_ratio)

X_train, X_test = np.array(plot_matrix[: split_index]), np.array(plot_matrix[split_index : ])
Y_train, Y_test = np.array(genre_matrix[: split_index]), np.array(genre_matrix[split_index : ])

In [212]:
model = Sequential()
model.add(Dense(512, activation = 'relu', input_dim = input_size))
model.add(Dropout(0.4))
model.add(Dense(256, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(output_size, activation = 'sigmoid'))
model.summary()

#adadelta = Adadelta(lr =1.0, rho = 0.95, decay = 0.0)
model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop', metrics = ['top_k_categorical_accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_45 (Dense)             (None, 512)               10720256  
_________________________________________________________________
dropout_33 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_46 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_34 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_47 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_35 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_48 (Dense)             (None, 23)                2967      
Total para

In [213]:
checkpoint = ModelCheckpoint(filepath=None, monitor = 'val_loss', mode = 'min', verbose = 1)

In [214]:
model.fit(X_train, Y_train, epochs = 10, batch_size = 128, verbose = 1, callbacks= [])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a4e3b8ba8>

In [215]:
score = model.evaluate(X_test, Y_test, verbose = 0)
print("Accuracy %s"% score)
model.save('models/el_model_77_acc.h5')

Accuracy [5.595424637229428, 0.760914760914761]


In [216]:
def preprocessTestData(comments):
    clean_comments = []
    for comment in comments:
        comment_chars = re.sub("[^a-zA-Z]", " ", comment)
        clean_comments.append(comment_chars.rstrip())
    return clean_comments

In [239]:
comments = ['I would have been the first, but i did not have a gun. not really though, zac snyder is just a clown.',
            'I love my dog',
            'I believe in humanity',
            'I feel suicidal today.']
comments = preprocessTestData(comments)
comments_test = XTokenizer.texts_to_matrix(comments)
y_probs = model.predict(np.array(comments_test))

In [218]:
top_k = 3
prob_to_indices = lambda y_prob, k: np.argpartition(y_prob, -k)[-k : ]

In [240]:
genre_to_index_dict = YTokenizer.word_index
index_to_genre = [0]*output_size
genre_to_index_dict[''] = 0
for k, v in genre_to_index_dict.items():
    index_to_genre[v] = k
indices_to_genres = lambda ind : [index_to_genre[i] for i in ind]

In [241]:
output_genres = [indices_to_genres(prob_to_indices(p, top_k)) for p in y_probs]

In [243]:
for out in output_genres:
    print(out)

['drama', 'crime', 'thriller']
['romance', 'comedy', 'drama']
['action', 'thriller', 'drama']
['crime', 'drama', 'thriller']


In [244]:
dframe = pd.read_csv(file_path)
genre_to_movies_dict = {} # dictionary mapping each genre to its corresponding list of movies.
movies_popularity_dict = {} # storing movies with their corresponding calculated scores.
index_to_movie_list = [] # mapping index to corresponding movie.

# Parameters for calculating weighted review:
C = 6.9 # mean vote across whole report
M = 3000 # minimum votes required to be listed in Top 250 by IMDB.
for row in dframe.itertuples():
    V = row.vote_count
    R = row.vote_average
    WR = (V/(V+M))*R + (M/(V+M))*C
    movies_popularity_dict[row.title] = WR
    index_to_movie_list.append(row.title)
    genre_map = json.loads(row.genres)
    genres = [d['name'].lower() for d in genre_map if 'name' in d]
    for genre in genres:
        if not genre in genre_to_movies_dict:
            genre_to_movies_dict[genre] = []
        genre_to_movies_dict[genre].append(row.title)
movie_to_index = {}
for i, movie in enumerate(movies_popularity_dict):
    movie_to_index[movie] = i

In [245]:
def genres_to_top_k_movies(genres, k=5):
    counter = [0]*len(movies_popularity_dict)
    for genre in genres:
        for movie in genre_to_movies_dict[genre]:
            counter[movie_to_index[movie]] += 1
    count_pop_tuple_list = []
    for i, count in enumerate(counter):
        tuple = (count, movies_popularity_dict[index_to_movie_list[i]], index_to_movie_list[i])
        count_pop_tuple_list.append(tuple)
    final_sorted_tuple_list = sorted(count_pop_tuple_list, key = lambda tuple: (tuple[0], tuple[1])) 
    output_movies = []
    iter = 0
    for tuple in reversed(final_sorted_tuple_list):
        iter += 1
        if iter <= 5:
            output_movies.append(tuple[2])
        else:
            break
    return output_movies

In [246]:
for out in output_genres:
    output_movies = genres_to_top_k_movies(out, k=5)
    print(output_movies)

['The Dark Knight', 'The Silence of the Lambs', 'The Departed', 'Scarface', 'The Dark Knight Rises']
['Forrest Gump', 'Moonrise Kingdom', 'The Terminator', 'Groundhog Day', 'E.T. the Extra-Terrestrial']
['The Dark Knight', 'Inglourious Basterds', 'Scarface', 'The Dark Knight Rises', 'Captain Phillips']
['The Dark Knight', 'The Silence of the Lambs', 'The Departed', 'Scarface', 'The Dark Knight Rises']
