In [60]:
import numpy as np

import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

import matplotlib.pyplot as plt
import urllib

from sklearn.metrics.pairwise import cosine_similarity

import utils

In [47]:
movies_by_language = utils.get_dataset()

overviews = movies_by_language["overview"].tolist()
titles = movies_by_language["title"].tolist()
poster_paths = movies_by_language["poster_path"].tolist()

corpus = movies_by_language["corpus"].tolist()

Summary of dataset
Size: 32269
First 10 rows of corpus:

0    [led, woody, andy's, toy, live, happily, room,...
1    [sibling, judy, peter, discover, enchanted, bo...
2    [family, wedding, reignites, ancient, feud, ne...
3    [cheated, mistreated, stepped, woman, holding,...
4    [george, bank, ha, recovered, daughter's, wedd...
5    [obsessive, master, thief, neil, mccauley, lea...
6    [ugly, duckling, undergone, remarkable, change...
7    [mischievous, young, boy, tom, sawyer, witness...
8    [international, action, superstar, jean, claud...
9    [james, bond, must, unmask, mysterious, head, ...
Name: corpus, dtype: object


In [3]:
# Loading pretrained word embeddings from GoogleNews:
embeddings = api.load('word2vec-google-news-300', return_path=True)
GN_word2vec = KeyedVectors.load_word2vec_format(embeddings, binary=True)

In [4]:
# Training our corpus with GoogleNews embedding (CBOW architecture)
cbow_model = Word2Vec(vector_size=300, window=5, min_count=2, workers=-1, sg=0) # sg=0 indicates CBOW architecture

cbow_model.build_vocab(corpus)
cbow_model.wv.vectors_lockf = np.ones(len(cbow_model.wv), dtype=np.float32)
cbow_model.wv.intersect_word2vec_format(embeddings, lockf=1.0, binary=True)

cbow_model.train(corpus, total_examples=cbow_model.corpus_count, epochs=5)

(0, 0)

In [41]:
# Generating Word2Vec embeddings for each overview (CBOW architecture)

embedded = utils.word_embeddings_vectorize(cbow_model, corpus)

In [63]:
def predict(title):
    input_idx = titles.index(title)
    outputs = utils.word_embeddings_predict(embedded, input_idx, 5)

    print("Top 5 movies most similar to", title)
    print("===================================")

    for index, value in outputs:
        title = "{} - Accuracy: {:.4f}".format(titles[index], value)

        try:
            url = "https://image.tmdb.org/t/p/original" + poster_paths[index]
            f = urllib.request.urlopen(url)
        except urllib.request.HTTPError:
            print(title)
        else:
            a = plt.imread(f)
            plt.imshow(a)
            
            plt.title(title)
            # print r.url, 'downloaded successfully'


In [65]:
title="GoldenEye"
predict(title)

print("\n\n\n")

title="Deathline"
predict(title)

Top 5 movies most similar to  Toy Story
The Beautiful Beast - Accuracy: 0.7632
Totally Awesome - Accuracy: 0.7618
Motivational Growth - Accuracy: 0.7610
VeggieTales: Duke and the Great Pie War - Accuracy: 0.7585
Scooby-Doo! Stage Fright - Accuracy: 0.7583




Top 5 movies most similar to  Deathline
Ninja: Shadow of a Tear - Accuracy: 0.8741
The Stranger - Accuracy: 0.8527
Gag - Accuracy: 0.8499
B. Monkey - Accuracy: 0.8461
Dead Hooker in a Trunk - Accuracy: 0.8451
