In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import urllib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import utils

In [2]:
movies_by_language = utils.get_dataset(
    feature_cols=['overview', 'title', 'tagline', 'processed_genres'], 
    parse_genres=True
)

overviews = movies_by_language["overview"].tolist()
titles = movies_by_language["title"].tolist()
poster_paths = movies_by_language["poster_path"].tolist()

corpus = movies_by_language["corpus"].tolist()

Summary of dataset
Size: 32269
First 10 rows of corpus:

0    [led, woody, andy's, toy, live, happily, room,...
1    [sibling, judy, peter, discover, enchanted, bo...
2    [family, wedding, reignites, ancient, feud, ne...
3    [cheated, mistreated, stepped, woman, holding,...
4    [george, bank, ha, recovered, daughter's, wedd...
5    [obsessive, master, thief, neil, mccauley, lea...
6    [ugly, duckling, undergone, remarkable, change...
7    [mischievous, young, boy, tom, sawyer, witness...
8    [international, action, superstar, jean, claud...
9    [james, bond, must, unmask, mysterious, head, ...
Name: corpus, dtype: object


In [4]:
# Build vocab
vocab  = []
tf_idf_corpus = []

for movie in corpus:
    tf_idf_corpus.append(" ".join(movie))
    
    for word in movie:
        if word not in vocab:
            vocab.append(word)

In [5]:
tf_idf_vec = TfidfVectorizer(lowercase=False, stop_words=None, vocabulary=vocab, smooth_idf=True, use_idf=True)
tf_idf     = tf_idf_vec.fit_transform(tf_idf_corpus)

In [8]:
movie_titles  = ["Ex Machina", "The Shawshank Redemption", "Prometheus", "The Dark Knight"]
movie_indices = [titles.index(t) for t in movie_titles]

# similarity matrix
sim = cosine_similarity(tf_idf)

In [10]:
for i in movie_indices:
    top_3_indices = list(sim[i].argsort()[::-1][1:4])
    print(f"Top 3 similar movies to {titles[i]}")
    print("====================================")
    for j in top_3_indices:
        print(f"- {titles[j]} (sim: {sim[i,j]})")
    print("====================================")
    print("\n\n")

Top 3 similar movies to Ex Machina
- A Simple Curve (sim: 0.2015577195265747)
- Fireproof (sim: 0.17949405270361088)
- Truth (sim: 0.17319993534900155)



Top 3 similar movies to The Shawshank Redemption
- They Made Me a Fugitive (sim: 0.1836285344575686)
- Prison (sim: 0.1730095466041786)
- Brubaker (sim: 0.15818306461108986)



Top 3 similar movies to Prometheus
- This Island Earth (sim: 0.22951591499665483)
- Iron Sky: The Coming Race (sim: 0.21371878039854103)
- AE: Apocalypse Earth (sim: 0.20562306852564713)



Top 3 similar movies to The Dark Knight
- The Dark Knight Rises (sim: 0.40329763186101514)
- Batman Begins (sim: 0.30032423665375285)
- Batman Beyond: Return of the Joker (sim: 0.2978930923947234)



