In [1]:
pip install pandas numpy sklearn nltk spacy && python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 3.0 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Note: you may need to restart the kernel to use updated packages.


In [2]:
# import libraries
import pandas as pd
import numpy as np
import re

In [3]:
BASE_PATH = "./dataset/"
# load dataset
df_metadata = pd.read_csv(BASE_PATH + "movies_metadata.csv")

  df_metadata = pd.read_csv(BASE_PATH + "movies_metadata.csv")


In [4]:
# filter columns
metadata_cols = ["id", "title", "overview", "tagline", "genres", "original_language", "poster_path"]
df_metadata   = df_metadata[metadata_cols]

In [5]:
# filter language
in_english     = df_metadata["original_language"] == "en"
df_metadata_en = df_metadata[in_english]

In [6]:
# parse names from genres column
genre_names               = list(map(lambda g: sorted(re.findall("'name':\s*'(\w*)'", g)), df_metadata_en["genres"]))
genre_names_joined        = list(map(lambda g: " ".join(g), genre_names))
df_metadata_en["genres_"] = genre_names_joined

# combine features for tf-idf
df_metadata_en["document"] = (
    df_metadata_en["title"].astype(str) + ". " + 
    df_metadata_en["overview"].astype(str) + ". " + 
    df_metadata_en["tagline"].astype(str) + ". " + 
    df_metadata_en["genres_"].astype(str)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata_en["genres_"] = genre_names_joined
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata_en["document"] = (


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("omw-1.4")

lemmatizer = nltk.wordnet.WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

import spacy
from spacy.tokens import Token

nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

def get_ents(doc):
    return ["_".join(e.text.upper().split()) for e in nlp(doc).ents]

[nltk_data] Downloading package punkt to /Users/jpturunen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jpturunen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jpturunen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jpturunen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
corpus = list(df_metadata_en["document"])
vocab  = {}

# pre-process corpus (named-entity recognition, stopword removal, lemmatization)
for i, doc in enumerate(corpus):
    # split to tokens and remove stopwords
    tokens = [t for t in word_tokenize(doc.lower()) if t not in stop_words]
    # lemmatization
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    # prepend entities
    tokens = get_ents(doc) + tokens
    # join tokens with whitespace and finish pre-processing
    corpus[i] = " ".join(tokens)
    for t in tokens:
        if t not in vocab:
            vocab[t] = True

# generate vocab
vocab = list(dict.keys(vocab))

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# calc tf-idf
tf_idf_vec = TfidfVectorizer(lowercase=False, stop_words=None, vocabulary=vocab, smooth_idf=True, use_idf=True)
tf_idf     = tf_idf_vec.fit_transform(corpus)

titles        = list(df_metadata_en["title"])
movie_titles  = ["Ex Machina", "The Shawshank Redemption", "Prometheus", "The Dark Knight"]
movie_indices = [titles.index(t) for t in movie_titles]

# print first doc
for i in movie_indices:
    df = pd.DataFrame(tf_idf[i].T.todense(), index=tf_idf_vec.get_feature_names_out(), columns=["TF-IDF"])
    x = df.sort_values("TF-IDF", ascending=False).head(3)
    print(titles[i])
    print("=============")
    print(x)
    print("=============")

## TODO: genres not appearing as often as I would like... maybe the tf-idf value should then always be 1 for those tokens?

Ex Machina
         TF-IDF
CALEB  0.344829
caleb  0.335572
coder  0.205131
The Shawshank Redemption
             TF-IDF
dufresne   0.388159
shawshank  0.373435
prisoner   0.216970
Prometheus
                                           TF-IDF
THE_SEARCH_FOR_OUR_BEGINNING_COULD_LEAD  0.321738
ADVENTURE_MYSTERY                        0.309533
prometheus                               0.288669
The Dark Knight
            TF-IDF
BATMAN    0.292623
batman    0.287650
criminal  0.198538


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# similarity matrix
sim = cosine_similarity(tf_idf)

In [11]:
for i in movie_indices:
    top_3_indices = list(sim[i].argsort()[::-1][1:4])
    print(f"Top 3 similar movies to {titles[i]}")
    print("====================================")
    for j in top_3_indices:
        print(f"- {titles[j]} (sim: {sim[i,j]})")
    print("====================================") 
        

Top 3 similar movies to Ex Machina
- A Simple Curve (sim: 0.290768953558932)
- Fireproof (sim: 0.2540251619239815)
- Truth (sim: 0.24701012909830583)
Top 3 similar movies to The Shawshank Redemption
- They Made Me a Fugitive (sim: 0.1387145742623922)
- The Domino Principle (sim: 0.13345915893312696)
- Brubaker (sim: 0.1321365322724463)
Top 3 similar movies to Prometheus
- AE: Apocalypse Earth (sim: 0.22173645595530667)
- Star Trek: The Motion Picture (sim: 0.1930698473551296)
- Toward the Terra (sim: 0.18214594610512744)
Top 3 similar movies to The Dark Knight
- The Dark Knight Rises (sim: 0.40714367708047183)
- Batman Unmasked: The Psychology of the Dark Knight (sim: 0.35765476609403063)
- Batman Returns (sim: 0.347335569977196)
