## Download `The movies` dataset from Kaggle

In [3]:
! set DATASET_FILE=`$(pwd)/the-movies-dataset.zip`
! [ ! -f ${DATASET_FILE} ] && kaggle datasets download -d rounakbanik/the-movies-dataset

zsh:1: permission denied: /Users/anhpham/development/aalto/snlp-22/the-movies-dataset.zip


## Extract dataset

In [4]:
! set DATASET_DIR=`$(pwd)/dataset`
! [ ! -d ${DATASET_DIR} ] && [! -L ${DATASET_DIR}] && mkdir -p ${DATASET_DIR} && unzip ${DATASET_FILE} -d ${DATASET_DIR}

zsh:1: permission denied: /Users/anhpham/development/aalto/snlp-22/dataset


## Import libraries

In [2]:
import pandas as pd
import numpy as np
import json
import re

import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.metrics.pairwise import cosine_similarity

In [1]:
import utils

movies_by_language = utils.get_dataset()

Summary of dataset
Size: 32269
First 10 rows of corpus:

0    [led, woody, andy's, toy, live, happily, room,...
1    [sibling, judy, peter, discover, enchanted, bo...
2    [family, wedding, reignites, ancient, feud, ne...
3    [cheated, mistreated, stepped, woman, holding,...
4    [george, bank, ha, recovered, daughter's, wedd...
5    [obsessive, master, thief, neil, mccauley, lea...
6    [ugly, duckling, undergone, remarkable, change...
7    [mischievous, young, boy, tom, sawyer, witness...
8    [international, action, superstar, jean, claud...
9    [james, bond, must, unmask, mysterious, head, ...
Name: corpus, dtype: object


In [7]:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

## Load dataset

In [None]:
DATASET_PATH = "./dataset/"

df_metadata = pd.read_csv(DATASET_PATH + "movies_metadata.csv")

In [None]:
# filter columns
metadata_cols = ["id", "title", "overview", "tagline", "genres", "original_language", "poster_path"]
df_metadata   = df_metadata[metadata_cols]

In [None]:
# filter language
in_english     = df_metadata["original_language"] == "en"
df_metadata_en = df_metadata[in_english]
df_metadata_en = df_metadata_en[[col for col in metadata_cols if col != "original_language"]]

In [None]:
print(len(df_metadata_en["overview"]))


overviews = []
for row in df_metadata_en["overview"]:
    overviews.append(str(row))

#print(len(overviews))

titles = []
for row in df_metadata_en["title"]:
    titles.append(str(row))


In [None]:
# Text cleaning / tokenizing finctions:
def remove_non_ascii(textstring):
    """This function removes non-ASCII characters from the text string"""
    return "".join(i for i in textstring if ord(i) < 128)


def tokenize_and_lowercase(textstring):
    """This function tokenizes the text string into lowercased tokens with Regex Tokenizer"""
    tokens = []
    tokenized_into_sentences = sent_tokenize(textstring)
    for e in tokenized_into_sentences:
        regex_tokenizer = re.compile("\w+[-'.]\w+|\w+")  # tokenizer removes whitespaces and punctuation
        tokenized = re.findall(regex_tokenizer, e)
        for e in tokenized:
            tokens.append(e.lower())
    return tokens


def remove_stop_words(tokenized_text, stop_words):
    """This function removes stop words from lowercased tokenized text"""
    clean_text = [x for x in tokenized_text if x not in stop_words]
    return clean_text

# Cleaning the movie overviews:
stop_words = set(stopwords.words("english"))  # setting the list of stopwords to get rid of
clean_overviews = []

for overview in overviews:
    overview_ascii = remove_non_ascii(overview)
    overview_tokenized_and_lowercased = tokenize_and_lowercase(overview_ascii)
    overview_clean = remove_stop_words(overview_tokenized_and_lowercased, stop_words)
    clean_overviews.append(overview_clean)

# Examples of output:
print(len(clean_overviews))

corpus = clean_overviews
print(corpus[9])
print(len(corpus))

In [31]:
corpus = movies_by_language["corpus"]

'Jumanji'

In [4]:
# Loading pretrained word embedding from GoogleNews:

embeddings = api.load('word2vec-google-news-300', return_path=True)
GoogleNews_word2vec = KeyedVectors.load_word2vec_format(embeddings, binary=True)

In [5]:
# Training our corpus with GoogleNews embedding (CBOW architecture)
# [source for this piece: https://www.kdnuggets.com/2020/08/content-based-recommendation-system-word-embeddings.html]:

GoogleNews_model = Word2Vec(vector_size=300, window=5, min_count=2, workers=-1, sg=0) # sg=0 indicates CBOW architecture
GoogleNews_model.build_vocab(corpus)
GoogleNews_model.wv.vectors_lockf = np.ones(len(GoogleNews_model.wv), dtype=np.float32)
GoogleNews_model.wv.intersect_word2vec_format(embeddings, binary=True)
GoogleNews_model.train(corpus, total_examples=GoogleNews_model.corpus_count, epochs=5)

(0, 0)

In [6]:
# Generating Word2Vec embeddings for each overview (CBOW architecture)
# [source for this piece: https://www.kdnuggets.com/2020/08/content-based-recommendation-system-word-embeddings.html]:

def vectors(clean_overviews):
    global word_embeddings
    word_embeddings = []
    
    for overview in clean_overviews:
        word2vec = None
        count = 0
        for word in overview:
          #  print(word)
            if word in GoogleNews_model.wv.key_to_index:
                count += 1
                if word2vec is None:
                    word2vec = GoogleNews_model.wv.key_to_index[word]
                else:
                    word2vec = word2vec + GoogleNews_model.wv.key_to_index[word]
                
        if word2vec is not None:
            word2vec = word2vec / count
        
        word_embeddings.append(word2vec)
    
    return word_embeddings

In [2]:
# Function for recommending the top 5 similar movies (CBOW architecture):

def recommending(title):
    vectors_list=vectors(corpus.tolist())
    idx=movies_by_language[movies_by_language["title"] == title].index.tolist()[0]
   # print(idx)
    length = len(vectors_list)
    similarities = []
    for i in range(0,length-1):
        try:
            similarity = cosine_similarity(np.array(vectors_list[idx]).reshape(1, -1), 
                                           np.array(vectors_list[i]).reshape(1, -1))
            print(similarity)
            similarities.append(similarity[0][0])
        except ValueError:
            similarities.append(0)
        
    similarities_dict={}
    for count, value in enumerate(similarities):
        similarities_dict[count]=value

    print(similarities)
        
    sorted_similarities_dict = dict(sorted(similarities_dict.items(),
                           key=lambda item: item[1],
                           reverse=True))
    
    indices_top=list(sorted_similarities_dict.keys())[1:6]
    values_top=list(sorted_similarities_dict.values())
    print("Top 5 movies most similar to", title)
    print("===================================")
    for index in indices_top:
        print(movies_by_language["title"][index])
        print("\n")
        print(movies_by_language["overview"][index])
        print("\n")
        print("Accuracy: {:.4f}".format(values_top[index]))
        print("\n")

In [3]:
# Testing the recommendations (CBOW architecture):

title="GoldenEye"
recommending(title)

title="Deathline"
recommending(title)

NameError: name 'vectors' is not defined

In [None]:
# Training our corpus with GoogleNews embedding (Skip-Gram architecture)
# [source for this piece: https://www.kdnuggets.com/2020/08/content-based-recommendation-system-word-embeddings.html]:

GoogleNews_model_SG = Word2Vec(vector_size=300, window=5, min_count=2, workers=-1, sg=1) # sg=1 indicates Skip-Gram architecture
GoogleNews_model_SG.build_vocab(corpus)
GoogleNews_model_SG.wv.vectors_lockf = np.ones(len(GoogleNews_model_SG.wv), dtype=np.float32)
GoogleNews_model_SG.wv.intersect_word2vec_format(embeddings, lockf=1.0, binary=True)
GoogleNews_model_SG.train(corpus, total_examples=GoogleNews_model_SG.corpus_count, epochs=5)

In [None]:
# Generating Word2Vec embeddings for each overview (Skip-Gram architecture)
# [source for this piece: https://www.kdnuggets.com/2020/08/content-based-recommendation-system-word-embeddings.html]:

def vectors_SG(clean_overviews):
    global word_embeddings_SG
    word_embeddings_SG = []
    
    for overview in clean_overviews:
        word2vec = None
        count = 0
        for word in overview:
          #  print(word)
            if word in GoogleNews_model_SG.wv.key_to_index:
                count += 1
                if word2vec is None:
                    word2vec = GoogleNews_model_SG.wv.key_to_index[word]
                else:
                    word2vec = word2vec + GoogleNews_model_SG.wv.key_to_index[word]
                
        if word2vec is not None:
            word2vec = word2vec / count
        
        word_embeddings_SG.append(word2vec)
    
    return word_embeddings_SG

In [None]:
# Function for recommending the top 5 similar movies (Skip-Gram architecture):

def recommending(title):
    vectors_list=vectors_SG(clean_overviews)
    idx=titles.index(title)
   # print(idx)
    length = len(vectors_list)
    similarities = []
    for i in range(0,length-1):
        try:
            similarity = cosine_similarity(np.array(vectors_list[idx]).reshape(1, -1), 
                                           np.array(vectors_list[i]).reshape(1, -1))
            similarities.append(similarity[0][0])
        except ValueError:
            similarities.append(0)
        
    similarities_dict={}
    for count, value in enumerate(similarities):
        similarities_dict[count]=value
        
    sorted_similarities_dict = dict(sorted(similarities_dict.items(),
                           key=lambda item: item[1],
                           reverse=True))
    
    indices_top=list(sorted_similarities_dict.keys())[1:6]
    print("Top 5 movies most similar to", title)
    print("===================================")
    for index in indices_top:
        print(titles[index])
        print("\n")
        print(overviews[index])
        print("\n")

In [None]:
# Testing the recommendations (Skip-Gram architecture):

title="Deathline"
recommending(title)


# https://image.tmdb.org/t/p/original/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg

# TF_IDF

In [None]:
# parse names from genres column
genre_names               = list(map(lambda g: sorted(re.findall("'name':\s*'(\w*)'", g)), df_metadata_en["genres"]))
genre_names_joined        = list(map(lambda g: " ".join(g), genre_names))
df_metadata_en["genres_"] = genre_names_joined

# combine features for tf-idf
df_metadata_en["document"] = (
    df_metadata_en["title"].astype(str) + ". " #+ 
    # df_metadata_en["overview"].astype(str) + ". " + 
    # df_metadata_en["tagline"].astype(str) + ". " + 
    # df_metadata_en["genres_"].astype(str)
)

In [None]:
! python -m spacy download en_core_web_sm

In [None]:
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("omw-1.4")

lemmatizer = nltk.wordnet.WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

import spacy
from spacy.tokens import Token

nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

def get_ents(doc):
    return ["_".join(e.text.upper().split()) for e in nlp(doc).ents]

In [None]:
corpus = list(df_metadata_en["document"])
vocab  = {}

# pre-process corpus (named-entity recognition, stopword removal, lemmatization)
for i, doc in enumerate(corpus):
    # split to tokens and remove stopwords
    tokens = [t for t in word_tokenize(doc.lower()) if t not in stop_words]
    # lemmatization
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    # prepend entities
    tokens = get_ents(doc) + tokens
    # join tokens with whitespace and finish pre-processing
    corpus[i] = " ".join(tokens)
    for t in tokens:
        if t not in vocab:
            vocab[t] = True

# generate vocab
vocab = list(dict.keys(vocab))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# calc tf-idf
tf_idf_vec = TfidfVectorizer(lowercase=False, stop_words=None, vocabulary=vocab, smooth_idf=True, use_idf=True)
tf_idf     = tf_idf_vec.fit_transform(corpus)

titles        = list(df_metadata_en["title"])
movie_titles  = ["Ex Machina", "The Shawshank Redemption", "Prometheus", "The Dark Knight"]
movie_indices = [titles.index(t) for t in movie_titles]

# print first doc
for i in movie_indices:
    df = pd.DataFrame(tf_idf[i].T.todense(), index=tf_idf_vec.get_feature_names_out(), columns=["TF-IDF"])
    x = df.sort_values("TF-IDF", ascending=False).head(3)
    print(titles[i])
    print("=============")
    print(x)
    print("=============")

In [None]:
# similarity matrix
sim = cosine_similarity(tf_idf)

In [None]:
for i in movie_indices:
    top_3_indices = list(sim[i].argsort()[::-1][1:4])
    print(f"Top 3 similar movies to {titles[i]}")
    print("====================================")
    for j in top_3_indices:
        print(f"- {titles[j]} (sim: {sim[i,j]})")
    print("====================================") 