# Movie recommendation system
Explore different models for semantic text matching.


In [1]:
import pandas as pd
import numpy as np
import nltk

# Load dataset
Data is already prepared in folder ``data`` of this repository. Source: https://www.kaggle.com/rounakbanik/the-movies-dataset/movies_metadata.


In [3]:
DATAPATH = '../data/movies_metadata.csv' 

df = pd.read_csv(DATAPATH)
df = df[~df.overview.isna()]
df.rename(columns={'overview':'sentence'}, inplace=True)
print(len(df))
df = df.iloc[:20000]
df.head()

44512


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,sentence,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

STOPWORDS = set(stopwords.words('english'))
MIN_WORDS = 4
MAX_WORDS = 200

PATTERN_S = re.compile("\'s")  # matches `'s` from text  
PATTERN_RN = re.compile("\\r\\n") #matches `\r` and `\n`
PATTERN_PUNC = re.compile(r"[^\w\s]") # matches all non 0-9 A-z whitespace 

def clean_text(text):
    """
    Series of cleaning. String to lower case, remove non words characters and numbers.
        text (str): input text
    return (str): modified initial text
    """
    text = text.lower()  # lowercase text
    text = re.sub(PATTERN_S, ' ', text)
    text = re.sub(PATTERN_RN, ' ', text)
    text = re.sub(PATTERN_PUNC, ' ', text)
    return text

def tokenizer(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True):
    """
    Lemmatize, tokenize, crop and remove stop words.
    """
    if lemmatize:
        stemmer = WordNetLemmatizer()
        tokens = [stemmer.lemmatize(w) for w in word_tokenize(sentence)]
    else:
        tokens = [w for w in word_tokenize(sentence)]
    token = [w for w in tokens if (len(w) > min_words and len(w) < max_words
                                                        and w not in stopwords)]
    return tokens    


def clean_sentences(df):
    """
    Remove irrelavant characters (in new column clean_sentence).
    Lemmatize, tokenize words into list of words (in new column tok_lem_sentence).
    """
    print('Cleaning sentences...')
    df['clean_sentence'] = df['sentence'].apply(clean_text)
    df['tok_lem_sentence'] = df['clean_sentence'].apply(
        lambda x: tokenizer(x, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True))
    return df
    
df = clean_sentences(df)

Cleaning sentences...


In [5]:
print(len(df))
df[['sentence', 'clean_sentence', 'tok_lem_sentence']]

20000


Unnamed: 0,sentence,clean_sentence,tok_lem_sentence
0,"Led by Woody, Andy's toys live happily in his ...",led by woody andy toys live happily in his r...,"[led, by, woody, andy, toy, live, happily, in,..."
1,When siblings Judy and Peter discover an encha...,when siblings judy and peter discover an encha...,"[when, sibling, judy, and, peter, discover, an..."
2,A family wedding reignites the ancient feud be...,a family wedding reignites the ancient feud be...,"[a, family, wedding, reignites, the, ancient, ..."
3,"Cheated on, mistreated and stepped on, the wom...",cheated on mistreated and stepped on the wom...,"[cheated, on, mistreated, and, stepped, on, th..."
4,Just when George Banks has recovered from his ...,just when george banks has recovered from his ...,"[just, when, george, bank, ha, recovered, from..."
...,...,...,...
20131,"After a lifetime of hiding, Chely Wright becom...",after a lifetime of hiding chely wright becom...,"[after, a, lifetime, of, hiding, chely, wright..."
20132,"In 1989, five black and Latino teenagers from ...",in 1989 five black and latino teenagers from ...,"[in, 1989, five, black, and, latino, teenager,..."
20133,Arkin escapes with his life from the vicious g...,arkin escapes with his life from the vicious g...,"[arkin, escape, with, his, life, from, the, vi..."
20134,"Remake of a hit film from 1990, ""The Cherry Or...",remake of a hit film from 1990 the cherry or...,"[remake, of, a, hit, film, from, 1990, the, ch..."


In [13]:
#df[['clean_sentence']].to_csv('clean_movies.txt', index=False,header=False)

## Query sentence

In [6]:
query_sentence = 'a crime story with a beautiful woman' 

pd.options.display.max_colwidth = 500

## Util function

In [7]:
def extract_best_indices(m, topk, mask=None):
    """
    Use sum of the cosine distance over all tokens.
    m (np.array): cos matrix of shape (nb_in_tokens, nb_dict_tokens)
    topk (int): number of indices to return (from high to lowest in order)
    """
    # return the sum on all tokens of cosinus for each sentence
    if len(m.shape) > 1:
        cos_sim = np.mean(m, axis=0) 
    else: 
        cos_sim = m
    index = np.argsort(cos_sim)[::-1] # from highest idx to smallest score 
    if mask is not None:
        assert mask.shape == m.shape
        mask = mask[index]
    else:
        mask = np.ones(len(cos_sim))
    mask = np.logical_or(cos_sim[index] != 0, mask) #eliminate 0 cosine distance
    best_index = index[mask][:topk]  
    return best_index

# TF-IDF

## Training

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Adapt stop words
token_stop = tokenizer(' '.join(STOPWORDS), lemmatize=False)

# Fit TFIDF
vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer) 
tfidf_mat = vectorizer.fit_transform(df['sentence'].values) # -> (num_sentences, num_vocabulary)
tfidf_mat.shape



(20000, 49932)

## Prediction

In [9]:
query_sentence

'a crime story with a beautiful woman'

In [10]:
def get_recommendations_tfidf(sentence, tfidf_mat):
    
    """
    Return the database sentences in order of highest cosine similarity relatively to each 
    token of the target sentence. 
    """
    # Embed the query sentence
    tokens = [str(tok) for tok in tokenizer(sentence)]
    vec = vectorizer.transform(tokens)
    # Create list with similarity between query and dataset
    mat = cosine_similarity(vec, tfidf_mat)
    # Best cosine distance for each token independantly
    print(mat.shape)
    best_index = extract_best_indices(mat, topk=3)
    return best_index


best_index = get_recommendations_tfidf(query_sentence, tfidf_mat)

display(df[['original_title', 'genres', 'sentence']].iloc[best_index])


(7, 20000)


Unnamed: 0,original_title,genres,sentence
9003,Innocent Blood,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'name': 'Horror'}, {'id': 53, 'name': 'Thriller'}, {'id': 80, 'name': 'Crime'}]",A beautiful vampire turns a crime lord into a creature of the night.
10493,Requiem pour un vampire,"[{'id': 27, 'name': 'Horror'}]",A vampire lures beautiful young women to his castle in Europe.
18224,Miss Bala,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}]",The story of a young woman clinging on to her dream to become a beauty contest queen in a Mexico dominated by organized crime.


# Spacy

## Load pretrained

In [None]:
import spacy

#!python -m spacy download en_core_web_lg

#Load pre-trained model
#nlp = spacy.load("en_core_web_lg") 
import en_core_web_sm

nlp = en_core_web_sm.load()
# Apply the model to the sentences
df['spacy_sentence'] = df['sentence'].apply(lambda x: nlp(x)) 
# Retrieve the embedded vectors as a matrix 
embed_mat = df['spacy_sentence'].values
embed_mat.shape

In [None]:
def predict_spacy(model, query_sentence, embed_mat, topk=5):
    """
    Predict the topk sentences after applying spacy model.
    """
    query_embed = model(query_sentence)
    mat = np.array([query_embed.similarity(line) for line in embed_mat])
    # keep if vector has a norm
    mat_mask = np.array(
        [True if line.vector_norm else False for line in embed_mat])
    best_index = extract_best_indices(mat, topk=topk, mask=mat_mask)
    return best_index

# Predict
predict_spacy(nlp, query_sentence, embed_mat)
display(df[['original_title', 'genres', 'sentence']].iloc[best_index])

  mat = np.array([query_embed.similarity(line) for line in embed_mat])


Unnamed: 0,original_title,genres,sentence
9003,Innocent Blood,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'name': 'Horror'}, {'id': 53, 'name': 'Thriller'}, {'id': 80, 'name': 'Crime'}]",A beautiful vampire turns a crime lord into a creature of the night.
10493,Requiem pour un vampire,"[{'id': 27, 'name': 'Horror'}]",A vampire lures beautiful young women to his castle in Europe.
18224,Miss Bala,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}]",The story of a young woman clinging on to her dream to become a beauty contest queen in a Mexico dominated by organized crime.


# Word2Vec

## Training

In [11]:
from gensim.models.word2vec import Word2Vec

# Create model
word2vec_model = Word2Vec(min_count=0, workers = 8, vector_size=300) 
# Prepare vocab
word2vec_model.build_vocab(df.tok_lem_sentence.values)
# Train
word2vec_model.train(df.tok_lem_sentence.values, total_examples=word2vec_model.corpus_count, epochs=30)

(25184905, 33081030)

## Predict

In [12]:
def is_word_in_model(word, model):
    """
    Check on individual words ``word`` that it exists in ``model``.
    """
    assert type(model).__name__ == 'KeyedVectors'
    is_in_vocab = word in model.key_to_index.keys()
    return is_in_vocab

def predict_w2v(query_sentence, dataset, model, topk=3):
    query_sentence = query_sentence.split()
    in_vocab_list, best_index = [], [0]*topk
    for w in query_sentence:
        # remove unseen words from query sentence
        if is_word_in_model(w, model.wv):
            in_vocab_list.append(w)
    # Retrieve the similarity between two words as a distance
    if len(in_vocab_list) > 0:
        sim_mat = np.zeros(len(dataset))  # TO DO
        for i, data_sentence in enumerate(dataset):
            if data_sentence:
                sim_sentence = model.wv.n_similarity(
                        in_vocab_list, data_sentence)
            else:
                sim_sentence = 0
            sim_mat[i] = np.array(sim_sentence)
        # Take the five highest norm
        best_index = np.argsort(sim_mat)[::-1][:topk]
    return best_index

# Predict
best_index = predict_w2v(query_sentence, df['tok_lem_sentence'].values, word2vec_model)    
display(df[['original_title', 'genres', 'sentence']].iloc[best_index])

Unnamed: 0,original_title,genres,sentence
9232,苏州河,"[{'id': 18, 'name': 'Drama'}, {'id': 10769, 'name': 'Foreign'}, {'id': 10749, 'name': 'Romance'}]",A tragic love story set in contemporary Shanghai. The film stars Zhou Xun in a dual role as two different women and Jia Hongsheng as a man obsessed with finding a woman from his past.
602,Heavy Metal,"[{'id': 16, 'name': 'Animation'}, {'id': 878, 'name': 'Science Fiction'}]","A glowing orb terrorizes a young girl with a collection of stories of dark fantasy, eroticism and horror."
10786,La sirène du Mississipi,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]","Adapted from a story by William Irish, it's a noirish tale of a man who orders a mail-order bride but receives instead a con woman."


# Transformers 

## Load pretrained

In [None]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
corpus_embeddings = model.encode(df.sentence.values, convert_to_tensor=True)
query_embedding = model.encode(query_sentence, convert_to_tensor=True)


## Predict

In [None]:
import torch

# We use cosine-similarity and torch.topk to find the highest 3 scores
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(cos_scores, k=3)

print("\n\n======================\n\n")
print("Query:", query_sentence)
print("\nTop 5 most similar sentences in corpus:")

for score, idx in zip(top_results[0], top_results[1]):
    score = score.cpu().data.numpy() 
    idx = idx.cpu().data.numpy()
    display(df[['original_title', 'genres', 'sentence']].iloc[idx])  

### CODE REFERENCES: 
https://github.com/topics/movie-recommendation-system