# Movie recommendation system
Explore different models for semantic text matching.


In [1]:
import pandas as pd

# Load dataset
Data is already prepared in folder ``data`` of this repository. Source: https://www.kaggle.com/rounakbanik/the-movies-dataset.


In [2]:
DATAPATH = 'data/movies_metadata.csv' 

df = pd.read_csv(DATAPATH)
df = df[~df.overview.isna()]
df.rename(columns={'overview':'sentence'}, inplace=True)
print(len(df))
df = df.iloc[:20000]
df.head()

44512


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,sentence,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

STOPWORDS = set(stopwords.words('english'))
MIN_WORDS = 4
MAX_WORDS = 200

PATTERN_S = re.compile("\'s")  # matches `'s` from text  
PATTERN_RN = re.compile("\\r\\n") #matches `\r` and `\n`
PATTERN_PUNC = re.compile(r"[^\w\s]") # matches all non 0-9 A-z whitespace 

def clean_text(text):
    """
    Series of cleaning. String to lower case, remove non words characters and numbers.
        text (str): input text
    return (str): modified initial text
    """
    text = text.lower()  # lowercase text
    text = re.sub(PATTERN_S, ' ', text)
    text = re.sub(PATTERN_RN, ' ', text)
    text = re.sub(PATTERN_PUNC, ' ', text)
    return text

def tokenizer(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True):
    """
    Lemmatize, tokenize, crop and remove stop words.
    """
    if lemmatize:
        stemmer = WordNetLemmatizer()
        tokens = [stemmer.lemmatize(w) for w in word_tokenize(sentence)]
    else:
        tokens = [w for w in word_tokenize(sentence)]
    token = [w for w in tokens if (len(w) > min_words and len(w) < max_words
                                                        and w not in stopwords)]
    return tokens    


def clean_sentences(df):
    """
    Remove irrelavant characters (in new column clean_sentence).
    Lemmatize, tokenize words into list of words (in new column tok_lem_sentence).
    """
    print('Cleaning sentences...')
    df['clean_sentence'] = df['sentence'].apply(clean_text)
    df['tok_lem_sentence'] = df['clean_sentence'].apply(
        lambda x: tokenizer(x, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True))
    return df
    
df = clean_sentences(df)

Cleaning sentences...


In [4]:
print(len(df))
df[['sentence', 'clean_sentence', 'tok_lem_sentence']]

20000


Unnamed: 0,sentence,clean_sentence,tok_lem_sentence
0,"Led by Woody, Andy's toys live happily in his ...",led by woody andy toys live happily in his r...,"[led, by, woody, andy, toy, live, happily, in,..."
1,When siblings Judy and Peter discover an encha...,when siblings judy and peter discover an encha...,"[when, sibling, judy, and, peter, discover, an..."
2,A family wedding reignites the ancient feud be...,a family wedding reignites the ancient feud be...,"[a, family, wedding, reignites, the, ancient, ..."
3,"Cheated on, mistreated and stepped on, the wom...",cheated on mistreated and stepped on the wom...,"[cheated, on, mistreated, and, stepped, on, th..."
4,Just when George Banks has recovered from his ...,just when george banks has recovered from his ...,"[just, when, george, bank, ha, recovered, from..."
...,...,...,...
20131,"After a lifetime of hiding, Chely Wright becom...",after a lifetime of hiding chely wright becom...,"[after, a, lifetime, of, hiding, chely, wright..."
20132,"In 1989, five black and Latino teenagers from ...",in 1989 five black and latino teenagers from ...,"[in, 1989, five, black, and, latino, teenager,..."
20133,Arkin escapes with his life from the vicious g...,arkin escapes with his life from the vicious g...,"[arkin, escape, with, his, life, from, the, vi..."
20134,"Remake of a hit film from 1990, ""The Cherry Or...",remake of a hit film from 1990 the cherry or...,"[remake, of, a, hit, film, from, 1990, the, ch..."


## Query sentence

In [5]:
query_sentence = 'a crime story with a beautiful woman' 

pd.options.display.max_colwidth = 500

## Util function

In [6]:
def extract_best_indices(m, topk, mask=None):
    """
    Use sum of the cosine distance over all tokens.
    m (np.array): cos matrix of shape (nb_in_tokens, nb_dict_tokens)
    topk (int): number of indices to return (from high to lowest in order)
    """
    # return the sum on all tokens of cosinus for each sentence
    if len(m.shape) > 1:
        cos_sim = np.mean(m, axis=0) 
    else: 
        cos_sim = m
    index = np.argsort(cos_sim)[::-1] # from highest idx to smallest score 
    if mask is not None:
        assert mask.shape == m.shape
        mask = mask[index]
    else:
        mask = np.ones(len(cos_sim))
    mask = np.logical_or(cos_sim[index] != 0, mask) #eliminate 0 cosine distance
    best_index = index[mask][:topk]  
    return best_index

# TF-IDF

## Training

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Adapt stop words
token_stop = tokenizer(' '.join(STOPWORDS), lemmatize=False)

# Fit TFIDF
vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer) 
tfidf_mat = vectorizer.fit_transform(df['sentence'].values) # -> (num_sentences, num_vocabulary)
tfidf_mat.shape



(20000, 49931)

## Prediction

In [8]:
query_sentence

'a crime story with a beautiful woman'

In [9]:
def get_recommendations_tfidf(sentence, tfidf_mat):
    
    """
    Return the database sentences in order of highest cosine similarity relatively to each 
    token of the target sentence. 
    """
    # Embed the query sentence
    tokens = [str(tok) for tok in tokenizer(sentence)]
    vec = vectorizer.transform(tokens)
    # Create list with similarity between query and dataset
    mat = cosine_similarity(vec, tfidf_mat)
    # Best cosine distance for each token independantly
    print(mat.shape)
    best_index = extract_best_indices(mat, topk=3)
    return best_index


best_index = get_recommendations_tfidf(query_sentence, tfidf_mat)

display(df[['original_title', 'genres', 'sentence']].iloc[best_index])


(7, 20000)


Unnamed: 0,original_title,genres,sentence
9003,Innocent Blood,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'name': 'Horror'}, {'id': 53, 'name': 'Thriller'}, {'id': 80, 'name': 'Crime'}]",A beautiful vampire turns a crime lord into a creature of the night.
10493,Requiem pour un vampire,"[{'id': 27, 'name': 'Horror'}]",A vampire lures beautiful young women to his castle in Europe.
18224,Miss Bala,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}]",The story of a young woman clinging on to her dream to become a beauty contest queen in a Mexico dominated by organized crime.


# Spacy

## Load pretrained

In [10]:
import spacy

# !python -m spacy download en_core_web_lg

#Load pre-trained model
nlp = spacy.load("en_core_web_lg") 
# Apply the model to the sentences
df['spacy_sentence'] = df['sentence'].apply(lambda x: nlp(x)) 
# Retrieve the embedded vectors as a matrix 
embed_mat = df['spacy_sentence'].values
embed_mat.shape

(20000,)

In [11]:
def predict_spacy(model, query_sentence, embed_mat, topk=5):
    """
    Predict the topk sentences after applying spacy model.
    """
    query_embed = model(query_sentence)
    mat = np.array([query_embed.similarity(line) for line in embed_mat])
    # keep if vector has a norm
    mat_mask = np.array(
        [True if line.vector_norm else False for line in embed_mat])
    best_index = extract_best_indices(mat, topk=topk, mask=mat_mask)
    return best_index

# Predict
predict_spacy(nlp, query_sentence, embed_mat)
display(df[['original_title', 'genres', 'sentence']].iloc[best_index])

  mat = np.array([query_embed.similarity(line) for line in embed_mat])


Unnamed: 0,original_title,genres,sentence
9003,Innocent Blood,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'name': 'Horror'}, {'id': 53, 'name': 'Thriller'}, {'id': 80, 'name': 'Crime'}]",A beautiful vampire turns a crime lord into a creature of the night.
10493,Requiem pour un vampire,"[{'id': 27, 'name': 'Horror'}]",A vampire lures beautiful young women to his castle in Europe.
18224,Miss Bala,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}]",The story of a young woman clinging on to her dream to become a beauty contest queen in a Mexico dominated by organized crime.


# Word2Vec

## Training

In [12]:
from gensim.models.word2vec import Word2Vec

# Create model
word2vec_model = Word2Vec(min_count=0, workers = 8, vector_size=300) 
# Prepare vocab
word2vec_model.build_vocab(df.tok_lem_sentence.values)
# Train
word2vec_model.train(df.tok_lem_sentence.values, total_examples=word2vec_model.corpus_count, epochs=30)



(25186881, 33081030)

## Predict

In [14]:
def is_word_in_model(word, model):
    """
    Check on individual words ``word`` that it exists in ``model``.
    """
    assert type(model).__name__ == 'KeyedVectors'
    is_in_vocab = word in model.key_to_index.keys()
    return is_in_vocab

def predict_w2v(query_sentence, dataset, model, topk=3):
    query_sentence = query_sentence.split()
    in_vocab_list, best_index = [], [0]*topk
    for w in query_sentence:
        # remove unseen words from query sentence
        if is_word_in_model(w, model.wv):
            in_vocab_list.append(w)
    # Retrieve the similarity between two words as a distance
    if len(in_vocab_list) > 0:
        sim_mat = np.zeros(len(dataset))  # TO DO
        for i, data_sentence in enumerate(dataset):
            if data_sentence:
                sim_sentence = model.wv.n_similarity(
                        in_vocab_list, data_sentence)
            else:
                sim_sentence = 0
            sim_mat[i] = np.array(sim_sentence)
        # Take the five highest norm
        best_index = np.argsort(sim_mat)[::-1][:topk]
    return best_index

# Predict
best_index = predict_w2v(query_sentence, df['tok_lem_sentence'].values, word2vec_model)    
display(df[['original_title', 'genres', 'sentence']].iloc[best_index])

Unnamed: 0,original_title,genres,sentence
9232,苏州河,"[{'id': 18, 'name': 'Drama'}, {'id': 10769, 'name': 'Foreign'}, {'id': 10749, 'name': 'Romance'}]",A tragic love story set in contemporary Shanghai. The film stars Zhou Xun in a dual role as two different women and Jia Hongsheng as a man obsessed with finding a woman from his past.
14464,Printed Rainbow,"[{'id': 16, 'name': 'Animation'}]",A matchbox collection unites a lonely woman and her cat.
602,Heavy Metal,"[{'id': 16, 'name': 'Animation'}, {'id': 878, 'name': 'Science Fiction'}]","A glowing orb terrorizes a young girl with a collection of stories of dark fantasy, eroticism and horror."


# Transformers 

## Load pretrained

In [15]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
corpus_embeddings = model.encode(df.sentence.values, convert_to_tensor=True)
query_embedding = model.encode(query_sentence, convert_to_tensor=True)


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]



## Predict

In [16]:
import torch

# We use cosine-similarity and torch.topk to find the highest 3 scores
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(cos_scores, k=3)

print("\n\n======================\n\n")
print("Query:", query_sentence)
print("\nTop 5 most similar sentences in corpus:")

for score, idx in zip(top_results[0], top_results[1]):
    score = score.cpu().data.numpy() 
    idx = idx.cpu().data.numpy()
    display(df[['original_title', 'genres', 'sentence']].iloc[idx])  





Query: a crime story with a beautiful woman

Top 5 most similar sentences in corpus:


original_title                                                                                                                         Miss Bala
genres                                                                               [{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}]
sentence          The story of a young woman clinging on to her dream to become a beauty contest queen in a Mexico dominated by organized crime.
Name: 18224, dtype: object

original_title                                                                                                                                                                                                                                                                                                                                 In the Cut
genres                                                                                                                                                                                                                                                                                  [{'id': 9648, 'name': 'Mystery'}, {'id': 53, 'name': 'Thriller'}]
sentence          Following the gruesome murder of a young woman in her neighborhood, a self-determined woman living in New York City--as if to test the limits of her own safety--propels herself into an impossibly risky sexual liaison. Soon she grows increasingly wary about the motives of every man with who

original_title                                                                                    The World of Suzie Wong
genres                                                    [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]
sentence          Story of the love between a struggling American artist and a beautiful Chinese prostitute in Hong Kong.
Name: 8018, dtype: object

# BERT with hugging face
We write a class to load the dataset, embed it in the object and use is to compute distance with embed query sentence.

In [76]:
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import numpy as np
from tqdm import tqdm


BERT_BATCH_SIZE = 4
MODEL_NAME = 'sentence-transformers/paraphrase-MiniLM-L6-v2'

class BertModel:
    def __init__(self, model_name, device=-1, small_memory=True, batch_size=BERT_BATCH_SIZE):
        self.model_name = model_name
        self._set_device(device)
        self.small_device = 'cpu' if small_memory else self.device
        self.batch_size = batch_size
        self.load_pretrained_model()

    def _set_device(self, device):
        if device == -1 or device == 'cpu':
            self.device = 'cpu'
        elif device == 'cuda' or device == 'gpu':
            self.device = 'cuda'
        elif isinstance(device, int) or isinstance(device, float):
            self.device = 'cuda'
        else:  # default
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")

    def load_pretrained_model(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        device = -1 if self.device == 'cpu' else 0
        self.pipeline = pipeline('feature-extraction',
                                 model=self.model, tokenizer=self.tokenizer, device=device)

    def embed(self, data):
        """ Create the embedded matrice from original sentences """
        nb_batchs = 1 if (len(data) < self.batch_size) else len(
            data) // self.batch_size
        batchs = np.array_split(data, nb_batchs)
        mean_pooled = []
        for batch in tqdm(batchs, total=len(batchs), desc='Training...'):
            mean_pooled.append(self.transform(batch))
        mean_pooled_tensor = torch.tensor(
            len(data), dtype=float).to(self.small_device)
        mean_pooled = torch.cat(mean_pooled, out=mean_pooled_tensor)
        self.embed_mat = mean_pooled

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(
            -1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def transform(self, data):
        if 'str' in data.__class__.__name__:
            data = [data]
        data = list(data)
        token_dict = self.tokenizer(
            data,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt")
        token_dict = self.to(token_dict, self.device)
        with torch.no_grad():
            token_embed = self.model(**token_dict)
        # each of the 512 token has a 768 or 384-d vector depends on model)
        attention_mask = token_dict['attention_mask']
        # average pooling of masked embeddings
        mean_pooled = self.mean_pooling(
            token_embed, attention_mask)
        mean_pooled = mean_pooled.to(self.small_device)
        return mean_pooled
    
    def to(self, data: dict, device: str):
        """Send all values to device by calling v.to(device)"""
        data = {k: v.to(device) for k, v in data.items()}
        return data

    def predict(self, in_sentence, topk=3):
        input_vec = self.transform(in_sentence)
        mat = cosine_similarity(input_vec, self.embed_mat)
        # best cos sim for each token independantly
        best_index = extract_best_indices(mat, topk=topk)
        return best_index

## Training

In [77]:
# CPU training
bert_model = BertModel(model_name=MODEL_NAME, batch_size=BERT_BATCH_SIZE)
bert_model.embed(df.sentence.values)

Training...: 100%|██████████████████████████| 5000/5000 [10:33<00:00,  7.90it/s]


In [None]:
# GPU training
bert_model_gpu = BertModel(model_name=MODEL_NAME, batch_size=BERT_BATCH_SIZE, device='cuda')
bert_model_gpu.transform(df.sentence.values)

## Prediction

In [11]:
query_sentence = 'the story of a waitress'
indices = bert_model_gpu.predict(query_sentence)
display(df[['original_title', 'genres', 'sentence']].iloc[indices])


dim (1, 20000)


Unnamed: 0,original_title,genres,sentence
14496,The Princess and the Frog,"[{'id': 10749, 'name': 'Romance'}, {'id': 10751, 'name': 'Family'}, {'id': 16, 'name': 'Animation'}, {'id': 10402, 'name': 'Music'}]","A waitress, desperate to fulfill her dreams as a restaurant owner, is set on a journey to turn a frog prince back into a human being, but she has to do face the same problem after she kisses him."
5873,WiseGirls,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}]",A new waitress working at an Italian restaurant in New York City finds herself entangled in a mob-run underworld of drug dealing and murder.
11700,Fauteuils d'orchestre,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]","A young woman arrives in Paris where she finds a job as a waitress in bar next on Avenue Montaigne that caters to the surrounding theaters and the wealthy inhabitants of the area. She will meet a pianist, a famous actress and a great art collector, and become acquainted with the ""luxurious"" world her grandmother has told her about since her childhood."
