# Movie match

## imports

In [1]:
!pip install sentence_transformers

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


## Load data (kaggle) and some pre-processing

In [3]:
# load movie metadata
movie_meta=pd.read_csv("movies_metadata.csv")

  movie_meta=pd.read_csv("movies_metadata.csv")


In [4]:
movie_meta.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
# load ratings
ratings_data=pd.read_csv("ratings.csv")
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [6]:
ratings_data=ratings_data.rename(columns={"movieId": "id"})

In [7]:
# combine ratings with movie metadata using movie id
movie_meta['id'] = movie_meta['id'].astype(str)
ratings_data['id'] = ratings_data['id'].astype(str)

movie_df=pd.merge(movie_meta,ratings_data, on='id')

In [8]:
movie_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'userId', 'rating', 'timestamp'],
      dtype='object')

In [9]:
movie_df.shape

(11437637, 27)

In [10]:
movie_df= movie_df.dropna()

In [11]:
movie_df=movie_df.drop_duplicates(subset='title', keep='first')

In [12]:
# well thats a huge shock! so many duplicates of a movie?
movie_df.shape

(209, 27)

## Get embeddings 

In [13]:
# Load embedding model
emb_model = SentenceTransformer("all-MiniLM-L6-v2")

temp=movie_df.copy()

# Precompute movie embeddings
temp["overview_embedding"] = temp["overview"].apply(lambda x: emb_model.encode(x, convert_to_tensor=True))
temp["tagline_embedding"] = temp["tagline"].apply(lambda x: emb_model.encode(x, convert_to_tensor=True))

In [14]:
temp.head(2)['overview']

2842     James Bond must unmask the mysterious head of ...
79364    Seth Gecko and his younger brother Richard are...
Name: overview, dtype: object

## Predict which movie would be best for the user

In [15]:
user_query = "I want an animated movie that kids will like for a birthday, it should be cute"

In [16]:
user_embedding = emb_model.encode(user_query, convert_to_tensor=True)

# Compute similarities
temp["similarity"] = temp["overview_embedding"].apply(lambda x: util.pytorch_cos_sim(x, user_embedding).item())

# Get top 3 matches
top_matches = temp.sort_values("similarity", ascending=False).head(3)
display(top_matches[["title", "overview", "rating"]].sort_values("rating",ascending=False).head(1))


Unnamed: 0,title,overview,rating
1384293,Fantasia,Walt Disney's timeless masterpiece is an extra...,3.0
