### Installing and importing libraries, uploading data
We will start by installing sentence-transformers, which is used for text similarity.

In [1]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


As the data file is uploaded from the local drive, we will use the code below to upload the CSV file from the local system to Colab. Some pre-processing steps were applied to the file downloaded from Kaggle; those steps can be seen in the other code file uploaded.

In [2]:
# Ask the user to pick the data file through Colab's upload helper; the result
# is indexed by file name when the CSV is read in the next cell.
from google.colab import files
upload = files.upload()

Saving movies.csv to movies (1).csv


In [3]:
import pandas as pd
import re
import io
 
# Prefer the expected key, but fall back to the first uploaded file: the upload
# log above shows Colab saving the file under a de-duplicated name
# ("movies (1).csv"), and the dict key may follow that name -- TODO confirm for
# the Colab version in use.
if not upload:
  raise FileNotFoundError("No file was uploaded; re-run the upload cell and select movies.csv")

csv_bytes = upload['movies.csv'] if 'movies.csv' in upload else next(iter(upload.values()))
df = pd.read_csv(io.BytesIO(csv_bytes))

In [4]:
# loading the specific sentence-transformers model we will be using for text similarity
from sentence_transformers import SentenceTransformer, util

# `model` is the encoder used by get_embedding; `util` supplies the cosine-similarity helper
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [5]:
# displaying the first rows of the loaded data as a quick sanity check
df.head()

Unnamed: 0,genres,overview,spoken_languages,title,Animation,Comedy,Family,Adventure,Fantasy,Romance,...,Horror,History,ScienceFiction,Mystery,War,Foreign,Music,Documentary,Western,TVMovie
0,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...",['English'],toy story,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,"['English', 'Franais']",jumanji,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,['English'],grumpier old men,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,"['Comedy', 'Drama', 'Romance']","Cheated on, mistreated and stepped on, the wom...",['English'],waiting to exhale,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,['Comedy'],Just when George Banks has recovered from his ...,['English'],father of the bride part ii,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Defining functions

In [6]:
# defining function to get the word embedding for given description
def get_embedding(desc):
  """Encode a description with the module-level sentence-transformer ``model``.

  Args:
    desc: Text to embed; it is passed to ``model.encode`` unchanged.

  Returns:
    Whatever ``model.encode`` produces for ``desc``.
  """
  embedding = model.encode(desc)
  return embedding

In [7]:
# defining function to get cosine similarity between two texts
def get_cosine_similarity(desc, input_desc, threshold=0.5):
  """Return the cosine similarity of two embeddings as a plain float.

  Args:
    desc: Embedding of a candidate description.
    input_desc: Embedding of the reference description.
    threshold: Minimum similarity worth reporting; anything below it is
      treated as "not similar" and collapsed to 0.0. Defaults to 0.5.

  Returns:
    float: The similarity when it is at least ``threshold``, otherwise 0.0.
  """
  # The helper returns a tensor; .item() unwraps its single value (one embedding
  # is expected on each side) so that columns built from this function hold
  # ordinary numbers instead of a mix of tensors and the integer 0.
  similarity = util.pytorch_cos_sim(desc, input_desc).item()
  return similarity if similarity >= threshold else 0.0

In [8]:
def get_similar_movies(movie_name):
  """Collect every other movie that shares at least one genre with ``movie_name``.

  Args:
    movie_name: Title of the reference movie; it is lower-cased before being
      compared with the ``title`` column of ``df``.

  Returns:
    DataFrame with ``title``, ``overview`` and one indicator column per genre
    of the reference movie (1 where the movie has that genre, 0 otherwise).
    Rows without an overview and the reference movie itself are excluded.

  Raises:
    KeyError: If no row of ``df`` has the requested title.
    ValueError: If the reference movie has no genres listed.
  """
  title = movie_name.lower()

  matching_genres = df.loc[df['title'] == title, 'genres']
  if matching_genres.empty:
    raise KeyError(f"movie not found: {title}")

  # .iloc[0] takes the first match by position. Plain [0] looks up the row
  # *label* 0, which only exists in the selection when the movie is row 0.
  genres_text = matching_genres.iloc[0]

  # since genres are in string format, using regex to clear punctuations and
  # recover the names of the genre indicator columns
  genre_cols = [re.sub(r'[^a-zA-Z]', '', item) for item in genres_text.split(',')]
  genre_cols = [name for name in genre_cols if name]
  if not genre_cols:
    raise ValueError(f"no genres listed for movie: {title}")

  # outer-merging the movies of each genre so a movie appears once, with one
  # indicator column per genre it shares with the selected movie
  similar = None
  for genre in genre_cols:
    genre_movies = df.loc[df[genre] == 1, ['title', 'overview', genre]]
    if similar is None:
      similar = genre_movies
    else:
      similar = similar.merge(genre_movies, on=["title", "overview"], how="outer")

  # dropping null values in 'overview' column if any
  similar = similar.dropna(subset=['overview'])

  # filling null values for genres with 0 (movie lacks that genre)
  similar = similar.fillna(0)

  # dropping the input movie name
  similar = similar[similar['title'] != title]

  return similar

In [9]:
def get_input_embedding(movie_name):
  """Embed the overview of the movie named ``movie_name``.

  Args:
    movie_name: Title of the movie; it is lower-cased before being compared
      with the ``title`` column of ``df``.

  Returns:
    The result of ``get_embedding`` for a one-item list holding the overview
    of the first matching movie.

  Raises:
    KeyError: If no row of ``df`` has the requested title.
  """
  # getting overview of input movie
  matching_overviews = df.loc[df['title'] == movie_name.lower(), 'overview']
  if matching_overviews.empty:
    raise KeyError(f"movie not found: {movie_name.lower()}")

  # Hand the encoder a plain one-item list rather than the filtered Series.
  # The Series keeps the movie's original row label, which is fragile for
  # integer indexing unless the movie is row 0, and duplicate titles would
  # yield several embeddings that get_cosine_similarity cannot compare against
  # its single threshold.
  input_overview = matching_overviews.iloc[0]

  # embedding overview of input movie
  return get_embedding([input_overview])

In [10]:
# getting embedding and similarities
def calculate_similarity(movies, movie_name):
  """Add embedding and similarity columns to ``movies`` and return it.

  Args:
    movies: DataFrame with an ``overview`` column; it is modified in place.
    movie_name: Title of the movie whose overview is the point of comparison.

  Returns:
    The same ``movies`` frame with two added columns: ``overview_embedding``
    (``get_embedding`` of each overview) and ``similarity``
    (``get_cosine_similarity`` of each embedding against the input movie).
  """
  reference_embedding = get_input_embedding(movie_name)

  # one embedding per candidate overview
  movies['overview_embedding'] = movies['overview'].apply(get_embedding)

  # compare every candidate embedding with the input movie's embedding
  def similarity_to_reference(embedding):
    return get_cosine_similarity(embedding, input_desc=reference_embedding)

  movies['similarity'] = movies['overview_embedding'].apply(similarity_to_reference)

  return movies

In [11]:
def get_scores(movie_name):
  """Rank the movies sharing a genre with ``movie_name`` by a weighted score.

  Args:
    movie_name: Title of the reference movie.

  Returns:
    DataFrame of candidate movies with ``sum_genres`` (sum of the genre
    indicator columns) and ``score`` (``sum_genres * 0.6 + similarity * 0.4``)
    added, sorted by ``score`` in descending order.
  """
  # candidates sharing a genre, then enriched with embeddings and similarity
  candidates = calculate_similarity(get_similar_movies(movie_name), movie_name)

  # everything except these bookkeeping columns is a genre indicator
  genre_cols = candidates.columns.tolist()
  for bookkeeping in ('title', 'overview', 'overview_embedding', 'similarity'):
    genre_cols.remove(bookkeeping)

  # number of genres each candidate shares with the reference movie
  genre_total = candidates[genre_cols[0]]
  for column in genre_cols[1:]:
    genre_total = genre_total + candidates[column]
  candidates['sum_genres'] = genre_total

  # weighted blend of genre overlap and description similarity
  candidates['score'] = candidates['sum_genres']*0.6 + candidates['similarity']*0.4

  # best match first
  return candidates.sort_values(by='score', ascending=False)

In [12]:
# stripping surrounding whitespace so a stray space in the typed title does not
# cause a false "cannot find" result
movie_name = input("Enter name of a movie: ").strip()

if movie_name.lower() in df['title'].tolist():
  movies = get_scores(movie_name)
  print("\nTop 10 Similar movies are:")
  for name in movies['title'].head(10).tolist():
    # capitalising word by word; str.title() would also upper-case the letter
    # after an apostrophe (e.g. "Dexter'S")
    print("\t", " ".join(word.capitalize() for word in name.split()))
else:
  print("Cannot find this movie in database! Please try again")

Enter name of a movie: Toy story

Top 10 Similar movies are:
	 Toy Story 3
	 Toy Story 2
	 Despicable Me 2
	 Tri Bogatyrya I Shamakhanskaya Tsaritsa
	 Scary Godmother: Halloween Spooktakular
	 Dexter'S Laboratory: Ego Trip
	 Frankenweenie
	 Hotel Transylvania
	 One Froggy Evening
	 Sing
