# Netflix popular movies recommendation
- Dataset from kaggle https://www.kaggle.com/datasets/narayan63/netflix-popular-movies-dataset
- only 200 row for easy use

## Data Loading

In [1]:
import pandas as pd

# Load data from fixed movies.csv
df = pd.read_csv("movies.csv")
df.head(10)

Unnamed: 0,title,description
0,Cobra Kai,Decades after their 1984 All Valley Karate Tou...
1,The Crown,Follows the political rivalries and romance of...
2,Better Call Saul,The trials and tribulations of criminal lawyer...
3,Devil in Ohio,When a psychiatrist shelters a mysterious cult...
4,Cyberpunk: Edgerunners,A Street Kid trying to survive in a technology...
5,The Sandman,Upon escaping after decades of imprisonment by...
6,Rick and Morty,An animated series that follows the exploits o...
7,Breaking Bad,A high school chemistry teacher diagnosed with...
8,The Imperfects,After an experimental gene therapy turns them ...
9,Blonde,A fictionalized chronicle of the inner life of...


## Sentence Embedding
- Based on the description of movies

In [2]:
from sentence_transformers import SentenceTransformer

# embedd description of movies
sentences = list(df["description"])

model = SentenceTransformer('thenlper/gte-small')
embeddings = model.encode(sentences)

print("Embedding Shapes", embeddings.shape)

  from .autonotebook import tqdm as notebook_tqdm


Embedding Shapes (200, 384)


## FAISS to create indexes
- use faiss indexing method to train for faster searching

In [3]:
import faiss
import numpy as np

d = embeddings.shape[1]
nlist = int(np.sqrt(embeddings.shape[0])) # set square root of the dataset 
quantizer = faiss.IndexFlatL2(d)  # the quantizer
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

# Train the index
index.train(embeddings)
index.add(embeddings)

## Query Data and show ranking

In [4]:
# write your query here to see recommendations
# you can change query1-2 with query to search for a recommendation
query1 = "I want a movie that girl is the main charater"
query2 = "I like romance movies"
query = "South Park"

# embedd query text 
query_embedding = model.encode(query)
query_embedding = query_embedding.reshape(1, -1)

In [5]:
from time import time

k = 5  # number of nearest contents

start = time()
# search for top 5 similarity movies
Distance, Index_results = index.search(query_embedding, k)
end = time()
time_searching = end - start
print("Searching time: %fs" %(time_searching))

movies_title = list(df["title"])
result_movies_name = [movies_title[i] for i in Index_results.tolist()[0]]

result_movies_name

Searching time: 0.001015s


['South Park',
 'Seinfeld',
 'Rick and Morty',
 'I Used to Be Famous',
 "That '70s Show"]

In [6]:
Distance_list = Distance.tolist()[0]

result_table = pd.DataFrame(columns=["Ranking", "Title", "L2 distance"])
result_table["Ranking"] = [n+1 for n in range(len(Distance_list))]
result_table["Title"] = result_movies_name
result_table["L2 distance"] = Distance_list

In [7]:
from IPython.display import display, Markdown

display(Markdown("## Ranking of Recommendation"))
display(result_table)

## Ranking of Recommendation

Unnamed: 0,Ranking,Title,L2 distance
0,1,South Park,0.185832
1,2,Seinfeld,0.34197
2,3,Rick and Morty,0.348036
3,4,I Used to Be Famous,0.368147
4,5,That '70s Show,0.380745
