In [73]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
import kagglehub

path=kagglehub.dataset_download("shivamb/netflix-shows")

print(path)

/root/.cache/kagglehub/datasets/shivamb/netflix-shows/versions/5


In [75]:
data=pd.read_csv(path+"/netflix_titles.csv")
data=data[["title","description"]]
data.dropna()


Unnamed: 0,title,description
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm..."
1,Blood & Water,"After crossing paths at a party, a Cape Town t..."
2,Ganglands,To protect his family from a powerful drug lor...
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo..."
4,Kota Factory,In a city of coaching centers known to train I...
...,...,...
8802,Zodiac,"A political cartoonist, a crime reporter and a..."
8803,Zombie Dumb,"While living alone in a spooky town, a young g..."
8804,Zombieland,Looking to survive in a world taken over by zo...
8805,Zoom,"Dragged from civilian life, a former superhero..."


In [76]:
def preprocess(text):
  text = text.lower()
  token= word_tokenize(text)
  for i in token:
    if i in stopwords.words("english"):
      token.remove(i)
  return token

data['tokens']=data['description'].apply(preprocess)


In [77]:
data.head(10)

Unnamed: 0,title,description,tokens
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...","[her, father, nears, end, life, ,, filmmaker, ..."
1,Blood & Water,"After crossing paths at a party, a Cape Town t...","[crossing, paths, party, ,, cape, town, teen, ..."
2,Ganglands,To protect his family from a powerful drug lor...,"[protect, family, a, powerful, drug, lord, ,, ..."
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...","[feuds, ,, flirtations, toilet, talk, go, amon..."
4,Kota Factory,In a city of coaching centers known to train I...,"[a, city, coaching, centers, known, train, ind..."
5,Midnight Mass,The arrival of a charismatic young priest brin...,"[arrival, a, charismatic, young, priest, bring..."
6,My Little Pony: A New Generation,Equestria's divided. But a bright-eyed hero be...,"[equestria, 's, divided, ., a, bright-eyed, he..."
7,Sankofa,"On a photo shoot in Ghana, an American model s...","[a, photo, shoot, ghana, ,, american, model, s..."
8,The Great British Baking Show,A talented batch of amateur bakers face off in...,"[talented, batch, amateur, bakers, face, 10-we..."
9,The Starling,A woman adjusting to life after a loss contend...,"[woman, adjusting, life, loss, contends, a, fe..."


In [78]:
from gensim.models import Word2Vec
# Train Word2Vec model
w2v_model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=1, workers=4)


In [79]:
import numpy as np

def get_average_vector(tokens,model):
  vectors = [model.wv[word] for word in tokens if word in model.wv]
  return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

data['vector'] = data['tokens'].apply(lambda x: get_average_vector(x, w2v_model))


In [80]:
data.head(5)

Unnamed: 0,title,description,tokens,vector
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...","[her, father, nears, end, life, ,, filmmaker, ...","[-0.54065865, 0.8447234, 0.18652605, -0.131015..."
1,Blood & Water,"After crossing paths at a party, a Cape Town t...","[crossing, paths, party, ,, cape, town, teen, ...","[-0.5610811, 0.87403524, 0.195207, -0.13334759..."
2,Ganglands,To protect his family from a powerful drug lor...,"[protect, family, a, powerful, drug, lord, ,, ...","[-0.60243845, 0.9467917, 0.20857409, -0.144842..."
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...","[feuds, ,, flirtations, toilet, talk, go, amon...","[-0.5354969, 0.8489674, 0.20005643, -0.1540569..."
4,Kota Factory,In a city of coaching centers known to train I...,"[a, city, coaching, centers, known, train, ind...","[-0.56461143, 0.8800056, 0.19456911, -0.136796..."


In [81]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(input_movie, df, num_recommendations=5):
  input_movie = input_movie.lower()

  if input_movie not in df['title'].str.lower().values:
    print(f"Movie '{input_movie}' not found in the dataset.")
    return

  input_vector = df[df['title'].str.lower() == input_movie]['vector'].values[0]

  df['similarity']=df['vector'].apply(lambda x : cosine_similarity([input_vector], [x])[0][0])

  # Sort by similarity and exclude the input movie itself
  recommendations = df[df['title'].str.lower() != input_movie].sort_values(by='similarity', ascending=False)

  return recommendations[['title','description','similarity']].head(num_recommendations)





In [84]:
input_movie='Never Back Down'
recommendations = recommend_movies(input_movie, data)
print(recommendations)

               title                                        description  \
5980   1 Mile to You  After escaping the bus accident that killed hi...   
218   Titletown High  In a Georgia town where football rules and win...   
1356      My Dead Ex  High school teens Charley and Ben find themsel...   
6662          Easy A  When a lie about Olive's reputation transforms...   
6392         Burning  An aspiring writer goes to the airport to pick...   

      similarity  
5980    0.999979  
218     0.999978  
1356    0.999976  
6662    0.999974  
6392    0.999974  
