In [None]:
# Install the Kaggle CLI into the running kernel's environment
# (%pip targets the kernel's interpreter, which a `!pip` subshell is not guaranteed to do).
%pip install -q kaggle



In [None]:
# Upload the Kaggle API key (kaggle.json) needed to download the dataset

import os
from google.colab import files

# Prompt for a file upload; the selected kaggle.json is saved in the working directory
uploaded = files.upload()

# Make directory for kaggle configuration and move the API key into it
os.makedirs('/root/.kaggle', exist_ok=True)
os.rename('kaggle.json', '/root/.kaggle/kaggle.json')

# Restrict the key to owner read/write only. The mode must be an octal literal:
# decimal 600 equals 0o1130, which sets a completely different permission mask.
os.chmod('/root/.kaggle/kaggle.json', 0o600)

Saving kaggle.json to kaggle.json


In [None]:
# Pull the dataset from Kaggle into /content explicitly, so the unzip step's
# hard-coded /content/netflix-shows.zip path does not depend on the working directory
!kaggle datasets download -d shivamb/netflix-shows -p /content

Dataset URL: https://www.kaggle.com/datasets/shivamb/netflix-shows
License(s): CC0-1.0


In [None]:
# -o overwrites existing files without prompting, so re-running this cell never blocks on input
!unzip -o /content/netflix-shows.zip -d /content/

Archive:  /content/netflix-shows.zip
  inflating: /content/netflix_titles.csv  


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the extracted catalogue into a DataFrame and preview its first five rows
df = pd.read_csv('/content/netflix_titles.csv')
df.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
df.shape

(8807, 12)

In [None]:
# Data understanding: count the missing values in each column
df.isna().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,2634
cast,825
country,831
date_added,10
release_year,0
rating,4
duration,3


In [None]:
# Handling missing cast & director: absent credits become the placeholder 'Unknown'
df[['cast', 'director']] = df[['cast', 'director']].fillna('Unknown')

In [None]:
# Build space-free copies of the genre ('listed_in'), cast and director columns:
# every space is removed, so each multi-word genre or name becomes a single token.
for source_col, target_col in (
    ('listed_in', 'genre'),
    ('cast', 'new_cast'),
    ('director', 'new_director'),
):
    # The pattern is a literal space, so a plain (non-regex) replace is equivalent
    df[target_col] = df[source_col].str.replace(' ', '', regex=False)

In [None]:
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,genre,new_cast,new_director
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Documentaries,Unknown,KirstenJohnson
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...","InternationalTVShows,TVDramas,TVMysteries","AmaQamata,KhosiNgema,GailMabalane,ThabangMolab...",Unknown
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,"CrimeTVShows,InternationalTVShows,TVAction&Adv...","SamiBouajila,TracyGotoas,SamuelJouy,NabihaAkka...",JulienLeclercq
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...","Docuseries,RealityTV",Unknown,Unknown
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,"InternationalTVShows,RomanticTVShows,TVComedies","MayurMore,JitendraKumar,RanjanRaj,AlamKhan,Ahs...",Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a...","CultMovies,Dramas,Thrillers","MarkRuffalo,JakeGyllenhaal,RobertDowneyJr.,Ant...",DavidFincher
8803,s8804,TV Show,Zombie Dumb,Unknown,Unknown,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g...","Kids'TV,KoreanTVShows,TVComedies",Unknown,Unknown
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...,"Comedies,HorrorMovies","JesseEisenberg,WoodyHarrelson,EmmaStone,Abigai...",RubenFleischer
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero...","Children&FamilyMovies,Comedies","TimAllen,CourteneyCox,ChevyChase,KateMara,Ryan...",PeterHewitt


In [None]:
# Install sentence-transformers into the running kernel's environment;
# -q keeps the long dependency-resolution log out of the notebook output.
%pip install -q sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence-embedding model used for every feature below
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def _encode_column(column):
    """Encode every value of ``df[column]`` with ``model`` (convert_to_tensor=True)."""
    return model.encode(df[column].tolist(), convert_to_tensor=True)


# One embedding set per feature, produced in the same order as before
genre_embeddings = _encode_column('genre')
description_embeddings = _encode_column('description')
cast_embeddings = _encode_column('new_cast')
title_embeddings = _encode_column('title')
director_embeddings = _encode_column('new_director')

In [None]:
import pickle

# Bundle the per-feature embeddings under their feature names
embeddings_dict = dict(
    genre=genre_embeddings,
    description=description_embeddings,
    cast=cast_embeddings,
    title=title_embeddings,
    director=director_embeddings,
)

# Persist the whole bundle as one pickle file
with open('/content/embeddings.pkl', 'wb') as pkl_file:
    pickle.dump(embeddings_dict, pkl_file)

In [None]:
# Read the embeddings bundle back from the pickle file written by this notebook
with open('/content/embeddings.pkl', 'rb') as pkl_file:
    embeddings_dict = pickle.load(pkl_file)

# Unpack the individual feature embeddings by key
(
    genre_embeddings,
    description_embeddings,
    cast_embeddings,
    title_embeddings,
    director_embeddings,
) = (
    embeddings_dict[feature]
    for feature in ('genre', 'description', 'cast', 'title', 'director')
)

In [None]:
import torch

# Weighting factors: relative contribution of each feature (the five values sum to 1.0)
genre_weight = 0.4
description_weight = 0.4
title_weight = 0.1
cast_weight = 0.05
director_weight = 0.05

# Combine embeddings by weighted sum. Every feature with a declared weight needs a
# term here; a weight without a matching term is silently ignored in the result.
combined_embeddings = (
    genre_embeddings * genre_weight +
    description_embeddings * description_weight +
    title_embeddings * title_weight +
    cast_embeddings * cast_weight +
    director_embeddings * director_weight
)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_similar_movies_by_name(movie_name, df, combined_embeddings, top_n=5):
    """Print the catalogue entries most similar to the title ``movie_name``.

    Similarity is the cosine similarity between the queried row of
    ``combined_embeddings`` and every other row.

    Args:
        movie_name: Exact value to look up in ``df['title']``; the first
            matching row is used.
        df: DataFrame with ``title``, ``director``, ``cast``, ``listed_in``,
            ``rating`` and ``description`` columns. Rows are matched to
            ``combined_embeddings`` by position, so any index is accepted.
        combined_embeddings: 2-D tensor exposing ``.cpu().detach().numpy()``
            with one row per row of ``df``.
        top_n: Maximum number of recommendations to print (default 5).

    Raises:
        IndexError: If no row of ``df`` has ``movie_name`` as its title.
    """
    # Locate the title by position (not index label) so the lookup stays aligned
    # with the embedding rows even when df does not have a default RangeIndex.
    matching_positions = np.flatnonzero((df['title'] == movie_name).to_numpy())
    if matching_positions.size == 0:
        raise IndexError(f"No title matching '{movie_name}' was found in df['title']")
    movie_index = int(matching_positions[0])

    vectors = combined_embeddings.cpu().detach().numpy()

    # Cosine similarity of the queried row against every row. Only this one row is
    # needed, so the full N x N similarity matrix is never built.
    norms = np.linalg.norm(vectors, axis=1)
    norms[norms == 0] = 1.0  # avoid dividing by zero for all-zero embeddings
    similarities = (vectors @ vectors[movie_index]) / (norms * norms[movie_index])

    # Rank in descending order (stable, so ties keep catalogue order) and drop the
    # queried title explicitly instead of assuming it always sorts to the top.
    ranked = np.argsort(-similarities, kind='stable')
    similar_movie_indices = ranked[ranked != movie_index][:top_n]

    # Print the recommended similar movies with their details and similarity percentages
    print(f"Movies similar to '{df['title'].iloc[movie_index]}':")
    for idx in similar_movie_indices:
        movie = df.iloc[idx]
        similarity_percentage = similarities[idx] * 100  # Convert similarity to percentage

        # Print the movie details and similarity percentage
        print(f"\nTitle: {movie['title']}")
        print(f"Director: {movie['director']}")
        print(f"Cast: {movie['cast']}")
        print(f"Genres: {movie['listed_in']}")
        print(f"Rating: {movie['rating']}")
        print(f"Description: {movie['description']}")
        print(f"Similarity: {similarity_percentage:.2f}%")

# Example usage:
recommend_similar_movies_by_name("Kota Factory", df, combined_embeddings)


Movies similar to 'Kota Factory':

Title: Rishta.com
Director: Unknown
Cast: Shruti Seth, Kavi Shastri, Siddhant Karnick, Kavin Dave
Genres: International TV Shows, Romantic TV Shows, TV Comedies
Rating: TV-14
Description: Partners at an Indian matrimonial agency face endlessly challenging and often impossible demands as they help clients make the perfect match.
Similarity: 74.14%

Title: Meteor Garden
Director: Unknown
Cast: Shen Yue, Dylan Wang, Darren Chen, Connor Liang, Caesar Wu
Genres: International TV Shows, Romantic TV Shows, TV Dramas
Rating: TV-14
Description: Dong Shancai is determined to excel at her dream university, where she encounters an elite clique of dashing, popular high-achievers – and finds love.
Similarity: 73.70%

Title: Somewhere Only We Know
Director: Unknown
Cast: Zhang Chao, Li Tingting, Chen Bohao, Sun Jialing, Zhang Zhang, Li Jiuxiao, Deng Yuli, Li Huan, Fang Wenqiang, Gong Rui
Genres: International TV Shows, Romantic TV Shows, TV Comedies
Rating: TV-MA
De