In [None]:
import pandas as pd
import numpy as np
import faiss
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pande\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

data = pd.read_csv("../Data/netflix_data_with_links")

In [None]:
data['type_encoded'] = data['type'].map({'Movie':1,'TV Show':0})

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['rating_encoded'] = le.fit_transform(data['rating'].astype(str))

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

data['country'] = data['country'].fillna('Unknown')
data['country_split'] = data['country'].str.split(',')

mlb = MultiLabelBinarizer()
country_encoded = mlb.fit_transform(data['country_split'])

country_df = pd.DataFrame(country_encoded, columns = mlb.classes_)
data = pd.concat([data,country_df], axis = 1)

In [None]:
data['listed_in'] = data['listed_in'].fillna('Unknown')
data['genre_split'] = data['listed_in'].str.split(',')
mlb_genre = MultiLabelBinarizer()
genre_encoded = mlb_genre.fit_transform(data['genre_split'])

In [None]:
def extract_minutes(x):
    try:
        if 'min' in str(x):
            return int(x.split()[0])
        else:
            return 0
    except :
        return 0
data['duration_minutes'] = data['duration'].fillna('0 min').apply(extract_minutes)

In [None]:
data['release_year'] = data['release_year'].fillna(0).astype(int)
data['release_month'] = pd.to_datetime(data['date_added'], errors = 'coerce').dt.month.fillna(0).astype(int)
data['release_decade'] = (data['release_year'] //10)*10

In [42]:
data['genre_count'] = data['genre_split'].apply(len)

In [None]:
data['country'] = data['country'].fillna('Unknown')
data['country_split'] = data['country'].str.split(',')
country_counts = data['country_split'].explode().value_counts()
top_countries = country_counts[country_counts > 50].index 

def group_countries(countries):
    return [c if c in top_countries else 'Other' for c in countries]

data['country_split'] = data['country_split'].apply(group_countries)

genre_counts = data['genre_split'].explode().value_counts()
top_genres = genre_counts[genre_counts > 50].index

def group_genres(genres):
    return [g if g in top_genres else 'Other' for g in genres]

data['genre_split'] = data['genre_split'].apply(group_genres)


In [44]:
# # # Movie Suggestion
# stop_words = set(stopwords.words('english'))

# def clean_text(text):
#     text = str(text).lower()
#     text = re.sub(r'[^a-z0-9\s]', '', text)
#     text = ' '.join([word for word in text.split() if word not in stop_words])
#     return text

# data['combined'] = (
#     (data['listed_in'].fillna('') + ' ') * 3 +
#     (data['description'].fillna('') + ' ') * 2 +
#     data['director'].fillna('') + ' ' +
#     data['cast'].fillna('') + ' ' +
#     data['country'].fillna('') + ' ' +
#     data['rating'].fillna('')
# ).apply(clean_text)


# import numpy as np
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer('all-MiniLM-L6-v2')
# embeddings = model.encode(data['combined'], show_progress_bar=True, convert_to_numpy=True)

# # Save for future use
# np.save("embeddings.npy", embeddings)

In [45]:
import numpy as np

embeddings = np.load("../model/embeddings.npy")


import faiss
import numpy as np
import pickle

# embeddings = your numpy array of shape (n_samples, embedding_dim)
faiss_index = faiss.IndexFlatIP(embeddings.shape[1])
faiss_index.add(embeddings)
with open("faiss_index.pkl", "wb") as f:
    pickle.dump(faiss_index, f)


In [46]:

embeddings = np.load("embeddings.npy")
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  
index.add(embeddings)


In [47]:
def recommend_with_percentage(title, top_n=10, type_filter=None):
    title_lower = title.lower().strip()
    if title_lower not in data['title'].str.lower().values:
        print("Movie not found")
        return []

    idx = data[data['title'].str.lower() == title_lower].index[0]
    vec = embeddings[idx].reshape(1, -1)
    faiss.normalize_L2(vec)  # normalize the query vector

    # Apply type filter
    if type_filter:
        filtered_idx = data[data['type'] == type_filter].index.to_numpy()
        filtered_embeddings = embeddings[filtered_idx]
        faiss.normalize_L2(filtered_embeddings)

        index_temp = faiss.IndexFlatIP(filtered_embeddings.shape[1])
        index_temp.add(filtered_embeddings)
        D, I = index_temp.search(vec, min(top_n, len(filtered_embeddings)))
        top_indices = filtered_idx[I[0]]
        top_scores = D[0]
    else:
        D, I = index.search(vec, min(top_n + 1, len(data)))  # +1 to skip itself
        top_indices = I[0][1:top_n+1] if len(I[0]) > 1 else []
        top_scores = D[0][1:top_n+1] if len(D[0]) > 1 else []

    recommended = [(data.iloc[i]['title'], round(score * 100, 1)) 
                   for i, score in zip(top_indices, top_scores)]
    return recommended


In [49]:
recommend_with_percentage('blood & water')

[('Kissing Game', np.float32(82.6)),
 ('Into the Night', np.float32(81.9)),
 ('Open Your Eyes', np.float32(81.1)),
 ('The Rain', np.float32(80.2)),
 ('Equinox', np.float32(79.4)),
 ('To the Lake', np.float32(79.2)),
 ('The Platform', np.float32(79.1)),
 ('Jinn', np.float32(78.8)),
 ('Good Morning, Ver√¥nica', np.float32(78.4)),
 ('Riverdale', np.float32(78.4))]