In [39]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display
from threading import Timer


In [40]:

# Load the movie catalogue from CSV.
# The raw ratings CSV load below is left disabled in this cell:
# ratings = pd.read_csv("Netflix_Dataset_Rating.csv")
movies = pd.read_csv("Netflix_Dataset_Movie.csv")

In [41]:


# Write to a Parquet file
# ratings.to_parquet("ratings.parquet", index=False)

In [42]:
# Load the ratings table from its Parquet cache. When the cache file is
# missing (e.g. a fresh checkout), rebuild it from the raw CSV so the notebook
# still runs top to bottom, then store the cache for the next run.
try:
    ratings = pd.read_parquet("ratings.parquet")
except FileNotFoundError:
    ratings = pd.read_csv("Netflix_Dataset_Rating.csv")
    ratings.to_parquet("ratings.parquet", index=False)

In [43]:


# Function to clean movie titles
# def clean_title(Name):
#     return re.sub("[^a-zA-Z0-9]", " ",Name.lower())
def clean_title(Name):
    """Normalise a movie title for text matching.

    Every character other than ASCII letters, digits and spaces is removed,
    the result is lower-cased, and leading/trailing whitespace is stripped.

    Parameters
    ----------
    Name : str or scalar
        Raw title. Missing values (``None``, ``NaN``, ``pd.NA``) yield an
        empty string; any other non-string scalar is converted with ``str``.

    Returns
    -------
    str
        The cleaned title.
    """
    if not isinstance(Name, str):
        # re.sub raises TypeError on non-strings, e.g. the NaN pandas uses
        # for an empty CSV field, so normalise those before matching.
        if pd.isna(Name):
            return ""
        Name = str(Name)
    return re.sub("[^a-zA-Z0-9 ]", "", Name).lower().strip()


# Add a "clean_title" column holding the normalised form of every movie name.
movies["clean_title"] = movies["Name"].map(clean_title)


In [44]:
# vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')


In [45]:
# TF-IDF over the cleaned titles using unigrams and bigrams, with English
# stop words removed. `tfidf` holds one vector per row of `movies`.
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_title"])


In [46]:
# Create Term Frequency matrix using TF-IDF
# vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# tfidf = vectorizer.fit_transform(movies["clean_title"])


In [47]:


# def search(Name):
#     Name=clean_title(Name)
#     query_vec=vectorizer.transform([Name])
#     similarity=cosine_similarity(query_vec,tfidf).flatten()
#     indices=np.argpartition(similarity,-5)[-5:]
#     results=movies.iloc[indices][::-1]
#     return results


In [48]:
def search(Name, top_n=5):
    """Return the movies whose cleaned titles best match ``Name``.

    The query is normalised with ``clean_title``, transformed with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    Name : str
        Free-text title to look up.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows of ``movies``, most similar first. Rows with a
        similarity of zero are dropped, so the frame is empty (with the
        columns of ``movies``) when nothing matches. Callers should check
        ``.empty`` before indexing into the result.
    """
    query_vec = vectorizer.transform([clean_title(Name)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Best candidates first; a non-positive top_n simply yields no rows.
    ranked = np.argsort(similarity)[::-1][:max(top_n, 0)]

    # A zero score means the query shares no term with that title. Without
    # this filter an unmatched query would return arbitrary movies.
    matched = ranked[similarity[ranked] > 0]
    return movies.iloc[matched]




In [49]:
# def search(Name):
#     Name = clean_title(Name)
#     print(f"Searching for: {Name}")
#     query_vec = vectorizer.transform([Name])
#     similarity = cosine_similarity(query_vec, tfidf).flatten()
#     print(f"Similarity scores: {similarity}")
#     indices = np.argsort(similarity)[::-1][:5]
#     print(f"Top indices: {indices}")
#     results = movies.iloc[indices]
#     print(f"Results:\n{results}")
#     if results.empty:
#         return pd.DataFrame(columns=["Movie_ID", "Year", "Name", "clean_title"])
#     return results


In [50]:
# data = pd.merge(ratings, movies, on="Movie_ID")

In [51]:
# Output area that the search handler writes its results into.
movie_list = widgets.Output()

# Editable text box for the title query, pre-filled with "Character".
movie_input = widgets.Text(description="Movie Title: ", value="Character", disabled=False)

In [52]:
def on_type(data):
    """Refresh ``movie_list`` with search results for the text just typed.

    ``data`` is the change payload handed to the observer; only its ``"new"``
    entry (the current text) is read. The output area is always cleared, and
    results are shown only when the text is longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            # Too short to search: leave the output area empty.
            return
        display(search(typed))

In [53]:
# Call on_type whenever the text box's `value` trait changes.
# NOTE(review): the display call below is commented out, so nothing in this
# cell renders the widgets.
movie_input.observe(on_type, names='value')
# display(movie_input, movie_list)


In [54]:
# import time

# class Debounce:
#     def __init__(self, delay):
#         self.delay = delay
#         self.timer = None
    
#     def debounce(self, func):
#         def wrapper(*args, **kwargs):
#             if self.timer is not None:
#                 self.timer.cancel()
#             self.timer = time.Timer(self.delay, lambda: func(*args, **kwargs))
#             self.timer.start()
#         return wrapper

# debounce = Debounce(0.5).debounce

# @debounce
# def on_type(data):
#     with movie_list:
#         movie_list.clear_output()
#         title = data["new"]
#         if len(title) > 5:
#             results = search(title)
#             display(results)

# movie_input.observe(on_type, names='value')


In [55]:
from functools import wraps
from threading import Timer


class Debouncer:
    """Postpone calls until ``delay`` seconds pass without a newer call.

    The pending timer is stored on the instance, so every function wrapped by
    the same ``Debouncer`` shares it: a call to any wrapper cancels whatever
    call is still waiting.
    """

    def __init__(self, delay):
        # Delay in seconds, passed to threading.Timer as its interval.
        self.delay = delay
        # Timer of the most recently scheduled call; None until the first call.
        self.timer = None

    def debounce(self, func):
        """Return a wrapper that schedules ``func`` after ``self.delay`` seconds.

        Each call to the wrapper cancels the previously scheduled call (if
        any) and schedules a new one with the latest arguments. The wrapper
        returns ``None`` immediately; ``func`` runs later on the timer's
        thread and its return value is discarded.
        """
        @wraps(func)  # keep func's name and docstring on the wrapper
        def wrapper(*args, **kwargs):
            if self.timer is not None:
                self.timer.cancel()
            self.timer = Timer(self.delay, func, args=args, kwargs=kwargs)
            self.timer.start()
        return wrapper

# Decorator that delays the wrapped handler by 0.5 s after the latest call.
debouncer = Debouncer(0.5).debounce


# NOTE(review): no visible cell passes this debounced handler to `observe`;
# the registration on movie_input was made before this definition.
@debouncer
def on_type(data):
    """Debounced search handler: show results for the latest typed text.

    Reads the current text from ``data["new"]``, clears ``movie_list`` and
    displays the search results when the text is longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


In [56]:
# Movie used for the step-by-step recommendation walkthrough below.
movie_id = 3

In [57]:
# Users who rated the chosen movie above 3.
similar_users = ratings.loc[
    (ratings["Rating"] > 3) & (ratings["Movie_ID"] == movie_id), "User_ID"
].unique()

# Movie IDs of every rating above 3 given by those users (one entry per rating).
similar_users_recs = ratings.loc[
    (ratings["Rating"] > 3) & ratings["User_ID"].isin(similar_users), "Movie_ID"
]

In [58]:
# Share of the similar users who liked each movie; keep movies above 10%.
similar_users_recs = similar_users_recs.value_counts().div(len(similar_users))
similar_users_recs = similar_users_recs.loc[similar_users_recs.gt(0.1)]


In [59]:
similar_users_recs

Movie_ID
3       1.000000
571     0.667389
2862    0.602384
2452    0.587216
3371    0.576381
          ...   
1395    0.101842
2152    0.101842
468     0.100758
2149    0.100758
989     0.100758
Name: count, Length: 265, dtype: float64

In [60]:
# All ratings above 3 for the candidate movies, from any user.
all_users = ratings[
    (ratings["Rating"] > 3) & ratings["Movie_ID"].isin(similar_users_recs.index)
]

# Share of those users who liked each candidate movie.
all_users_recs = all_users["Movie_ID"].value_counts().div(
    len(all_users["User_ID"].unique())
)

In [61]:
all_users_recs

Movie_ID
1905    0.643211
2452    0.618919
4306    0.615726
3962    0.573403
2862    0.570216
          ...   
3404    0.008827
895     0.008534
4450    0.008388
3410    0.006763
3       0.006436
Name: count, Length: 265, dtype: float64

In [62]:
# Put both shares side by side, aligned on movie ID.
rec_percentages = pd.concat(
    [similar_users_recs, all_users_recs], axis="columns"
).set_axis(["similar", "all"], axis="columns")

In [63]:
# Score = how much more often similar users liked a movie than users overall;
# highest scores first.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [64]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,1.000000,0.006436,155.385699
4450,0.187432,0.008388,22.345574
3301,0.192849,0.009273,20.795981
3410,0.106176,0.006763,15.698761
2350,0.134345,0.009329,14.400468
...,...,...,...
3825,0.106176,0.229492,0.462654
1428,0.117010,0.253059,0.462381
3860,0.120260,0.317631,0.378615
1962,0.138678,0.368670,0.376158


In [65]:
def find_similar_movies(movie_id, like_threshold=3, min_share=0.1, top_n=10):
    """Recommend movies liked unusually often by users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``like_threshold``. For every movie liked by more than ``min_share`` of
    the users who liked ``movie_id``, the score is that share divided by the
    share of users overall (among those who liked any candidate movie) who
    liked it. The query movie is itself a candidate and is not filtered out.

    Parameters
    ----------
    movie_id : int
        ``Movie_ID`` of the movie to base recommendations on.
    like_threshold : number, default 3
        Ratings strictly above this value count as likes.
    min_share : float, default 0.1
        Minimum fraction of similar users that must like a candidate movie.
    top_n : int, default 10
        Number of top-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Movie_ID``, ``Year`` and ``Name`` for the highest-scoring
        movies, best first. Empty when nobody liked ``movie_id``.
    """
    # Filter the likes once; every later step works on this subset only.
    liked = ratings[ratings["Rating"] > like_threshold]

    similar_users = liked.loc[liked["Movie_ID"] == movie_id, "User_ID"].unique()
    if len(similar_users) == 0:
        # No audience to learn from: skip the divide-by-zero arithmetic.
        return movies.iloc[0:0][["Movie_ID", "Year", "Name"]]

    # Share of the similar users who liked each movie.
    similar_share = (
        liked.loc[liked["User_ID"].isin(similar_users), "Movie_ID"].value_counts()
        / len(similar_users)
    )
    similar_share = similar_share[similar_share > min_share]

    # Share of all users (who liked any candidate) who liked each candidate.
    candidates = liked[liked["Movie_ID"].isin(similar_share.index)]
    overall_share = (
        candidates["Movie_ID"].value_counts()
        / len(candidates["User_ID"].unique())
    )

    rec = pd.concat([similar_share, overall_share], axis=1, keys=["similar", "all"])
    rec["score"] = rec["similar"] / rec["all"]
    top = rec.sort_values("score", ascending=False).head(top_n)

    # Name the index explicitly so the join key does not depend on how this
    # pandas version labels the value_counts() index.
    top = top.rename_axis("Movie_ID").reset_index()
    return top.merge(movies, on="Movie_ID")[["Movie_ID", "Year", "Name"]]


In [66]:
# Editable, initially empty text box for the recommendation query.
movie_name_input = widgets.Text(description="Movie Title", disabled=False, value="")



In [67]:
# Output area that the recommendation handler writes into.
recommendation_list = widgets.Output()

In [68]:
def on_type(data):
    """Show recommendations for the best title match of the typed text.

    ``data`` is the change payload handed to the observer; only its ``"new"``
    entry (the current text) is read. The output area is always cleared.
    When the text is longer than five characters, the top ``search`` result
    is passed to ``find_similar_movies`` and the recommendations are
    displayed. Nothing is shown when the search returns no rows.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        Name = data["new"]
        if len(Name) <= 5:
            return
        results = search(Name)
        if results.empty:
            # search() can return an empty frame; indexing row 0 of it would
            # raise IndexError, so leave the output cleared instead.
            return
        movie_id = results["Movie_ID"].iloc[0]
        display(find_similar_movies(movie_id))

# Re-run the handler whenever the text box's `value` trait changes.
movie_name_input.observe(on_type, names="value")
# display(movie_name_input,recommendation_list)




In [69]:
# Prompt for a movie title on standard input.
movie = input("Enter the movie name: ")

In [70]:
# Show the closest catalogue matches for the title entered above.
search(movie)


Unnamed: 0,Movie_ID,Year,Name,clean_title
7464,7465,2004,The Life,the life
10395,10396,1999,After Life,after life
15786,15787,2003,My Life Without Me,my life without me
13219,13220,2000,Once in the Life,once in the life
14592,14593,1993,My Life,my life
