In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('netflix_titles.csv', encoding='latin1')
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,...,,,,,,,,,,
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,...,,,,,,,,,,
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,...,,,,,,,,,,
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,...,,,,,,,,,,
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,...,,,,,,,,,,
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,...,,,,,,,,,,
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,...,,,,,,,,,,
8807,s8808,TV Show,Parasyte: The Grey,Yeon Sang-ho,"Shin Hyun-been, Jeon Yeo-bin, Goo Kyo-hwan",South Korea,"April 5, 2024",2024,TV-MA,1 Season,...,,,,,,,,,,


In [3]:
df = df[['show_id', 'type', 'title', 'rating']].fillna(0)

In [4]:
df.isna().sum()

show_id    0
type       0
title      0
rating     0
dtype: int64

In [5]:
df

Unnamed: 0,show_id,type,title,rating
0,s1,Movie,Dick Johnson Is Dead,PG-13
1,s2,TV Show,Blood & Water,TV-MA
2,s3,TV Show,Ganglands,TV-MA
3,s4,TV Show,Jailbirds New Orleans,TV-MA
4,s5,TV Show,Kota Factory,TV-MA
...,...,...,...,...
8804,s8805,Movie,Zombieland,R
8805,s8806,Movie,Zoom,PG
8806,s8807,Movie,Zubaan,TV-14
8807,s8808,TV Show,Parasyte: The Grey,TV-MA


In [6]:
import random

user_ids = [f'u{i}' for i in range(1, 6)]

In [7]:
interaction_rows = []

for user in user_ids:
    
    watched = df.sample(random.randint(5, 10))  
    for _, row in watched.iterrows():
        interaction_rows.append({
            'user_id': user,
            'show_id': row['show_id'],
            'type': row['type'],
            'title': row['title'],
            'rating': row['rating'],
            'rating_given': random.randint(3, 5)  
        })

In [8]:
interaction_df = pd.DataFrame(interaction_rows)
print(f"Number of interaction rows: {len(interaction_df)}")
print(interaction_df.head())

Number of interaction rows: 35
  user_id show_id     type              title rating  rating_given
0      u1   s8444    Movie     The One I Love      R             5
1      u1    s853    Movie  99 Songs (Telugu)  TV-14             5
2      u1   s3329  TV Show           Ad Vitam  TV-MA             5
3      u1   s1655  TV Show          Before 30  TV-MA             3
4      u1   s3959  TV Show           Ultraman  TV-14             3


In [9]:
df1 = interaction_df

In [10]:
df1.head()

Unnamed: 0,user_id,show_id,type,title,rating,rating_given
0,u1,s8444,Movie,The One I Love,R,5
1,u1,s853,Movie,99 Songs (Telugu),TV-14,5
2,u1,s3329,TV Show,Ad Vitam,TV-MA,5
3,u1,s1655,TV Show,Before 30,TV-MA,3
4,u1,s3959,TV Show,Ultraman,TV-14,3


In [11]:
num_rows = len(df)
print(f"Number of rows: {num_rows}")


Number of rows: 8809


In [None]:
user_items_matrix = df1.pivot_table(
    index='user_id', 
    columns='show_id', 
    values='rating_given',
    fill_value=0  # Fill missing interactions with 0
)

# Transpose to get Item-User matrix (rows=items, columns=users)
item_user_matrix = user_items_matrix.T  # Key step for item-item CF
print(item_user_matrix.head())

user_id   u1   u2   u3   u4   u5
show_id                         
s1332    0.0  0.0  0.0  4.0  0.0
s1516    0.0  0.0  0.0  0.0  3.0
s157     0.0  0.0  0.0  5.0  0.0
s1655    3.0  0.0  0.0  0.0  0.0
s1850    0.0  0.0  0.0  0.0  5.0


In [13]:
user_items_matrix = user_items_matrix.fillna(0)
user_items_matrix.head()

show_id,s1332,s1516,s157,s1655,s1850,s1946,s1968,s2633,s2787,s2916,...,s7164,s7621,s7637,s7943,s8057,s822,s8444,s8515,s853,s8651
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u1,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0
u2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,4.0,4.0,0.0,0.0,0.0,0.0
u3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
u4,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
u5,0.0,3.0,0.0,0.0,5.0,5.0,5.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# def find_similar_user(user_id, user_item_matrix, n_users = 5):
#     similarities = cosine_similarity(user_item_matrix)
#     similarities_df = pd.DataFrame(similarities, index=user_item_matrix.index, columns=user_item_matrix.index)

#     similar_users = similarities_df[user_id].sort_values(ascending=False).index[1:n_users + 1]
#     return similar_users

In [None]:
def find_similar_items(item_id, item_similarities_df, n_items=5):
    """
    Finds items similar to a target item based on precomputed item-item cosine similarities.
    
    Args:
        item_id (str): Target item ID (e.g., 's8444').
        item_similarities_df (pd.DataFrame): Precomputed item-item similarity matrix.
        n_items (int): Number of similar items to return.
    
    Returns:
        pd.Index: List of similar item IDs.
    """
    if item_id not in item_similarities_df.index:
        raise ValueError(f"Item ID {item_id} not found in the similarity matrix.")
    
    # Get top-N most similar items (excluding self)
    similar_items = item_similarities_df[item_id].sort_values(ascending=False).index[1:n_items + 1]
    return similar_items

In [23]:
# Example: Find items similar to 's8444' (The One I Love)
similar_items = find_similar_items('s8444', item_similarities_df, n_items=5)
print(similar_items)

NameError: name 'find_similar_items' is not defined

In [16]:
def recommend_movies(user_id, user_items_matrix, n_recom=5):

    similar_users = find_similar_user(user_id, user_items_matrix)
    

    user_ratings = user_items_matrix.loc[user_id]
    unseen_movies = user_ratings[user_ratings == 0].index  
    
   
    similar_users_ratings = user_items_matrix.loc[similar_users, unseen_movies]
    recommended_scores = similar_users_ratings.mean(axis=0)
    

    return recommended_scores.sort_values(ascending=False).head(n_recom)


In [17]:
recommendations = recommend_movies('u1', user_items_matrix, n_recom=5)
print(recommendations)


show_id
s157     1.25
s1946    1.25
s1850    1.25
s2916    1.25
s2633    1.25
dtype: float64


In [18]:
# df1[df1['show_id'] == 's2595']

In [19]:
import joblib


joblib.dump(user_items_matrix, 'user_items_matrix.pkl')
interaction_df.to_csv('user_interactions.csv', index=False)

In [20]:
similarities = cosine_similarity(user_items_matrix)
joblib.dump(similarities, 'similarity_matrix.pkl')

['similarity_matrix.pkl']

In [21]:
user_items_matrix = joblib.load('user_items_matrix.pkl')

similarities = joblib.load('similarity_matrix.pkl')

df1 = pd.read_csv("user_interactions.csv")
