In [1]:
import pandas as pd
import numpy as np

In [23]:
# from sklearn.metrics.pairwise import cosine_similarity
movies = pd.read_csv("dataset/movies.csv")

In [8]:

ratings = pd.read_csv("dataset/ratings.csv")
tags = pd.read_csv("dataset/tags.csv")

In [3]:
tags['timestamp'] = pd.to_datetime(tags['timestamp'], unit='s').dt.strftime('%Y-%m-%d')

In [4]:
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,2020-03-01
1,22,79592,misogyny,2020-02-12
2,22,247150,acrophobia,2021-05-31
3,34,2174,music,2009-08-09
4,34,2174,weird,2009-08-09


In [4]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s').dt.strftime('%Y-%m-%d')

In [7]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,1999-12-03
1,1,25,1.0,1999-12-03
2,1,29,2.0,1999-11-22
3,1,30,5.0,1999-12-03
4,1,32,5.0,1999-11-22


In [7]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),adventure animation children comedy fantasy
1,2,Jumanji (1995),adventure children fantasy
2,3,Grumpier Old Men (1995),comedy romance
3,4,Waiting to Exhale (1995),comedy drama romance
4,5,Father of the Bride Part II (1995),comedy


In [10]:
tags['tag'].value_counts()

tag
sci-fi                 10996
atmospheric             9589
action                  8473
comedy                  8139
funny                   7467
                       ...  
eyeless woman              1
dwarf woman                1
biid                       1
disgusting pig ears        1
Maxim Gorky                1
Name: count, Length: 140979, dtype: int64

In [11]:
print(tags.columns)
print(ratings.columns)
print(movies.columns)

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
Index(['movieId', 'title', 'genres'], dtype='object')


In [8]:
finalDf = ratings.merge(movies, on='movieId')

In [28]:
finalDf.drop(['timestamp','genres'],inplace=True,axis=1)

KeyError: "['timestamp', 'genres'] not found in axis"

In [14]:
finalDf.head()

Unnamed: 0,userId,movieId,rating,title
0,1,17,4.0,Sense and Sensibility (1995)
1,1,25,1.0,Leaving Las Vegas (1995)
2,1,29,2.0,"City of Lost Children, The (Cité des enfants p..."
3,1,30,5.0,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
4,1,32,5.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)


In [29]:
finalDf.to_csv('final_df.csv', index=False)

In [15]:
# Compute average rating & vote count per movie
movie_stats = finalDf.groupby(['movieId', 'title']).agg(
    v = ('rating','count'),
    R = ('rating','mean')
).reset_index()

In [16]:
movie_stats.head()

Unnamed: 0,movieId,title,v,R
0,1,Toy Story (1995),68997,3.897438
1,2,Jumanji (1995),28904,3.275758
2,3,Grumpier Old Men (1995),13134,3.139447
3,4,Waiting to Exhale (1995),2806,2.845331
4,5,Father of the Bride Part II (1995),13154,3.059602


In [17]:
# Global mean rating across all movies
C = movie_stats['R'].mean()

# Minimum votes required to be considered (tune this)
m = movie_stats['v'].quantile(0.90)   # top 10% most voted movies
# You can also use fixed numbers like:  m = 50 / 100 etc.

# Apply IMDB Weighted Rating formula
# Weighted Rating = (v/(v+m))*R + (m/(v+m))*C
movie_stats['weighted_rating'] = (
    (movie_stats['v']/(movie_stats['v']+m)) * movie_stats['R']
    + (m/(movie_stats['v']+m)) * C
)

In [18]:
movie_stats.shape

(84432, 5)

In [19]:
movie_stats.head()

Unnamed: 0,movieId,title,v,R,weighted_rating
0,1,Toy Story (1995),68997,3.897438,3.89423
1,2,Jumanji (1995),28904,3.275758,3.273447
2,3,Grumpier Old Men (1995),13134,3.139447,3.136948
3,4,Waiting to Exhale (1995),2806,2.845331,2.858347
4,5,Father of the Bride Part II (1995),13154,3.059602,3.058589


In [20]:
df = finalDf.drop(['title'],axis=1)

In [21]:
movie_cat = df['movieId'].astype('category')
user_cat  = df['userId'].astype('category')

df['movie_index'] = movie_cat.cat.codes
df['user_index']  = user_cat.cat.codes



In [22]:
from scipy.sparse import csr_matrix

movie_user_sparse = csr_matrix(
    (df['rating'], (df['movie_index'], df['user_index'])),
    shape=(df['movie_index'].nunique(), df['user_index'].nunique())
)


In [23]:
movieId_to_index = dict(zip(df['movieId'], df['movie_index']))
index_to_movieId = dict(zip(df['movie_index'], df['movieId']))


In [24]:
print(movie_user_sparse)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 32000204 stored elements and shape (84432, 200948)>
  Coords	Values
  (0, 9)	2.5
  (0, 10)	3.0
  (0, 16)	4.0
  (0, 18)	3.0
  (0, 19)	5.0
  (0, 22)	3.0
  (0, 23)	4.0
  (0, 27)	4.0
  (0, 32)	5.0
  (0, 33)	4.0
  (0, 35)	3.0
  (0, 36)	1.0
  (0, 42)	5.0
  (0, 45)	4.0
  (0, 50)	3.5
  (0, 53)	4.0
  (0, 56)	4.0
  (0, 58)	4.0
  (0, 59)	3.0
  (0, 61)	5.0
  (0, 63)	5.0
  (0, 64)	2.0
  (0, 66)	4.0
  (0, 69)	4.0
  (0, 73)	4.5
  :	:
  (84409, 109257)	2.0
  (84410, 41694)	4.5
  (84411, 41225)	4.0
  (84412, 108411)	2.5
  (84412, 165363)	2.0
  (84413, 105978)	2.0
  (84414, 105978)	3.0
  (84415, 105978)	1.0
  (84416, 105978)	2.5
  (84417, 105978)	4.5
  (84418, 105978)	4.0
  (84419, 105978)	4.0
  (84420, 34364)	3.0
  (84421, 111090)	3.0
  (84422, 98334)	3.0
  (84422, 165363)	2.0
  (84423, 15501)	2.0
  (84424, 169557)	1.0
  (84425, 108411)	3.0
  (84426, 132836)	3.5
  (84427, 50684)	4.0
  (84428, 165363)	1.5
  (84429, 150811)	4.0
  (84430, 91096

In [25]:
# def get_similar_movies(movie_id, top_k=20):
#     movie_index = movieId_to_index[movie_id]   # convert
#     movie_vec = movie_user_sparse[movie_index]
#     sim_scores = cosine_similarity(movie_vec, movie_user_sparse).flatten()
#     top_idx = sim_scores.argsort()[-top_k-1:-1][::-1]
#     similar_movie_ids = [index_to_movieId[i] for i in top_idx]
#     return similar_movie_ids


def get_similar_movies(movie_id, top_k=20):
    movie_index = movieId_to_index[movie_id]   # convert movieId → sparse index
    movie_vec = movie_user_sparse[movie_index] # 1 x users
    # -------- cosine numerator = dot products --------
    dot_products = movie_user_sparse @ movie_vec.T     # (n_movies x 1)
    # -------- cosine denominator = norms ------------
    movie_norms = np.sqrt(movie_user_sparse.multiply(movie_user_sparse).sum(axis=1))
    target_norm = np.sqrt(movie_vec.multiply(movie_vec).sum())
    cosine_sim = np.array(dot_products / (movie_norms * target_norm)).reshape(-1)
    # ---- get top k (excluding itself) ----
    top_idx = np.argsort(cosine_sim)[-top_k-1:-1][::-1]
    similar_movie_ids = [index_to_movieId[i] for i in top_idx]
    return similar_movie_ids


In [26]:
def recommend_for_cold_user(movie_id, top_n=10):
    similar_movies = get_similar_movies(movie_id)
    candidates = movie_stats[movie_stats['movieId'].isin(similar_movies)]
    print(candidates)
    candidates = candidates.sort_values(by='weighted_rating', ascending=False)
    return candidates[['movieId','title','v','R','weighted_rating']].head(top_n)

In [27]:
# this is for cold user i.e. for a new user with no history, or he just watched a movie - 'Interstellar'
recommend_for_cold_user(109487)

Empty DataFrame
Columns: [movieId, title, v, R, weighted_rating]
Index: []


Unnamed: 0,movieId,title,v,R,weighted_rating


In [31]:
finalDf[finalDf['movieId'] == 109487].shape


(37157, 4)