In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Main table for this notebook: later cells read its movieId, userId, title,
# rating and genres columns. The path is relative to the working directory.
finalDf = pd.read_csv("dataset.csv")

In [3]:
# Movie lookup table; in the cells shown here it is only used to translate
# movieId values into titles. The path is relative to the working directory.
movies_data = pd.read_csv('dataset/movies.csv')

In [4]:
# ---------------- IMDB WEIGHTED RATING ----------------
# Per-movie rating summary: v = number of ratings, R = mean rating.
movie_stats = (
    finalDf
    .groupby(["movieId", "title"])
    .agg(v=("rating", "count"), R=("rating", "mean"))
    .reset_index()
)

# C: mean of the per-movie mean ratings.
# m: 90th percentile of the per-movie rating counts.
C = movie_stats["R"].mean()
m = movie_stats["v"].quantile(0.90)

# WR = v/(v+m) * R + m/(v+m) * C, so movies with few ratings are pulled towards C.
movie_stats = movie_stats.assign(
    weighted_rating=lambda stats: (
        (stats["v"] / (stats["v"] + m)) * stats["R"]
        + (m / (stats["v"] + m)) * C
    )
)

In [5]:
# ---------------- MOVIE x USER RATING MATRIX ----------------
df = finalDf.drop(["title"], axis=1)

# Categorical encoding assigns each distinct id a dense code 0..n-1; the codes
# become the row (movie) and column (user) positions of the sparse matrix.
movie_cat = df["movieId"].astype("category")
user_cat = df["userId"].astype("category")
df["movie_index"] = movie_cat.cat.codes
df["user_index"] = user_cat.cat.codes

# Rows are movies, columns are users, values are ratings.
# NOTE(review): repeated (movie, user) pairs would be summed by the sparse
# constructor -- confirm the ratings table has no such duplicates.
movie_user_sparse = csr_matrix(
    (df["rating"], (df["movie_index"], df["user_index"])),
    shape=(len(movie_cat.cat.categories), len(user_cat.cat.categories)),
)

# Build the lookup tables from the category list (one entry per distinct
# movie) instead of zipping over every rating row: same mappings, far fewer
# Python-level iterations.
index_to_movieId = dict(enumerate(movie_cat.cat.categories))
movieId_to_index = {movie_id: idx for idx, movie_id in index_to_movieId.items()}

In [6]:
# ---------------- GENRE TF-IDF MATRIX ----------------
# One row per distinct (movieId, genres) pair, renumbered from 0 so that the
# frame's index lines up with the rows of the TF-IDF matrix built from it.
movie_genres = (
    finalDf.loc[:, ["movieId", "genres"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# The token pattern keeps hyphenated words together as a single token.
tfidf = TfidfVectorizer(token_pattern=r"[A-Za-z\-]+")
genre_matrix = tfidf.fit_transform(movie_genres["genres"])

# movieId -> row position in movie_genres / genre_matrix.
genre_id_to_row = {
    movie_id: row
    for row, movie_id in zip(movie_genres.index, movie_genres["movieId"])
}

In [13]:
# Sanity check: printing a scipy sparse matrix shows its summary line followed
# by a truncated listing of stored (row, col) -> value entries.
# NOTE(review): this cell's execution count (13) is out of sequence with its
# neighbours; re-run the notebook top to bottom before sharing.
print(movie_user_sparse)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 31944706 stored elements and shape (77877, 200948)>
  Coords	Values
  (0, 9)	2.5
  (0, 10)	3.0
  (0, 16)	4.0
  (0, 18)	3.0
  (0, 19)	5.0
  (0, 22)	3.0
  (0, 23)	4.0
  (0, 27)	4.0
  (0, 32)	5.0
  (0, 33)	4.0
  (0, 35)	3.0
  (0, 36)	1.0
  (0, 42)	5.0
  (0, 45)	4.0
  (0, 50)	3.5
  (0, 53)	4.0
  (0, 56)	4.0
  (0, 58)	4.0
  (0, 59)	3.0
  (0, 61)	5.0
  (0, 63)	5.0
  (0, 64)	2.0
  (0, 66)	4.0
  (0, 69)	4.0
  (0, 73)	4.5
  :	:
  (77854, 139165)	2.0
  (77855, 109257)	2.0
  (77856, 41694)	4.5
  (77857, 41225)	4.0
  (77858, 108411)	2.5
  (77858, 165363)	2.0
  (77859, 105978)	2.0
  (77860, 105978)	3.0
  (77861, 105978)	1.0
  (77862, 105978)	2.5
  (77863, 105978)	4.5
  (77864, 105978)	4.0
  (77865, 105978)	4.0
  (77866, 34364)	3.0
  (77867, 111090)	3.0
  (77868, 98334)	3.0
  (77868, 165363)	2.0
  (77869, 15501)	2.0
  (77870, 108411)	3.0
  (77871, 132836)	3.5
  (77872, 50684)	4.0
  (77873, 165363)	1.5
  (77874, 150811)	4.0
  (77875, 91096

In [7]:
def get_cf_similar_movies(movie_id, top_k=80):
    """Return the movies whose rating vectors are most similar to ``movie_id``.

    Similarity is the cosine between the query movie's row of
    ``movie_user_sparse`` and every other row.

    Parameters
    ----------
    movie_id
        A key of ``movieId_to_index``.
    top_k : int, default 80
        Maximum number of neighbours to return.

    Returns
    -------
    list of (movieId, float)
        Neighbours ordered from most to least similar. Empty when the movie
        is unknown, has no stored ratings, or ``top_k`` is not positive.
        The query movie itself is never included.
    """
    if top_k <= 0 or movie_id not in movieId_to_index:
        return []

    idx = movieId_to_index[movie_id]
    vec = movie_user_sparse[idx]

    if vec.nnz == 0:
        return []

    sim = cosine_similarity(vec, movie_user_sparse).flatten()

    # Mask the query row explicitly. Dropping "the last element of argsort"
    # is only correct when the query is the unique maximum; any other movie
    # with a proportional rating vector also scores 1.0, and then the wrong
    # row could be dropped while the query itself is returned.
    sim[idx] = -np.inf

    k = min(top_k, sim.size - 1)
    if k <= 0:
        return []
    top_idx = np.argsort(sim)[::-1][:k]

    return [(index_to_movieId[int(i)], float(sim[i])) for i in top_idx]

# ---------------- GENRE SIMILARITY ----------------
def get_genre_similar_movies(movie_id, top_k=80):
    """Return the movies whose genre TF-IDF vectors are closest to ``movie_id``.

    Parameters
    ----------
    movie_id
        A key of ``genre_id_to_row``.
    top_k : int, default 80
        Maximum number of neighbours to return.

    Returns
    -------
    list of (movieId, float)
        Neighbours ordered from most to least similar by cosine similarity
        over ``genre_matrix``. Empty when the movie is unknown or ``top_k``
        is not positive. The query movie itself is never included. Movies
        with identical genre vectors tie, and which of the tied movies make
        the cut is decided by the sort order, not by any further criterion.
    """
    if top_k <= 0 or movie_id not in genre_id_to_row:
        return []

    row = genre_id_to_row[movie_id]
    sim = cosine_similarity(genre_matrix[row], genre_matrix).flatten()

    # Mask the query row explicitly: movies sharing the same genre string all
    # score 1.0, so the query is generally not the unique maximum and
    # trimming the last argsort element could drop a real neighbour instead.
    sim[row] = -np.inf

    k = min(top_k, sim.size - 1)
    if k <= 0:
        return []
    top_idx = np.argsort(sim)[::-1][:k]

    # Look the ids up from the column once rather than building a row Series
    # per neighbour.
    neighbour_ids = movie_genres["movieId"].iloc[top_idx].tolist()
    return [(mid, float(sim[i])) for mid, i in zip(neighbour_ids, top_idx)]

# ---------------- HYBRID RECOMMENDER ----------------
def recommend_genre_aware_hybrid(movie_ids, top_n=5, alpha=0.5, beta=0.3, gamma=0.2):
    """Recommend movies by blending rating-based and genre-based similarity.

    For every seed movie, each neighbour returned by
    ``get_cf_similar_movies`` adds ``alpha * similarity`` to that neighbour's
    score and each neighbour returned by ``get_genre_similar_movies`` adds
    ``beta * similarity``. Seed movies are never scored. The final score is
    the accumulated score plus ``gamma * weighted_rating``.

    Parameters
    ----------
    movie_ids : iterable
        Ids of the seed (already watched) movies. Any iterable is accepted,
        including one-shot iterators.
    top_n : int, default 5
        Number of rows to return.
    alpha, beta, gamma : float
        Weights of the collaborative similarity, the genre similarity and the
        weighted rating respectively. Note that ``weighted_rating`` is on the
        rating scale while similarities are cosine values, so ``gamma``
        multiplies a larger number than ``alpha`` and ``beta`` do.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` rows with the highest ``final_score``, with columns
        ``movieId``, ``title``, ``weighted_rating`` and ``final_score``.
        When no seed produces any neighbour, the ``top_n`` rows of
        ``movie_stats`` by ``weighted_rating`` are returned instead; that
        fallback frame has no ``final_score`` column.
    """
    # Materialise once so a generator can be both iterated and tested for
    # membership, and use a set so the membership test is not a linear scan.
    movie_ids = list(movie_ids)
    watched = set(movie_ids)

    scores = {}
    for mid in movie_ids:
        weighted_sources = (
            (alpha, get_cf_similar_movies(mid)),      # CF similarity
            (beta, get_genre_similar_movies(mid)),    # Genre similarity
        )
        for weight, neighbours in weighted_sources:
            for m, s in neighbours:
                if m not in watched:
                    scores[m] = scores.get(m, 0) + weight * s

    if not scores:
        # No similarity signal at all: fall back to the best-rated movies.
        return movie_stats.sort_values(
            by="weighted_rating", ascending=False
        )[["movieId", "title", "weighted_rating"]].head(top_n)

    # Convert to DataFrame
    score_df = pd.DataFrame(list(scores.items()), columns=["movieId", "hybrid_score"])

    # Merge IMDB confidence (inner join: candidates missing from movie_stats drop out)
    rec = movie_stats.merge(score_df, on="movieId")

    rec["final_score"] = rec["hybrid_score"] + gamma * rec["weighted_rating"]

    rec = rec.sort_values("final_score", ascending=False)

    return rec[["movieId", "title", "weighted_rating", "final_score"]].head(top_n)

In [8]:
# ---------------- RUN ----------------
# Watched movies used as seeds for the recommender.
ids = [
    2959,  # Fight Club (1999)
    2985,  # RoboCop (1987)
    3197,  # Presidio, The (1988)
]

In [9]:

# Resolve each watched movieId to its title for the header line.
# A single indexed lookup replaces re-filtering the whole frame per id, and an
# id that is missing from movies_data yields a placeholder instead of raising
# IndexError. drop_duplicates keeps the first row per movieId, matching the
# previous "first match" behaviour.
title_by_id = movies_data.drop_duplicates("movieId").set_index("movieId")["title"]
titles = [title_by_id.get(i, f"<unknown movieId {i}>") for i in ids]
print(f"\n🎬 Recommendations for movies: {titles}\n")


🎬 Recommendations for movies: ['Fight Club (1999)', 'RoboCop (1987)', 'Presidio, The (1988)']



In [10]:
# Score candidates for the watched list, using the function's default top_n
# and weights.
recommendations = recommend_genre_aware_hybrid(movie_ids=ids)

In [11]:
# Bare expression so the notebook renders the recommendations DataFrame.
recommendations

Unnamed: 0,movieId,title,weighted_rating,final_score
113,2571,"Matrix, The (1999)",4.152785,1.366993
7,293,Léon: The Professional (a.k.a. The Professiona...,4.08605,1.366759
34,1206,"Clockwork Orange, A (1971)",3.97756,1.301341
31,1196,Star Wars: Episode V - The Empire Strikes Back...,4.125716,1.289158
37,1214,Alien (1979),4.061428,1.272615
