# Homework 5

## Option 4 – Collaborative Filtering

### 1 - Create the Dataset (Movies + User Ratings)

In [8]:
# Build movies list and random user–movie rating matrix

import numpy as np
import pandas as pd
import random

# Fixed seed so the randomly generated ratings are identical on every run
random.seed(42)

# Catalogue of 20 titles spanning several genres
movies = [
    "Inception",
    "The Godfather",
    "Toy Story",
    "The Dark Knight",
    "Titanic",
    "The Matrix",
    "Forrest Gump",
    "The Lion King",
    "Pulp Fiction",
    "Avengers",
    "Frozen",
    "The Shawshank Redemption",
    "Interstellar",
    "Parasite",
    "Joker",
    "Gladiator",
    "Coco",
    "Spirited Away",
    "La La Land",
    "Mad Max",
]

num_users = 15
num_movies = len(movies)

# User labels U1 ... U15
users = ["U" + str(n) for n in range(1, num_users + 1)]

# User x movie rating matrix, initialised to NaN (= not rated yet)
ratings = pd.DataFrame(
    np.full((num_users, num_movies), np.nan),
    index=users,
    columns=movies,
)

def assign_ratings(user_idx, min_movies, max_movies):
    """
    Give one user a random set of ratings.

    Draws how many movies to rate (between min_movies and max_movies,
    inclusive), picks that many distinct movie columns, and writes an integer
    rating from 1 to 5 into the global `ratings` matrix for each of them.
    """
    # Draw order matters for reproducibility: count, then subset, then ratings.
    how_many = random.randint(min_movies, max_movies)
    for col in random.sample(range(num_movies), how_many):
        ratings.iat[user_idx, col] = random.randint(1, 5)

# Rating density per user group, as required by the exercise:
#   U1..U5   -> 8–10 rated movies
#   U6..U10  -> 4–6 rated movies
#   U11..U15 -> 2–3 rated movies (the "new" users)
rating_plan = [
    (range(0, 5), 8, 10),
    (range(5, 10), 4, 6),
    (range(10, 15), 2, 3),
]

# Users are processed in row order so the seeded random stream is consumed
# in the same sequence on every run.
for user_rows, fewest, most in rating_plan:
    for u in user_rows:
        assign_ratings(u, fewest, most)

ratings


Unnamed: 0,Inception,The Godfather,Toy Story,The Dark Knight,Titanic,The Matrix,Forrest Gump,The Lion King,Pulp Fiction,Avengers,Frozen,The Shawshank Redemption,Interstellar,Parasite,Joker,Gladiator,Coco,Spirited Away,La La Land,Mad Max
U1,5.0,2.0,1.0,1.0,,,,1.0,4.0,,5.0,2.0,,,,,1.0,5.0,,
U2,,,,,3.0,,2.0,3.0,,2.0,,,,3.0,,2.0,,1.0,,4.0
U3,,,1.0,5.0,5.0,,,,,1.0,,1.0,4.0,,,,4.0,,3.0,
U4,,4.0,3.0,2.0,3.0,,4.0,,,,3.0,1.0,3.0,,,2.0,,,,2.0
U5,,,3.0,,3.0,1.0,4.0,2.0,2.0,,1.0,,,,,,3.0,1.0,,2.0
U6,,,3.0,,,,2.0,,,,3.0,,5.0,,5.0,2.0,,,,
U7,,,2.0,,,,,1.0,,,,1.0,1.0,4.0,,,,,5.0,
U8,,,3.0,,,4.0,,,,1.0,,,5.0,5.0,,,1.0,,,
U9,,,,3.0,,,2.0,,4.0,5.0,1.0,,,,,,,2.0,,
U10,,,,2.0,5.0,3.0,1.0,,,5.0,,,,,,,5.0,,,


### 2 - Normalize Ratings + Compute Similarity Between First 10 Users

In [10]:
# Normalization (mean-centering) and user–user similarity

import math

# Per-user average over the movies that user actually rated (NaNs are skipped)
mean_ratings = ratings.mean(axis="columns", skipna=True)

# Mean-centre every row by removing that user's own average
centered = ratings.sub(mean_ratings, axis="index")

# Unrated entries are still NaN after centering; replace them with 0
centered_filled = centered.fillna(value=0.0)

def cosine_sim(v1, v2):
    """
    Return the cosine similarity of two 1D vectors.

    The result is dot(v1, v2) / (|v1| * |v2|). If the product of the two
    norms is zero (at least one all-zero vector), 0.0 is returned.
    """
    dot_product = float(np.dot(v1, v2))
    norm_product = math.sqrt(float(np.dot(v1, v1))) * math.sqrt(float(np.dot(v2, v2)))
    # Guard against dividing by zero when a vector has no length.
    if norm_product != 0:
        return dot_product / norm_product
    return 0.0


# Score every unordered pair among U1..U10, then report the three most similar
pairs = [
    (
        (users[a], users[b]),
        cosine_sim(centered_filled.iloc[a].values, centered_filled.iloc[b].values),
    )
    for a in range(10)          # U1..U10
    for b in range(a + 1, 10)   # only pairs with b > a, so each pair appears once
]

# sorted() is stable, so tied pairs keep the order in which they were generated
top3_pairs = sorted(pairs, key=lambda pair: pair[1], reverse=True)[:3]

print("Top 3 most similar user pairs among U1..U10:")
for pair_users, s in top3_pairs:
    u1, u2 = pair_users
    print(f"{u1} - {u2}: similarity = {s:.3f}")


Top 3 most similar user pairs among U1..U10:
U9 - U10: similarity = 0.392
U4 - U5: similarity = 0.295
U6 - U10: similarity = 0.277


### 3 - Recommendations for New Users (U11–U15)

In [12]:
# Recommend movies for the new users

def recommend_for_new_user(new_user_idx, k_neighbors=3, n_recs=3, n_base_users=10):
    """
    Recommend unrated movies for one user from the ratings of similar users.

    Candidate neighbors are the first `n_base_users` rows of the rating
    matrix. Similarity is the cosine similarity of the mean-centered,
    zero-filled rating vectors, and only neighbors with strictly positive
    similarity are kept. Every movie the target user has not rated is scored
    with the similarity-weighted average of the raw ratings given by the
    neighbors who rated it; movies none of the neighbors rated get no score.

    Parameters
    ----------
    new_user_idx : int
        Row position of the target user in `ratings` / `centered_filled`.
    k_neighbors : int, default 3
        Maximum number of neighbors to use.
    n_recs : int, default 3
        Maximum number of recommendations to return.
    n_base_users : int, default 10
        Number of leading rows that form the neighbor pool. Values larger
        than the matrix are clamped to the number of users.

    Returns
    -------
    tuple
        `(neighbors, recs)` where `neighbors` is a list of
        `(user_row, similarity)` ordered from most to least similar, and
        `recs` is a list of `(movie_title, predicted_rating)` ordered from
        highest to lowest prediction. `recs` is empty when no candidate has
        positive similarity.
    """
    target_vec = centered_filled.iloc[new_user_idx].values
    pool_size = min(n_base_users, len(centered_filled))

    # Similarity to every candidate. The target row itself is skipped so that
    # a user who lies inside the pool is never chosen as their own neighbor.
    sims = [
        (i, cosine_sim(target_vec, centered_filled.iloc[i].values))
        for i in range(pool_size)
        if i != new_user_idx
    ]

    # Most similar first (stable sort: ties keep user order), positive only
    sims_sorted = sorted(sims, key=lambda x: x[1], reverse=True)
    neighbors = [x for x in sims_sorted if x[1] > 0][:k_neighbors]

    if not neighbors:
        return neighbors, []

    preds = {}

    for m_idx, movie in enumerate(movies):
        if not math.isnan(ratings.iat[new_user_idx, m_idx]):
            continue  # already rated by the target user

        # Weighted average over the neighbors that actually rated this movie
        num = 0.0
        den = 0.0
        for u_idx, sim in neighbors:
            r = ratings.iat[u_idx, m_idx]
            if not math.isnan(r):
                num += sim * r
                den += abs(sim)

        # den stays 0 when no neighbor rated the movie: leave it unscored
        if den > 0:
            preds[movie] = num / den

    # Highest predicted rating first; ties keep the order of `movies`
    recs = sorted(preds.items(), key=lambda x: x[1], reverse=True)[:n_recs]
    return neighbors, recs


# Report nearest neighbors and suggested movies for each new user (U11..U15)
for idx in range(10, 15):
    neighbors, recs = recommend_for_new_user(idx)
    user_id = users[idx]

    print(f"\n=== Recommendations for {user_id} ===")

    # Neighbors arrive ordered from most to least similar
    print("Closest users:")
    for neighbor in neighbors:
        u_idx, sim = neighbor
        print(f"  {users[u_idx]} with similarity {sim:.3f}")

    # Recommendations arrive ordered from highest to lowest predicted rating
    print("Recommended movies:")
    for rec in recs:
        movie, score = rec
        print(f"  {movie}: predicted rating {score:.2f}")



=== Recommendations for U11 ===
Closest users:
  U4 with similarity 0.422
  U1 with similarity 0.387
  U3 with similarity 0.302
Recommended movies:
  Spirited Away: predicted rating 5.00
  Forrest Gump: predicted rating 4.00
  Pulp Fiction: predicted rating 4.00

=== Recommendations for U12 ===
Closest users:
  U5 with similarity 0.183
  U4 with similarity 0.075
Recommended movies:
  The Godfather: predicted rating 4.00
  Forrest Gump: predicted rating 4.00
  Titanic: predicted rating 3.00

=== Recommendations for U13 ===
Closest users:
  U2 with similarity 0.577
  U5 with similarity 0.456
  U3 with similarity 0.302
Recommended movies:
  The Dark Knight: predicted rating 5.00
  Interstellar: predicted rating 4.00
  Coco: predicted rating 3.40

=== Recommendations for U14 ===
Closest users:
  U1 with similarity 0.387
  U8 with similarity 0.373
  U9 with similarity 0.251
Recommended movies:
  Inception: predicted rating 5.00
  Interstellar: predicted rating 5.00
  Parasite: predicted ra

### **Brief Explanation – Collaborative Filtering Implementation**

In this exercise we created a small recommendation system using user–user collaborative filtering.  
We first generated a dataset of 20 movies and 15 users, where each user rated a random subset of movies according to the assignment rules. The ratings matrix contains numeric values (1–5) and missing values for unrated movies.

To prepare the data, we performed **mean-centering**, subtracting each user's average rating from all of their ratings. Missing values were then filled with 0, which after centering corresponds to the user's own average (a neutral opinion), so that cosine similarity could be computed over full-length vectors.

Next, we calculated **cosine similarity** between the first 10 users and selected the **top 3 most similar user pairs**. This identifies users with similar rating behavior.

For the 5 new users (who rated very few movies), we computed their similarity to the first 10 users, selected up to **3 nearest neighbors** with positive similarity, and then predicted ratings for the movies they had not yet rated as a similarity-weighted average of those neighbors' ratings. Finally, we recommended the top 3 movies with the highest predicted rating for each new user (fewer if the neighbors rated fewer candidate movies).

This demonstrates the basic workflow of a collaborative filtering recommender system:  
data preparation → normalization → similarity calculation → neighbor selection → prediction → recommendations.
