# 1. Data Loading and Exploration

In [12]:
import random
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

In [13]:

# Source files (absolute paths as used by this notebook's runtime).
ratings_path = "/content/mk100/u.data"
items_path = "/content/mk100/u.item"

# Ratings file: tab-separated, no header row, four columns.
rating_columns = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(ratings_path, sep="\t", names=rating_columns)

# Item file: pipe-separated, latin-1 encoded; only the first two columns
# (id and title) are read.
movies = pd.read_csv(
    items_path,
    sep="|",
    encoding="latin-1",
    header=None,
    usecols=[0, 1],
    names=["movie_id", "title"],
)

# Last expression of the cell: preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [14]:
# Basic size statistics of the ratings table.
num_users = ratings["user_id"].nunique()
num_movies = ratings["movie_id"].nunique()
num_ratings = len(ratings)

for label, value in (
    ("Number of users:", num_users),
    ("Number of movies:", num_movies),
    ("Number of ratings:", num_ratings),
):
    print(label, value)

# Count of null entries in each column.
missing_per_column = ratings.isnull().sum()
print("\nMissing values per column:")
print(missing_per_column)

Number of users: 943
Number of movies: 1682
Number of ratings: 100000

Missing values per column:
user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64


# 2. Data Preprocessing

In [15]:

# Reshape the long ratings table into a user x movie matrix; cells for
# (user, movie) pairs without a rating are NaN.
user_item = pd.pivot(ratings, index="user_id", columns="movie_id", values="rating")


In [16]:

# Keep only users who have rated at least `min_ratings` movies.
min_ratings = 5
ratings_per_user = user_item.notna().sum(axis=1)
valid_users = ratings_per_user >= min_ratings
user_item = user_item[valid_users]


In [17]:

# Mean-centre every user's ratings: subtract the user's own average rating
# (NaN cells are skipped by the mean and stay NaN after subtraction).
user_means = user_item.mean(axis="columns")
user_item_normalized = user_item.subtract(user_means, axis="index")


# 3. Train–Test Split (Hidden Ratings)


In [18]:
# Fix the seed so the same test users are drawn on every run.
random.seed(42)

# Draw 20% of the users as test users.
users = user_item.index.tolist()
num_test_users = int(len(users) * 0.2)
test_users = random.sample(users, num_test_users)

# The training matrix starts as a full copy; held-out cells are blanked below.
train_matrix = user_item.copy()
hidden_ratings = []
for user in test_users:
    user_ratings = user_item.loc[user].dropna()

    # Only users with at least 5 ratings contribute held-out ratings.
    if len(user_ratings) >= 5:
        # Hold out 20% of this user's ratings (fixed seed per user).
        hidden = user_ratings.sample(frac=0.2, random_state=42)

        hidden_ratings.extend(
            (user, held_movie, held_rating)
            for held_movie, held_rating in hidden.items()
        )
        # Remove all held-out ratings of this user from the training data.
        train_matrix.loc[user, hidden.index] = np.nan

print("Total hidden ratings:", len(hidden_ratings))

Total hidden ratings: 3775


# 4. User-Based Collaborative Filtering (KNN)

Similarity Computation (Cosine)

In [19]:

# Per-user mean over the training ratings only (held-out cells are NaN).
train_means = train_matrix.mean(axis="columns")
train_normalized = train_matrix.subtract(train_means, axis="index")

# Unrated cells become 0 so they contribute nothing to the dot products.
train_filled = train_normalized.fillna(0)

# Pairwise cosine similarity between user rows, labelled by user id.
user_ids = train_filled.index
user_similarity = pd.DataFrame(
    cosine_similarity(train_filled),
    index=user_ids,
    columns=user_ids,
)
print(user_similarity.iloc[:5, :5])

user_id         1         2         3         4         5
user_id                                                  
1        1.000000  0.043411  0.011051  0.053735  0.134514
2        0.043411  1.000000  0.013658  0.030801  0.035770
3        0.011051  0.013658  1.000000 -0.051579  0.016037
4        0.053735  0.030801 -0.051579  1.000000  0.008871
5        0.134514  0.035770  0.016037  0.008871  1.000000


Rating Prediction Function (Manual KNN)

In [20]:
def predict_rating(user_id, movie_id, k):
    """Predict a rating with mean-centred, user-based KNN.

    The prediction is the user's mean training rating plus the
    similarity-weighted average of the neighbours' deviations from their own
    mean ratings, taken over the ``k`` most similar users who rated the movie.

    Reads the module-level ``train_matrix``, ``train_means`` and
    ``user_similarity``.

    Parameters
    ----------
    user_id
        Label of the target user in ``train_means`` / ``user_similarity``.
    movie_id
        Column label of the target movie in ``train_matrix``.
    k : int
        Maximum number of neighbours to use.

    Returns
    -------
    float
        The predicted rating. Falls back to the user's mean training rating
        when the movie is unknown, nobody else rated it, or the neighbours'
        similarities sum to zero in absolute value.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``train_means``.
    """
    user_mean = train_means[user_id]

    # A movie missing from the training matrix has no neighbours at all;
    # fall back to the user mean, as predict_rating_item does.
    if movie_id not in train_matrix.columns:
        return user_mean

    rated_users = train_matrix[movie_id].dropna().index
    # Never let the target user act as their own neighbour: their similarity
    # to themselves is 1 and would simply echo back their own rating.
    rated_users = rated_users[rated_users != user_id]

    if len(rated_users) == 0:
        return user_mean

    similarities = user_similarity.loc[user_id, rated_users]
    top_k = similarities.sort_values(ascending=False).head(k)

    # Each neighbour's rating of the movie, relative to that neighbour's mean.
    neighbor_ratings = train_matrix.loc[top_k.index, movie_id]
    deviations = neighbor_ratings - train_means.loc[top_k.index]

    numerator = (top_k * deviations).sum()
    denominator = top_k.abs().sum()

    if denominator == 0:
        return user_mean

    return user_mean + numerator / denominator

# 5. Evaluation

RMSE / MSE Computation

In [21]:

def evaluate(k):
    """Score the user-based predictor on the held-out ratings.

    Calls ``predict_rating`` with neighbourhood size ``k`` for every
    ``(user_id, movie_id, rating)`` triple in the module-level
    ``hidden_ratings`` and compares the predictions with the true ratings.

    Returns
    -------
    tuple
        ``(mse, rmse)`` where ``rmse`` is the square root of ``mse``.
    """
    y_true = [actual for _, _, actual in hidden_ratings]
    y_pred = [
        predict_rating(uid, mid, k)
        for uid, mid, _ in hidden_ratings
    ]

    mse = mean_squared_error(y_true, y_pred)
    return mse, np.sqrt(mse)

Experiments with Different k

In [22]:

# Evaluate the user-based model for several neighbourhood sizes and report
# MSE / RMSE for each one as soon as it is computed.
neighbourhood_sizes = [5, 10, 20]
for k, (mse, rmse) in zip(neighbourhood_sizes, map(evaluate, neighbourhood_sizes)):
    print(f"k = {k} | MSE = {mse:.4f} | RMSE = {rmse:.4f}")

k = 5 | MSE = 0.9213 | RMSE = 0.9599
k = 10 | MSE = 0.8706 | RMSE = 0.9331
k = 20 | MSE = 0.8473 | RMSE = 0.9205


Compare Predicted Ratings vs. Actual Ratings (Test Set)

In [23]:

# Neighbourhood size used for the side-by-side table.
k_eval = 10

# One row per held-out rating: ids, the true rating and the prediction.
comparison_data = [
    [uid, mid, actual, predict_rating(uid, mid, k=k_eval)]
    for uid, mid, actual in hidden_ratings
]

comparison_columns = ["user_id", "movie_id", "actual_rating", "predicted_rating"]
comparison_df = pd.DataFrame(comparison_data, columns=comparison_columns)

# Show the first 20 rows as plain text without the index column.
print("\nPredicted vs. Actual Ratings (Test Set Sample):\n")
print(comparison_df.head(20).to_string(index=False))


Predicted vs. Actual Ratings (Test Set Sample):

 user_id  movie_id  actual_rating  predicted_rating
     655       511            3.0          3.650915
     655      1113            3.0          2.705572
     655       316            4.0          3.221141
     655       581            2.0          2.358218
     655       251            3.0          3.879997
     655       972            3.0          2.732763
     655       291            3.0          2.421205
     655       698            4.0          2.594833
     655       210            3.0          2.785797
     655      1208            3.0          2.428941
     655      1121            3.0          3.011883
     655       789            3.0          2.987149
     655       318            4.0          4.041670
     655       149            4.0          2.794502
     655       479            4.0          3.626545
     655       505            3.0          3.265693
     655       474            3.0          4.274230
     655      

In [24]:
# Error analysis of the user-based model: absolute prediction error grouped
# by user and by movie. (`defaultdict` is imported in the notebook's import
# cell rather than mid-notebook.)
k_eval = 10

# Absolute errors collected per user id and per movie id.
user_errors = defaultdict(list)
movie_errors = defaultdict(list)

for user_id, movie_id, true_rating in hidden_ratings:
    predicted = predict_rating(user_id, movie_id, k=k_eval)
    abs_error = abs(true_rating - predicted)

    user_errors[user_id].append(abs_error)
    movie_errors[movie_id].append(abs_error)

# Mean absolute error for each user / movie that has held-out ratings.
avg_error_per_user = {user: np.mean(errors) for user, errors in user_errors.items()}
avg_error_per_movie = {movie: np.mean(errors) for movie, errors in movie_errors.items()}

# Print the 15 lowest ids of each kind as a sample.
print("\n=== Average Absolute Error per User (Sample) ===")
for user_id in sorted(avg_error_per_user)[:15]:
    print(f"User {user_id:3d} | Avg Error: {avg_error_per_user[user_id]:.4f}")

print("\n=== Average Absolute Error per Movie (Sample) ===")
for movie_id in sorted(avg_error_per_movie)[:15]:
    print(f"Movie {movie_id:4d} | Avg Error: {avg_error_per_movie[movie_id]:.4f}")

# The ten movies with the largest mean absolute error.
hardest_movies = sorted(avg_error_per_movie.items(), key=lambda x: x[1], reverse=True)[:10]
print("\n=== Hardest Movies to Predict (Top 10 by Avg Error) ===")
for movie_id, avg_error in hardest_movies:
    print(f"Movie {movie_id:4d} | Avg Error: {avg_error:.4f}")


=== Average Absolute Error per User (Sample) ===
User   4 | Avg Error: 0.7026
User   7 | Avg Error: 0.7369
User  12 | Avg Error: 0.5712
User  26 | Avg Error: 0.5471
User  28 | Avg Error: 0.7216
User  31 | Avg Error: 0.7777
User  33 | Avg Error: 0.5042
User  45 | Avg Error: 0.5198
User  47 | Avg Error: 0.6321
User  49 | Avg Error: 0.7912
User  58 | Avg Error: 0.9427
User  66 | Avg Error: 1.2216
User  68 | Avg Error: 1.0011
User  72 | Avg Error: 0.7086
User  74 | Avg Error: 0.4480

=== Average Absolute Error per Movie (Sample) ===
Movie    1 | Avg Error: 0.5396
Movie    2 | Avg Error: 0.2271
Movie    3 | Avg Error: 0.8837
Movie    4 | Avg Error: 0.4837
Movie    5 | Avg Error: 1.2614
Movie    7 | Avg Error: 0.8462
Movie    8 | Avg Error: 1.0354
Movie    9 | Avg Error: 0.5980
Movie   10 | Avg Error: 0.6505
Movie   11 | Avg Error: 0.8151
Movie   12 | Avg Error: 0.7066
Movie   13 | Avg Error: 0.2127
Movie   14 | Avg Error: 0.7339
Movie   15 | Avg Error: 0.5675
Movie   17 | Avg Error: 0.2897

# Bonus: Item-Based Collaborative Filtering and Comparison

In [25]:

# Centre every movie column on that movie's mean training rating.
item_means = train_matrix.mean(axis="index")
train_matrix_item_norm = train_matrix.subtract(item_means, axis="columns")

# Unrated cells become 0 so they contribute nothing to the dot products.
train_matrix_item_filled = train_matrix_item_norm.fillna(0)

# Pairwise cosine similarity between movie columns, labelled by movie id.
movie_ids = train_matrix_item_filled.columns
item_similarity = cosine_similarity(train_matrix_item_filled.transpose())
item_similarity_df = pd.DataFrame(item_similarity, index=movie_ids, columns=movie_ids)
print(item_similarity_df.iloc[:5, :5])

# NOTE: this repeats the "hardest movies" ranking that the error-analysis
# cell already computes from avg_error_per_movie.
print("\n=== Hardest Movies to Predict (Top 10 by Avg Error) ===")
hardest_movies = sorted(
    avg_error_per_movie.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]
for movie_id, avg_error in hardest_movies:
    print(f"Movie {movie_id:4d} | Avg Error: {avg_error:.4f}")

movie_id         1         2         3         4         5
movie_id                                                  
1         1.000000  0.090505  0.076852  0.040600  0.118551
2         0.090505  1.000000  0.073904  0.136429  0.062874
3         0.076852  0.073904  1.000000 -0.052489  0.028386
4         0.040600  0.136429 -0.052489  1.000000 -0.118517
5         0.118551  0.062874  0.028386 -0.118517  1.000000

=== Hardest Movies to Predict (Top 10 by Avg Error) ===
Movie  553 | Avg Error: 3.3628
Movie 1037 | Avg Error: 3.1803
Movie  253 | Avg Error: 3.1291
Movie 1598 | Avg Error: 3.0555
Movie 1245 | Avg Error: 2.9526
Movie   74 | Avg Error: 2.8782
Movie  171 | Avg Error: 2.8221
Movie 1620 | Avg Error: 2.7682
Movie  422 | Avg Error: 2.7542
Movie  355 | Avg Error: 2.7433


In [26]:

def predict_rating_item(user_id, movie_id, k=10):
    """Predict a rating with item-based KNN.

    The prediction is the user's mean training rating plus the
    similarity-weighted average of the user's deviations from that mean on
    the ``k`` movies most similar to ``movie_id`` that the user has rated.

    Reads the module-level ``train_matrix`` and ``item_similarity_df``.

    Parameters
    ----------
    user_id
        Row label of the target user in ``train_matrix``.
    movie_id
        Label of the target movie in ``item_similarity_df``.
    k : int, default 10
        Maximum number of neighbouring movies to use.

    Returns
    -------
    float
        The predicted rating. Falls back to the user's mean training rating
        when the movie is unknown, the user has rated no other movie, or the
        neighbours' similarities sum to zero in absolute value.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row of ``train_matrix``.
    """
    user_row = train_matrix.loc[user_id]
    # Mean over the user's rated movies (NaN cells are skipped); computed
    # once instead of on every loop iteration.
    user_mean = user_row.mean()

    if movie_id not in item_similarity_df.columns:
        return user_mean

    rated_movies = user_row.dropna().index
    # Never let the target movie act as its own neighbour: its similarity to
    # itself is 1 and would simply echo back the user's existing rating.
    rated_movies = rated_movies[rated_movies != movie_id]

    if len(rated_movies) == 0:
        return user_mean

    similarities = item_similarity_df.loc[movie_id, rated_movies]
    top_k = similarities.sort_values(ascending=False).head(k)

    # The user's rating of each neighbouring movie, relative to the user mean.
    deviations = user_row.loc[top_k.index] - user_mean

    numerator = (top_k * deviations).sum()
    denominator = top_k.abs().sum()

    if denominator == 0:
        return user_mean

    return user_mean + numerator / denominator


In [27]:

# True ratings and both models' predictions for every held-out rating,
# all in the order of `hidden_ratings`.
y_true = [actual for _, _, actual in hidden_ratings]
y_pred_user = [predict_rating(uid, mid, k=10) for uid, mid, _ in hidden_ratings]
y_pred_item = [predict_rating_item(uid, mid, k=10) for uid, mid, _ in hidden_ratings]

# MSE and its square root (RMSE) for each model.
mse_user = mean_squared_error(y_true, y_pred_user)
mse_item = mean_squared_error(y_true, y_pred_item)
rmse_user = np.sqrt(mse_user)
rmse_item = np.sqrt(mse_item)

print("=== Collaborative Filtering Comparison ===")
print(f"User-Based CF  | MSE: {mse_user:.4f} | RMSE: {rmse_user:.4f}")
print(f"Item-Based CF  | MSE: {mse_item:.4f} | RMSE: {rmse_item:.4f}")

=== Collaborative Filtering Comparison ===
User-Based CF  | MSE: 0.8706 | RMSE: 0.9331
Item-Based CF  | MSE: 0.9073 | RMSE: 0.9525


In [28]:

# One row per held-out rating with the true value and both predictions.
comparison = pd.DataFrame({
    "user_id": [entry[0] for entry in hidden_ratings],
    "movie_id": [entry[1] for entry in hidden_ratings],
    "actual": y_true,
    "pred_user": y_pred_user,
    "pred_item": y_pred_item,
})

# Absolute error of each model on every row.
user_abs_error = (comparison["actual"] - comparison["pred_user"]).abs()
item_abs_error = (comparison["actual"] - comparison["pred_item"]).abs()

# The user-based model wins only when strictly closer; ties are labelled
# "Item-Based".
comparison["better_model"] = np.where(
    user_abs_error < item_abs_error,
    "User-Based",
    "Item-Based",
)

better_counts = comparison["better_model"].value_counts()
print("\nNumber of ratings better predicted by each model:")
print(better_counts)


Number of ratings better predicted by each model:
better_model
User-Based    1922
Item-Based    1853
Name: count, dtype: int64
