In [3]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import mean_squared_error

# Load the train and test tables from the processed-data directory.
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')

# Report the number of rows in each split.
print('train size:', train.shape[0])
print('test size:', test.shape[0])

train size: 8196
test size: 2049


In [4]:
# Global-mean predictor: the average of every rating in train.
global_mean = train['rating'].mean()
print('Global Mean:', global_mean)

# Group the training ratings by user and by item in a single pass.
# The columns are iterated directly instead of via DataFrame.iterrows():
# iterrows() builds a Series per row and coerces all of a row's values to one
# common dtype, whereas column iteration keeps each column's own dtype.
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u, i, r in zip(train['user_id'], train['item_id'], train['rating']):
    ratingsPerUser[u].append(r)
    ratingsPerItem[i].append(r)

# Mean rating for each user seen in train.
userMean = {u: np.mean(rs) for u, rs in ratingsPerUser.items()}

# Mean rating for each item seen in train.
itemMean = {i: np.mean(rs) for i, rs in ratingsPerItem.items()}

Global Mean: 4.2269399707174236


In [5]:
# Bias Model Structures
# ratingsPerUser_bias[u] -> list of (item_id, rating) pairs from train
# ratingsPerItem_bias[i] -> list of (user_id, rating) pairs from train
ratingsPerUser_bias = defaultdict(list)
ratingsPerItem_bias = defaultdict(list)

# Iterate the three columns directly rather than DataFrame.iterrows():
# iterrows() coerces each row to a single dtype (so integer ids can become
# floats) and allocates a Series per row.
for u, i, r in zip(train['user_id'], train['item_id'], train['rating']):
    ratingsPerUser_bias[u].append((i, r))
    ratingsPerItem_bias[i].append((u, r))

In [6]:
# Bias Model Training
alpha = global_mean
betaU = defaultdict(float)
betaI = defaultdict(float)
lamb = 5
max_iter = 20

def update_alpha(train, betaU, betaI):
    """Re-estimate the global offset as the mean residual over ``train``.

    For each row the residual is ``rating - (betaU[user] + betaI[item])``,
    where an id missing from a bias mapping contributes a bias of 0.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide ``user_id``, ``item_id`` and ``rating`` columns.
    betaU, betaI : dict-like
        Current user and item biases keyed by id; only ``.get`` is used, so
        the mappings are never modified.

    Returns
    -------
    float
        Sum of the residuals divided by ``len(train)``.

    Raises
    ------
    ZeroDivisionError
        If ``train`` has no rows.
    """
    numer = 0
    # Iterate the columns directly instead of DataFrame.iterrows(): iterrows()
    # coerces every row to one dtype, so integer ids sitting next to a float
    # rating column are turned into floats (ids above 2**53 then collide and
    # look up the wrong bias). Column iteration keeps the ids exact and avoids
    # allocating a Series per row.
    for u, i, r in zip(train['user_id'], train['item_id'], train['rating']):
        numer += r - (betaU.get(u, 0) + betaI.get(i, 0))
    return numer / len(train)

def update_betaU(ratingsPerUser, alpha, betaI, lamb):
    """Compute a fresh bias for every user in ``ratingsPerUser``.

    Each user's bias is the sum of ``rating - (alpha + betaI[item])`` over
    that user's ``(item, rating)`` pairs, divided by ``lamb`` plus the number
    of pairs. Items missing from ``betaI`` count as bias 0.

    Parameters
    ----------
    ratingsPerUser : dict
        Maps user id -> list of ``(item_id, rating)`` pairs.
    alpha : float
        Current global offset.
    betaI : dict-like
        Current item biases keyed by item id.
    lamb : number
        Value added to the rating count in the denominator.

    Returns
    -------
    dict
        A new mapping of user id -> bias; the inputs are left untouched.
    """
    user_bias = {}
    for user in ratingsPerUser:
        rated = ratingsPerUser[user]
        residual_total = 0
        # Accumulate left to right with an explicit loop so the floating-point
        # summation order stays fixed.
        for item, rating in rated:
            item_bias = betaI.get(item, 0)
            residual_total += rating - (alpha + item_bias)
        user_bias[user] = residual_total / (lamb + len(rated))
    return user_bias

def update_betaI(ratingsPerItem, alpha, betaU, lamb):
    """Compute a fresh bias for every item in ``ratingsPerItem``.

    Each item's bias is the sum of ``rating - (alpha + betaU[user])`` over
    that item's ``(user, rating)`` pairs, divided by ``lamb`` plus the number
    of pairs. Users missing from ``betaU`` count as bias 0.

    Parameters
    ----------
    ratingsPerItem : dict
        Maps item id -> list of ``(user_id, rating)`` pairs.
    alpha : float
        Current global offset.
    betaU : dict-like
        Current user biases keyed by user id.
    lamb : number
        Value added to the rating count in the denominator.

    Returns
    -------
    dict
        A new mapping of item id -> bias; the inputs are left untouched.
    """
    item_bias = {}
    for item in ratingsPerItem:
        raters = ratingsPerItem[item]
        residual_total = 0
        # Accumulate left to right with an explicit loop so the floating-point
        # summation order stays fixed.
        for user, rating in raters:
            user_bias = betaU.get(user, 0)
            residual_total += rating - (alpha + user_bias)
        item_bias[item] = residual_total / (lamb + len(raters))
    return item_bias

In [7]:
for t in range(max_iter):
    # Update the offset, then the user biases, then the item biases; each
    # step uses the values just produced by the previous one.
    alpha = update_alpha(train, betaU, betaI)
    betaU = update_betaU(ratingsPerUser_bias, alpha, betaI, lamb)
    betaI = update_betaI(ratingsPerItem_bias, alpha, betaU, lamb)

    # compute train MSE on predictions clipped to [1, 5]
    # NOTE(review): this is the plain clipped MSE with no `lamb` term, so it
    # is presumably not the exact quantity the updates minimise and need not
    # decrease on every iteration -- confirm.
    # Columns are iterated directly (not via DataFrame.iterrows()) so that no
    # per-row Series is built on each of the `max_iter` passes and ids keep
    # their own dtype.
    mse = 0
    for u, i, r in zip(train['user_id'], train['item_id'], train['rating']):
        pred = alpha + betaU.get(u, 0) + betaI.get(i, 0)
        pred = max(1, min(5, pred))
        mse += (r - pred)**2
    mse /= len(train)

    print(f"Iter {t+1}: alpha={alpha:.2f}, MSE={mse:.2f}")


Iter 1: alpha=4.23, MSE=0.73
Iter 2: alpha=4.21, MSE=0.74
Iter 3: alpha=4.20, MSE=0.74
Iter 4: alpha=4.19, MSE=0.74
Iter 5: alpha=4.19, MSE=0.74
Iter 6: alpha=4.19, MSE=0.74
Iter 7: alpha=4.19, MSE=0.74
Iter 8: alpha=4.19, MSE=0.74
Iter 9: alpha=4.19, MSE=0.74
Iter 10: alpha=4.19, MSE=0.74
Iter 11: alpha=4.19, MSE=0.74
Iter 12: alpha=4.19, MSE=0.74
Iter 13: alpha=4.19, MSE=0.74
Iter 14: alpha=4.19, MSE=0.74
Iter 15: alpha=4.19, MSE=0.74
Iter 16: alpha=4.19, MSE=0.74
Iter 17: alpha=4.19, MSE=0.74
Iter 18: alpha=4.19, MSE=0.74
Iter 19: alpha=4.19, MSE=0.74
Iter 20: alpha=4.19, MSE=0.74


In [8]:
### Baseline Test MSEs
# Global Mean Test MSE: the same constant prediction for every test row.
gm_preds = [global_mean] * len(test)
mse_gm = mean_squared_error(test['rating'], gm_preds)
print("Global Mean Test MSE:", mse_gm)

# User Mean Test MSE: users that never appear in train fall back to the
# global mean. The id column is iterated directly instead of via
# DataFrame.iterrows(), which would build a Series per row and coerce the
# row's values to a common dtype.
user_preds = [userMean.get(u, global_mean) for u in test['user_id']]
mse_user = mean_squared_error(test['rating'], user_preds)
print("User Mean Test MSE:", mse_user)

# Item Mean Test MSE: items that never appear in train fall back to the
# global mean.
item_preds = [itemMean.get(i, global_mean) for i in test['item_id']]
mse_item = mean_squared_error(test['rating'], item_preds)
print("Item Mean Test MSE:", mse_item)

Global Mean Test MSE: 1.5591178641582832
User Mean Test MSE: 1.9124749905308376
Item Mean Test MSE: 1.9867923491024322


In [9]:
# Evaluate Bias Model on Test
# Prediction is alpha + user bias + item bias, clipped to [1.0, 5.0]; ids that
# have no learned bias contribute 0. The id columns are iterated directly
# instead of via DataFrame.iterrows(), which would build a Series per row and
# coerce the row's values to a common dtype.
test_preds = []

for u, i in zip(test['user_id'], test['item_id']):
    pred = alpha + betaU.get(u, 0) + betaI.get(i, 0)
    pred = max(1.0, min(5.0, pred))
    test_preds.append(pred)

mse_test = mean_squared_error(test['rating'], test_preds)
print("Bias model Test MSE:", mse_test)

Bias model Test MSE: 1.437052575369529


In [10]:
# Collect the test MSE of every baseline into one table.
results = {
    "Model": ["Global Mean", "User Mean", "Item Mean", "Bias Model"],
    "Test MSE": [mse_gm, mse_user, mse_item, mse_test]
}

df_results = pd.DataFrame(results)

# save
output_path = "../results/tables/baseline_results.csv"
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_results.to_csv(output_path, index=False)

print("Saved baseline table to:", output_path)
df_results


Saved baseline table to: ../results/tables/baseline_results.csv


Unnamed: 0,Model,Test MSE
0,Global Mean,1.559118
1,User Mean,1.912475
2,Item Mean,1.986792
3,Bias Model,1.437053
