### Import Libraries and Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV

In [2]:
train_df = pd.read_csv('/Users/lamhoichun/Desktop/STAT3009/Lecture/train.csv')
test_df = pd.read_csv('/Users/lamhoichun/Desktop/STAT3009/Lecture/test.csv')

In [3]:
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')
test_df = test_df.drop(columns=['Unnamed: 0'], errors='ignore')

X_train = train_df[['user_id', 'item_id']].values
y_train = train_df['rating'].values

X_test = test_df[['user_id', 'item_id']].values
y_test = test_df['rating'].values if 'rating' in test_df.columns else None

n_user = max(train_df['user_id'].max(), test_df['user_id'].max()) + 1
n_item = max(train_df['item_id'].max(), test_df['item_id'].max()) + 1

### User Average RS

In [4]:
# Define the class for User Average RS
class UserAvg(BaseEstimator):
    def __init__(self, n_user, min_data=0):
        self.n_user = n_user
        self.min_data = min_data
        self.global_avg = 0
        self.user_avg = None

    def fit(self, X, y):
        self.global_avg = np.mean(y)
        self.user_avg = np.zeros(self.n_user)
        for user_temp in range(self.n_user):
            user_index = np.where(X[:,0] == user_temp)[0]
            if len(user_index) <= self.min_data:
                self.user_avg[user_temp] = self.global_avg
            else:
                self.user_avg[user_temp] = np.mean(y[user_index])
        return self

    def predict(self, X):
        return self.user_avg[X[:,0]]

In [5]:
# GridSearchCV for UserAvg
user_model = UserAvg(n_user=n_user)
param_grid_user = {'min_data': [0, 1, 2, 3, 5, 10]}

grid_user = GridSearchCV(user_model, param_grid_user, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_user.fit(X_train, y_train)

print("Best UserAvg params:", grid_user.best_params_)
print("Best UserAvg RMSE:", -grid_user.best_score_)

Best UserAvg params: {'min_data': 10}
Best UserAvg RMSE: 0.4758547556814469


In [6]:
best_user_model = grid_user.best_estimator_
user_preds = best_user_model.predict(X_test)

### Item Average RS

In [7]:
# Class for Item Average RS
class ItemAvg(BaseEstimator):
    def __init__(self, n_item, min_data=0):
        self.n_item = n_item
        self.min_data = min_data
        self.global_avg = 0
        self.item_avg = None

    def fit(self, X, y):
        self.global_avg = np.mean(y)
        self.item_avg = np.zeros(self.n_item)
        for item_temp in range(self.n_item):
            item_index = np.where(X[:,1] == item_temp)[0]
            if len(item_index) <= self.min_data:
                self.item_avg[item_temp] = self.global_avg
            else:
                self.item_avg[item_temp] = np.mean(y[item_index])
        return self

    def predict(self, X):
        return self.item_avg[X[:,1]]



# ===============================
# Define RMSE scorer
# ===============================
# rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False)


In [8]:
# GridSearchCV for ItemAvg
item_model = ItemAvg(n_item=n_item)
param_grid_item = {'min_data': [0, 1, 2, 3, 5, 10]}

grid_item = GridSearchCV(item_model, param_grid_item, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_item.fit(X_train, y_train)

print("Best ItemAvg params:", grid_item.best_params_)
print("Best ItemAvg RMSE:", -grid_item.best_score_)

Best ItemAvg params: {'min_data': 0}
Best ItemAvg RMSE: 0.12554968649627257


In [9]:
# Grid-Search for user_average model

# Import the library for the GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
hp_grid = {'min_data': [0,1,2,3]}

myuser = UserAvg(n_user=n_user)
gs_user = GridSearchCV(estimator = myuser, 
                        param_grid = hp_grid, 
                        cv = 5, 
                        scoring = 'neg_root_mean_squared_error')

# Fit the model and make predictions
gs_user.fit(X_train, y_train)
y_pred = gs_user.predict(X_test)

# Print the best hyper-parameters
print(f"Best parameters: {gs_user.best_params_}")
# Print the best estimator
print(f"Best estimator: {gs_user.best_estimator_}")

Best parameters: {'min_data': 0}
Best estimator: UserAvg(n_user=np.int64(199))


In [None]:
best_item_model = grid_item.best_estimator_
item_preds = best_item_model.predict(X_test)

# Save predictions
test_df['user_pred'] = user_preds
test_df['item_pred'] = item_preds
print(test_df.head())

   user_id  item_id  user_pred  item_pred
0      101       54   1.262194   1.581640
1       18       81   1.516016   1.908355
2       28       16   1.413403   0.954393
3      152       63   1.421607   1.726933
4       22       15   1.478470   0.893804


### Hybrid Model

In [11]:
# Class for Hybrid Model
class HybridAvg(BaseEstimator):
    def __init__(self, n_user, n_item, min_data_user=0, min_data_item=0, alpha=0.5):
        self.n_user = n_user
        self.n_item = n_item
        self.min_data_user = min_data_user
        self.min_data_item = min_data_item
        self.alpha = alpha
        self.user_model = None
        self.item_model = None

    def fit(self, X, y):
        # Train user model
        self.user_model = UserAvg(n_user=self.n_user, min_data=self.min_data_user)
        self.user_model.fit(X, y)

        # Train item model
        self.item_model = ItemAvg(n_item=self.n_item, min_data=self.min_data_item)
        self.item_model.fit(X, y)

        return self

    def predict(self, X):
        user_preds = self.user_model.predict(X)
        item_preds = self.item_model.predict(X)
        return self.alpha * user_preds + (1 - self.alpha) * item_preds

In [14]:
param_grid_hybrid = {
    'min_data_user': [0, 2, 5, 10],
    'min_data_item': [0, 2, 5, 10],
    'alpha': [0.0, 0.25, 0.5, 0.75, 1.0]
}

hybrid_model = HybridAvg(n_user=n_user, n_item=n_item)

grid_hybrid = GridSearchCV(hybrid_model, param_grid_hybrid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_hybrid.fit(X_train, y_train)

print("Best Hybrid params:", grid_hybrid.best_params_)
print("Best Hybrid RMSE:", -grid_hybrid.best_score_)

# Evaluate on test set
best_hybrid = grid_hybrid.best_estimator_
hybrid_preds = best_hybrid.predict(X_test)

if y_test is not None:
    print("Test RMSE Hybrid:", np.sqrt(mean_squared_error(y_test, hybrid_preds)))

test_df['hybrid_pred'] = hybrid_preds
print(test_df.head())


Best Hybrid params: {'alpha': 0.0, 'min_data_item': 0, 'min_data_user': 0}
Best Hybrid RMSE: 0.12554968649627257
   user_id  item_id  user_pred  item_pred  hybrid_pred
0      101       54   1.262194   1.581640     1.581640
1       18       81   1.516016   1.908355     1.908355
2       28       16   1.413403   0.954393     0.954393
3      152       63   1.421607   1.726933     1.726933
4       22       15   1.478470   0.893804     0.893804
