In [1]:
# Import neccessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics import root_mean_squared_error as RMSE
from sklearn.base import BaseEstimator

In [2]:
# Import the dataset from kaggle compeition: CUHK_STAT3009_2025
import pandas as pd
test_data = pd.read_csv('/Users/lamhoichun/Desktop/STAT3009/Lecture/test.csv')
train_data = pd.read_csv('/Users/lamhoichun/Desktop/STAT3009/Lecture/train.csv') 

# Check the first few rows of the train_data
print(train_data)
print(test_data)

      Unnamed: 0  user_id  item_id    rating
0              0        9       55  1.807853
1              1      142       54  1.450524
2              2      126       23  0.972784
3              3        1       12  1.413113
4              4      164       57  1.467390
...          ...      ...      ...       ...
3512        3512      187       28  1.034759
3513        3513       92       93  2.022145
3514        3514      152       69  1.688438
3515        3515      138       70  1.666404
3516        3516       35       86  2.132165

[3517 rows x 4 columns]
     Unnamed: 0  user_id  item_id
0             0      101       54
1             1       18       81
2             2       28       16
3             3      152       63
4             4       22       15
..          ...      ...      ...
875         875       62       62
876         876        7       16
877         877      189       32
878         878      130       59
879         879      108       80

[880 rows x 3 columns]


In [3]:
test_data = test_data.values
train_data = train_data.values
X_train = train_data[:,1:3]
y_train = train_data[:,3]
X_test = test_data[:,1:3]

In [4]:
# Base Estimator Class: User Average RS
class UserAvg (BaseEstimator):
    def __init__(self, n_user, min_data=0):
        # Setup parameters
        self.n_user = n_user
        # Fitting parameters
        self.global_avg = 0
        self.user_avg = np.zeros(n_user)
        # Hyper-parameters
        self.min_data = 3
    
    def fit(self, X, y):
        # Find the global average
        self.global_avg = np.mean(y)

        # Find the user average
        for user_temp in range(self.n_user):
            # Find the index of the user
            # user_index: the records of that user
            user_index = np.where(X[:,0] == user_temp)[0]
            if len(user_index) <= self.min_data:
                # Use global average when a user has less than min_data records
                self.user_avg[user_temp] = self.global_avg
            else:
                # Compute the user average of existing users
                self.user_avg[user_temp] = np.mean(y[user_index])
        return self
    
    def predict(self, X):
        # Get the user_index
        user_index = X[:,0]-1
        return self.user_avg[user_index]

        # Make predictions
        # y = np.ones(len(X))
        # return y * self.user_avg[X[:,0]]    

In [5]:
n_user = len(set(set(X_train[:,0]).union(set(X_test[:,0]))))
print(f"Number of users: {n_user}")

myuser = UserAvg(n_user=n_user)
myuser.fit(X_train, y_train)
y_pred = myuser.predict(X_test)


Number of users: 198


In [6]:
# Store y_pred in submission.csv with column Id and rating
submission = pd.DataFrame({'Id': test_data[:,0], 'rating': y_pred})
submission.to_csv('submission.csv', index=False)


In [7]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
hp_grid = {'min_data': [0, 3, 5, 10]}

myuser = UserAvg(n_user=n_user)
gs_user = GridSearchCV(estimator = myuser, 
                        param_grid = hp_grid, 
                        cv = 3, 
                        scoring = 'neg_root_mean_squared_error')

# Fit the model and make predictions
gs_user.fit(X_train, y_train)
y_pred = gs_user.predict(X_test)

# Print the best hyper-parameters
print(f"Best parameters: {gs_user.best_params_}")
# Print the best estimator
print(f"Best estimator: {gs_user.best_estimator_}")

Best parameters: {'min_data': 0}
Best estimator: UserAvg(n_user=198)


Traceback (most recent call last):
  File "/Users/lamhoichun/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/lamhoichun/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/lamhoichun/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
  File "/Users/lamhoichun/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
  File "/Users/lamhoichun/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_response.py", line 242, in _get_response_values
    y_pred, pos_label = prediction_method(X), None
  File "/var/folders/6n/9c4tnn9s11jd3vyn0hknk12h0000gn/T/ipykernel_15361

In [8]:
# Compute the n_item correctly - considering all unique items from both train and test
n_item = len(set(list(X_train[:,1]) + list(X_test[:,1])))
print(f"Number of items: {n_item}")

# Base Estimator Class: Item Average RS  
class ItemAvg (BaseEstimator):
    def __init__(self, min_data=0):
        # We'll determine n_item dynamically
        self.min_data = min_data
        self.global_avg = 0
        self.item_avg = None
        self.item_to_idx = {}
    
    def fit(self, X, y):
        # Find the global average
        self.global_avg = np.mean(y)

        # Get unique items and create mapping
        unique_items = sorted(set(X[:,1]))
        self.item_to_idx = {item_id: idx for idx, item_id in enumerate(unique_items)}
        
        # Initialize item averages array
        self.item_avg = np.zeros(len(unique_items))

        # Compute averages for each item
        for i, item_id in enumerate(unique_items):
            item_records = np.where(X[:,1] == item_id)[0]
            if len(item_records) <= self.min_data:
                self.item_avg[i] = self.global_avg
            else:
                self.item_avg[i] = np.mean(y[item_records])
        return self
    
    def predict(self, X):
        item_ids = X[:,1]
        predictions = []
        for item_id in item_ids:
            if item_id in self.item_to_idx:
                predictions.append(self.item_avg[self.item_to_idx[item_id]])
            else:
                # Use global average for completely unseen items
                predictions.append(self.global_avg)
        return np.array(predictions)

Number of items: 98


In [9]:
# Predict the ratings using item average model
myitem = ItemAvg(min_data=0)  # Remove n_item parameter since we'll compute it dynamically
myitem.fit(X_train, y_train)
y_pred = myitem.predict(X_test)

# Store y_pred in submission.csv with column Id and rating
submission = pd.DataFrame({'Id': test_data[:,0], 'rating': y_pred})  # Use test_data[:,0] for Id column
submission.to_csv('submission.csv', index=False)

In [10]:
# Base Estimator Class: Item Average RS with dual thresholds
class ItemAvg (BaseEstimator):
    def __init__(self, min_data=0, min_item=0):
        # Hyper-parameters
        self.min_data = min_data
        self.min_item = min_item
        # Fitting parameters (will be set during fit)
        self.global_avg = 0
        self.item_avg = None
        self.item_to_idx = {}
    
    def fit(self, X, y):
        # Find the global average
        self.global_avg = np.mean(y)

        # Get unique items
        unique_items = sorted(set(X[:,1]))
        self.item_to_idx = {item_id: idx for idx, item_id in enumerate(unique_items)}
        
        # Initialize item averages array
        self.item_avg = np.zeros(len(unique_items))

        # Compute averages for each item
        for i, item_id in enumerate(unique_items):
            item_records = np.where(X[:,1] == item_id)[0]
            
            if len(item_records) <= self.min_data:
                # Use global average when an item has less than min_data records
                self.item_avg[i] = self.global_avg
            elif len(item_records) < self.min_item:
                # Use global average when item doesn't meet min_item threshold
                self.item_avg[i] = self.global_avg
            else:
                # Compute the item average
                self.item_avg[i] = np.mean(y[item_records])
        
        return self
    
    def predict(self, X):
        item_ids = X[:,1]
        predictions = []
        
        for item_id in item_ids:
            if item_id in self.item_to_idx:
                predictions.append(self.item_avg[self.item_to_idx[item_id]])
            else:
                predictions.append(self.global_avg)
        
        return np.array(predictions)

In [11]:
# Grid search with corrected ItemAvg class
hp_grid = {'min_data': [0,1,2,3,10,100]}

myitem = ItemAvg()
gs_item = GridSearchCV(estimator=myitem, 
                        param_grid=hp_grid, 
                        cv=5, 
                        scoring='neg_root_mean_squared_error')
gs_item.fit(X_train, y_train)

y_pred = gs_item.predict(X_test)
submission = pd.DataFrame({'Id': test_data[:,0], 'rating': y_pred})
submission.to_csv('submission.csv', index=False)

In [12]:
# Fit the model and make predictions
gs_item.fit(X_train, y_train)
y_pred = gs_item.predict(X_test)

# Save y_pred in submission.csv with column Id and rating
submission = pd.DataFrame({'Id': test_data[:,0], 'rating': y_pred})
submission.to_csv('submission.csv', index=False)


In [13]:
gs_item.best_params_

{'min_data': 0}