# **Introduction**

The dataset consists of reviews for products from Amazon. Separate training and testing data files have been provided. The goal of this implementation is to build a recommender system from the training data and use the resulting model to generate a predicted rating for each user-item pair in the test data.



---



# **Loading libraries used in this code file**

## **Installing Libraries**

The default installation of NumPy was not compatible with the Surprise library, so NumPy has to be downgraded to version 1.24.4.

In [None]:
!pip uninstall numpy
!pip install numpy==1.24.4

Scikit-surprise : Used to run SVD Model

In [None]:
!pip install scikit-surprise

Installing the torch and lightgbm libraries

In [None]:
!pip install torch
!pip install lightgbm

## **Importing Libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Dataset, Reader, SVD, accuracy, BaselineOnly, KNNBasic, NMF, SlopeOne, NormalPredictor
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, make_scorer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import time

# **Loading the Data Files**

In [None]:
# Read the provided training and test files from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# **Exploratory Data Analysis**

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

## **Count of Ratings**

In [None]:
# Histogram of the rating values found in the training data.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(
    train_df['rating'],
    bins=5,
    kde=False,
    color='steelblue',
    edgecolor='black',
    ax=ax,
)
ax.set_title("Rating Distribution")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

## **Sparsity Calculation**

In [None]:
# Dimensions of the user x product rating matrix.
n_users = train_df['user_id'].nunique()
n_items = train_df['product_id'].nunique()
n_ratings = len(train_df)

# Sparsity is the share of (user, product) cells that hold no rating.
sparsity = 1.0 - (n_ratings / (n_users * n_items))

for label, value in (
    ("Number of Users", n_users),
    ("Number of Products", n_items),
    ("Total Ratings", n_ratings),
):
    print(f"{label}: {value}")
print(f"Matrix Sparsity: {sparsity:.4f}")

## **Votes vs Helpful Votes Relation**

In [None]:
# Scatter plot of total votes against helpful votes for every review.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=train_df, x='votes', y='helpful_votes', alpha=0.5, ax=ax)
ax.set_title("Votes vs Helpful Votes")
ax.set_xlabel("Votes")
ax.set_ylabel("Helpful Votes")
fig.tight_layout()
plt.show()

# Correlation between the two vote columns, read off the 2x2 correlation matrix.
vote_corr = train_df[['votes', 'helpful_votes']].corr()
correlation = vote_corr.loc['votes', 'helpful_votes']
print(f"Correlation between Votes and Helpful Votes: {correlation:.3f}")

## **Average Rating across helpful votes**

In [None]:
# Keep only reviews that received at least one vote, so the ratio is defined.
filtered = train_df[train_df['votes'] > 0].copy()
filtered['helpfulness_ratio'] = filtered['helpful_votes'] / filtered['votes']

# Bin the ratio into four equal-width ranges.
# include_lowest=True keeps reviews whose ratio is exactly 0 in the first bin;
# pd.cut's default right-closed intervals would otherwise turn them into NaN
# and they would silently drop out of the averages below.
filtered['helpfulness_bin'] = pd.cut(
    filtered['helpfulness_ratio'],
    bins=[0, 0.25, 0.5, 0.75, 1.0],
    include_lowest=True,
)

# Average rating per helpfulness bin.
# `observed` is spelled out because the grouping key is categorical.
avg_rating_by_helpfulness = (
    filtered.groupby('helpfulness_bin', observed=False)['rating']
    .mean()
    .reset_index()
)

plt.figure(figsize=(8, 4))
sns.barplot(x='helpfulness_bin', y='rating', data=avg_rating_by_helpfulness, palette='Blues')
plt.title("Average Rating by Helpfulness Ratio")
plt.xlabel("Helpfulness Ratio Bin")
plt.ylabel("Average Rating")
plt.tight_layout()
plt.show()

# **Data Preprocessing**

In [None]:
# Keep a single row per (user, product) pair.
train_df = train_df.drop_duplicates(subset=['user_id', 'product_id'])

In [None]:
# Confirm no missing values
print(train_df.isnull().sum())

# Ensure ratings are numeric and in expected range
print(train_df['rating'].describe())

# Drop any rating outside the 1-5 scale.
# .copy() detaches the result from the original frame, so the columns added to
# train_df later in the notebook do not raise SettingWithCopyWarning.
train_df = train_df[train_df['rating'].between(1, 5)].copy()

# **Data Modelling**

## **Data Preparation**

In [None]:
# Tell Surprise the bounds of the rating scale.
reader = Reader(rating_scale=(1, 5))

# Build the Surprise Dataset from the user, item and rating columns (in that order).
rating_columns = ['user_id', 'product_id', 'rating']
data = Dataset.load_from_df(train_df[rating_columns], reader)

# Hold out 20% of the ratings for validation; the seed keeps the split repeatable.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Build a Surprise Trainset from ALL ratings in `data` (nothing held out).
# It therefore also contains the ratings in `test`; use it only for final models
# that produce submissions, not for models scored on `test`.
trainset = data.build_full_trainset()

In [None]:
# Summary counts after preprocessing.
for noun, column in (("users", 'user_id'), ("products", 'product_id')):
    print(f"Number of {noun}: {train_df[column].nunique()}")
print(f"Number of ratings: {len(train_df)}")



---



## **NormalPredictor**

In [None]:
npstart = time.time()

# Fit on the training split only. `trainset` is built from every rating, including
# the ones held out in `test`, so fitting on it would leak validation data into the
# model and make the RMSE below incomparable with the other models.
model = NormalPredictor()
model.fit(train)

npend = time.time()
# Make predictions on the held-out validation split
predictions = model.test(test)

# Compute RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE on validation set: {rmse:.4f}")

print(f"Training Time: {npend-npstart} seconds")



---



## **Baseline Model**

### **BaselineOnly Model**

In [None]:
# Time the construction and fitting of a default BaselineOnly model on the training split.
bslstart = time.time()
bsl_model = BaselineOnly()
bsl_model.fit(train)
bslend = time.time()

# Score it on the held-out validation split.
bsl_preds = bsl_model.test(test)
bsl_rmse = accuracy.rmse(bsl_preds)

bsl_elapsed = bslend - bslstart
print(f"BaselineOnly RMSE: {bsl_rmse:.4f}")
print(f"Execution Time: {bsl_elapsed} seconds")

### **BaselineOnly Hyperparameter Tuning using Grid search with Cross Validation**

In [None]:
# Candidate baseline options; every combination is evaluated with 3-fold CV.
# NOTE(review): per the Surprise docs, ALS reads `reg_u`/`reg_i` rather than
# `reg`/`learning_rate`, so the ALS combinations presumably differ only in
# `n_epochs` — confirm.
bsl_option_grid = {
    'method': ['sgd', 'als'],
    'learning_rate': [0.001, 0.005],
    'n_epochs': [10, 20],
    'reg': [0.02, 0.05, 0.1],
}
param_grid = {'bsl_options': bsl_option_grid}

gs_bsl = GridSearchCV(
    BaselineOnly,
    param_grid,
    measures=['rmse'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=1,
)
gs_bsl.fit(data)

In [None]:
# Report the best cross-validated RMSE and the options that produced it.
best_bsl_rmse = gs_bsl.best_score['rmse']
best_bsl_params = gs_bsl.best_params['rmse']
print(f"Best RMSE: {best_bsl_rmse:.4f}")
print("Best Params:", best_bsl_params)

In [None]:
tbslstart = time.time()
# Take the estimator that scored best on RMSE in the grid search
# and fit it on the full training data.
tuned_bsl = gs_bsl.best_estimator['rmse']
tuned_bsl.fit(trainset)
tbslend = time.time()

# Estimate a rating for every (user, product) pair in the test file.
bsl_preds = []
for uid, iid in zip(test_df['user_id'], test_df['product_id']):
    bsl_preds.append(tuned_bsl.predict(uid, iid).est)

# Write the submission file, keeping the ratings inside the 1-5 scale.
test_df['rating'] = np.clip(bsl_preds, 1, 5)
test_df[['ID', 'rating']].to_csv("baseline_tuned.csv", index=False)

tbsl_elapsed = tbslend - tbslstart
print(f"Execution Time: {tbsl_elapsed} seconds")



---



## **Matrix Factorization Models**

### **SVD (Singular Value Decomposition)**

In [None]:
# SVD Model Training
# SVD initialises its factors randomly; fixing random_state makes the fitted model
# and the RMSE below repeatable from one run to the next (the data split above is
# already seeded with the same value).
svdstart = time.time()
svd_model = SVD(random_state=42)
svd_model.fit(train)
svdend = time.time()

# Score on the held-out validation split
predictions = svd_model.test(test)
accuracy.rmse(predictions)
print(f"Execution Time: {svdend-svdstart} seconds")

In [None]:
# Estimate a rating for every (user, product) pair in the test file.
user_item_pairs = zip(test_df['user_id'], test_df['product_id'])
predicted_ratings = [svd_model.predict(uid, iid).est for uid, iid in user_item_pairs]

# Write the submission file.
test_df['rating'] = predicted_ratings
submission = test_df[['ID', 'rating']]
submission.to_csv("Base_SVD.csv", index=False)

### **Fine tuning SVD**

In [None]:
# Candidate SVD hyper-parameters; every combination is evaluated with 3-fold CV.
param_grid = dict(
    n_factors=[50, 100, 150],
    lr_all=[0.005, 0.007, 0.01],
    reg_all=[0.02, 0.05, 0.08],
)

gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=1,
)
gs.fit(data)

# Report the winning configuration.
best_svd_rmse = gs.best_score['rmse']
best_svd_params = gs.best_params['rmse']
print("Best RMSE:", best_svd_rmse)
print("Best Parameters:", best_svd_params)

In [None]:
# Hyper-parameters of the tuned SVD.
# NOTE(review): presumably copied from the grid-search output — confirm they
# still match gs.best_params['rmse'].
svd_params = {'n_factors': 150, 'lr_all': 0.01, 'reg_all': 0.05}

# Validation score: fit on the training split only. A model fitted on `trainset`
# has already seen the ratings in `test` (trainset is built from all the data),
# so scoring that model on `test` would report an over-optimistic RMSE.
val_svd = SVD(random_state=42, **svd_params)
val_svd.fit(train)
predictions = val_svd.test(test)
accuracy.rmse(predictions)

# Final model used for the submission: refit on every available rating.
tuned_svd = SVD(random_state=42, **svd_params)
tsvd = time.time()
tuned_svd.fit(trainset)
tsvdend = time.time()
print(f"Execution Time: {tsvdend-tsvd} seconds")

In [None]:
# Estimate a rating for every (user, product) pair in the test file.
test_preds = [
    tuned_svd.predict(uid, iid).est
    for uid, iid in zip(test_df['user_id'], test_df['product_id'])
]

# Clip to valid rating range.
# The estimates are kept continuous: flooring the raw predictions afterwards would
# overwrite the clipped column and push every estimate downwards.
test_df['rating'] = np.clip(test_preds, 1, 5)

In [None]:
# Write the tuned-SVD submission (ID and predicted rating only).
test_df[['ID', 'rating']].to_csv("tuned_svd.csv", index=False)



---



## **Ensemble**

In [None]:
# Weighted blend of the tuned SVD and the BaselineOnly model
def ensemble_predict(uid, iid, w_svd=0.6, w_bsl=0.4):
    """Return the weighted sum of the SVD and BaselineOnly estimates for one pair.

    Args:
        uid: raw user id, passed to each model's ``predict``.
        iid: raw product id, passed to each model's ``predict``.
        w_svd: weight of the ``tuned_svd`` estimate (default 0.6, i.e. 60%).
        w_bsl: weight of the ``bsl_model`` estimate (default 0.4, i.e. 40%).

    Returns:
        ``w_svd * svd_estimate + w_bsl * baseline_estimate``.
    """
    # NOTE(review): this blends the untuned `bsl_model`, not `tuned_bsl` — confirm intended.
    svd_estimate = tuned_svd.predict(uid, iid).est
    baseline_estimate = bsl_model.predict(uid, iid).est
    return w_svd * svd_estimate + w_bsl * baseline_estimate

ensstart = time.time()
# Blend the two models for every (user, product) pair in the test file.
ensemble_preds = list(map(ensemble_predict, test_df['user_id'], test_df['product_id']))
ensend = time.time()

# Write the submission file.
test_df['rating'] = ensemble_preds
test_df[['ID', 'rating']].to_csv("ensemble3.csv", index=False)

ens_elapsed = ensend - ensstart
print(f"Execution Time: {ens_elapsed} seconds")



---



# **Appendix**

The code blocks below were written to test additional models. Because they are computationally heavy, they never ran to completion. Their outputs were cleared, since they only contained the interrupt error, but the code itself has been kept intact.

## **Memory Based Collaborative Filtering (KNN)**

In [None]:
# Item-based cosine similarity (user_based=False compares products, not users).
sim_options = dict(name='cosine', user_based=False)

# Build and fit the KNNBasic model on the training split.
knn_model = KNNBasic(sim_options=sim_options)
knn_model.fit(train)

# Score on the held-out validation split.
knn_preds = knn_model.test(test)
knn_rmse = accuracy.rmse(knn_preds)
print(f"KNNBasic (Item-Based) RMSE: {knn_rmse:.4f}")

## **NMF (Non-Negative Matrix Factorization)**

In [None]:
# Default NMF model, fitted on the training split.
nmf_model = NMF()
nmf_model.fit(train)

# Score on the held-out validation split.
nmf_preds = nmf_model.test(test)
nmf_rmse = accuracy.rmse(nmf_preds)
print(f"NMF RMSE: {nmf_rmse:.4f}")

### **Hyperparameter Tuning (GridSearch)**

In [None]:
# Candidate NMF hyper-parameters for the grid search.
param_grid = dict(
    n_factors=[50, 100, 150],
    reg_pu=[0.02, 0.05],
    reg_qi=[0.02, 0.05],
    biased=[True, False],
)

In [None]:
# 3-fold cross-validated grid search over the NMF grid.
# n_jobs=-1 uses all cores; joblib_verbose=2 prints more detailed progress logs.
gs_nmf = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=3, n_jobs=-1, joblib_verbose=2)
gs_nmf.fit(data)

for label, per_measure in (
    ("Best RMSE:", gs_nmf.best_score),
    ("Best Parameters:", gs_nmf.best_params),
):
    print(label, per_measure['rmse'])

In [None]:
# Report the best cross-validated RMSE and the parameters that produced it.
best_nmf_rmse = gs_nmf.best_score['rmse']
best_nmf_params = gs_nmf.best_params['rmse']
print("Best NMF RMSE:", best_nmf_rmse)
print("Best NMF params:", best_nmf_params)



---



## **NCF**

In [None]:
# Give every distinct user and product a consecutive integer index (starting at 0)
# so they can be used as embedding-table rows.
unique_users = train_df['user_id'].unique()
unique_items = train_df['product_id'].unique()
user2idx = dict(zip(unique_users, range(len(unique_users))))
item2idx = dict(zip(unique_items, range(len(unique_items))))

# Attach the encoded indices to the training frame.
train_df['user_idx'] = train_df['user_id'].map(user2idx)
train_df['item_idx'] = train_df['product_id'].map(item2idx)

# PyTorch data pipeline.
# The bare name `Dataset` in this notebook is Surprise's Dataset (imported at the top),
# so the PyTorch base class is referenced through its full path.
class NCFDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset of (user index, item index, rating) triples.

    Args:
        df: frame with integer ``user_idx`` and ``item_idx`` columns and a
            numeric ``rating`` column.
    """

    def __init__(self, df):
        # Indices are stored as int64 (torch.long) and ratings as float32.
        self.users = torch.tensor(df['user_idx'].values, dtype=torch.long)
        self.items = torch.tensor(df['item_idx'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        """Number of ratings in the dataset."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return the ``(user, item, rating)`` tensors at position ``idx``."""
        return self.users[idx], self.items[idx], self.ratings[idx]

# Splitting Train and Validation Data (80/20).
# `train_test_split` in this notebook is Surprise's splitter, which expects a Surprise
# Dataset and cannot split a DataFrame, so the frame is split with pandas instead.
# Dropping by index assumes train_df's index labels are unique.
val_set = train_df.sample(frac=0.2, random_state=42)
train_set = train_df.drop(index=val_set.index)

# Train set
train_dataset = NCFDataset(train_set)

# Validation Set
val_dataset = NCFDataset(val_set)

# Wrapping in dataloaders for batch training (only the training batches are shuffled)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=512)

### **Model ready for training**

In [None]:
class NCF(nn.Module):
    """Neural collaborative filtering model: user/item embeddings fed to an MLP.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        emb_size: width of each embedding vector (default 50).
    """

    def __init__(self, n_users, n_items, emb_size=50):
        super().__init__()

        # Embedding Layers
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.item_emb = nn.Embedding(n_items, emb_size)

        # Fully Connected Neural Network (input is the concatenated user+item embedding)
        self.fc1 = nn.Linear(emb_size * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.out = nn.Linear(64, 1)

    def forward(self, user, item):
        """Predict one rating per (user, item) index pair.

        Args:
            user: long tensor of user indices.
            item: long tensor of item indices, same shape as ``user``.

        Returns:
            Tensor with the same shape as ``user`` holding the predicted ratings.
        """
        u = self.user_emb(user)
        i = self.item_emb(item)
        x = torch.cat([u, i], dim=-1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # Drop only the trailing size-1 output dimension. A bare squeeze() would also
        # remove the batch dimension when a batch holds a single sample.
        return self.out(x).squeeze(-1)

In [None]:
# Setting GPU or else use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch so weight initialisation and batch shuffling are repeatable
torch.manual_seed(42)

# Model Initialization
model = NCF(len(user2idx), len(item2idx)).to(device)

# Loss Function and Optimizer(Adam)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

n_train_samples = len(train_loader.dataset)

# Training loop
for epoch in range(10):   # Running the loop for 10 epochs
    model.train()
    squared_error_sum = 0.0
    for users, items, ratings in train_loader:
        users, items, ratings = users.to(device), items.to(device), ratings.to(device)
        preds = model(users, items)
        loss = criterion(preds, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # MSELoss returns the batch mean; multiply by the batch size to recover the
        # summed squared error so the epoch figure is an exact per-sample average.
        squared_error_sum += loss.item() * ratings.size(0)

    # RMSE = sqrt(mean squared error); the mean MSE loss alone is not an RMSE.
    train_rmse = (squared_error_sum / n_train_samples) ** 0.5
    print(f"Epoch {epoch+1}, Train RMSE: {train_rmse:.4f}")

In [None]:
# Switch to evaluation mode before scoring the validation split.
model.eval()

# Predictions and true ratings, collected batch by batch.
val_preds, val_targets = [], []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for users, items, ratings in val_loader:
        batch_out = model(users.to(device), items.to(device))
        val_preds.extend(batch_out.cpu().numpy())
        val_targets.extend(ratings.numpy())

# Root-mean-squared error over the whole validation split.
val_errors = np.array(val_preds) - np.array(val_targets)
val_rmse = np.sqrt(np.mean(val_errors ** 2))
print(f"Validation RMSE: {val_rmse:.4f}")



---



### **NCF with Dropout Regularization**

In [None]:
class NCF(nn.Module):
    """Smaller NCF variant with dropout: embeddings -> one hidden layer -> rating.

    Defining this class rebinds the name ``NCF``, replacing the earlier
    two-hidden-layer model of the same name for all later cells.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        emb_size: width of each embedding vector (default 16).
    """

    def __init__(self, n_users, n_items, emb_size=16):
        super().__init__()

        # Embedding layers
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.item_emb = nn.Embedding(n_items, emb_size)

        # Fully connected layer and dropout (30% of hidden units dropped while training)
        self.fc1 = nn.Linear(emb_size * 2, 64)
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(64, 1)

    def forward(self, user, item):
        """Predict one rating per (user, item) index pair.

        Args:
            user: long tensor of user indices.
            item: long tensor of item indices, same shape as ``user``.

        Returns:
            Tensor with the same shape as ``user`` holding the predicted ratings.
        """
        u = self.user_emb(user)
        i = self.item_emb(item)
        x = torch.cat([u, i], dim=-1)
        x = torch.relu(self.dropout(self.fc1(x)))

        # Output layer. Drop only the trailing size-1 dimension: a bare squeeze()
        # would also remove the batch dimension when a batch holds a single sample.
        return self.out(x).squeeze(-1)

In [None]:
# Seed PyTorch so weight initialisation, dropout masks and batch shuffling are repeatable
torch.manual_seed(42)

model = NCF(len(user2idx), len(item2idx)).to(device)

# Loss criteria and Optimizer (Adam with a small L2 penalty via weight_decay)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-5)

n_train_samples = len(train_loader.dataset)

# Training loop: 10 epochs
for epoch in range(10):
    model.train()
    squared_error_sum = 0.0
    for users, items, ratings in train_loader:
        users, items, ratings = users.to(device), items.to(device), ratings.to(device)

        # Forward Pass -> Compute Loss -> Backpropagation
        preds = model(users, items)
        loss = criterion(preds, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # MSELoss returns the batch mean; multiply by the batch size to recover the
        # summed squared error so the epoch figure is an exact per-sample average.
        squared_error_sum += loss.item() * ratings.size(0)

    # RMSE = sqrt(mean squared error); the mean MSE loss alone is not an RMSE.
    train_rmse = (squared_error_sum / n_train_samples) ** 0.5
    print(f"Epoch {epoch+1}, Train RMSE: {train_rmse:.4f}")

In [None]:
# Evaluation mode turns dropout off for scoring.
model.eval()

# Predictions and true ratings, collected batch by batch.
val_preds = []
val_targets = []
with torch.no_grad():
    # No gradients are tracked inside this block.
    for users, items, ratings in val_loader:
        users = users.to(device)
        items = items.to(device)
        preds = model(users, items).cpu().numpy()
        val_preds.extend(preds)
        val_targets.extend(ratings.numpy())

# Root-mean-squared error over the whole validation split.
squared_errors = (np.array(val_preds) - np.array(val_targets)) ** 2
val_rmse = np.sqrt(np.mean(squared_errors))
print(f"Validation RMSE: {val_rmse:.4f}")

In [None]:
# Encode the test IDs with the mappings learned from the training data.
# IDs that never appeared in training have no mapping and are filled with index 0.
# Index 0 is also a real training user/product, so unseen IDs reuse its embedding.
for id_col, idx_col, id_map in (
    ('user_id', 'user_idx', user2idx),
    ('product_id', 'item_idx', item2idx),
):
    test_df[idx_col] = test_df[id_col].map(id_map).fillna(0).astype(int)

In [None]:
model.eval()
# Testing Tensors
test_users = torch.tensor(test_df['user_idx'].values, dtype=torch.long).to(device)
test_items = torch.tensor(test_df['item_idx'].values, dtype=torch.long).to(device)

with torch.no_grad():
    # Predictions without gradients
    test_preds = model(test_users, test_items).cpu().numpy()

# Cold start: a user or product that never appeared in training has no embedding of
# its own (its index was filled with 0, which belongs to a real training user/product).
# Use the global mean training rating for those pairs instead of that embedding.
seen_user = test_df['user_id'].map(user2idx).notna()
seen_item = test_df['product_id'].map(item2idx).notna()
known_pair = (seen_user & seen_item).to_numpy()
global_mean_rating = train_df['rating'].mean()
test_preds = np.where(known_pair, test_preds, global_mean_rating)

# Keep every prediction inside the 1-5 rating scale
test_preds = np.clip(test_preds, 1, 5)

test_df['rating'] = test_preds

In [None]:
# Write the NCF submission file (ID and predicted rating only).
test_df[['ID', 'rating']].to_csv("ncf_tuned.csv", index=False)



---

