# Team composed by:
- Agostara Nicolò
- Fratti Giorgio
- Fusillo Antonio
- Protti Edoardo

### Import libraries

In [1]:
import math
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import time
from utility import Rating_Datset
from neumf import NeuMF

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

## Setting variables and functions

### PATHs

In [2]:
PATH = "./ml-100k/u.data" 
MODEL_PATH = "./models_2/" 

### Hyperparameters

In [11]:
args = {
    "seed": 42,
    "lr": 0.001,
    "dropout": 0.2,
    "batch_size": 256,
    "epochs": 25,
    "top_k": 10,
    "num_factors": 32,
    "layers": (32, 16, 8),
    "out": True,
    "num_ng": 4,
    "num_ng_test": 100
}

### HIT RATE

In [4]:
def hit(ng_item, pred_items):
    return 1 if ng_item in pred_items else 0

def metrics(model, test_loader, top_k, device):
    HR = []

    for user, item, label in test_loader:
        user = user.to(device)
        item = item.to(device)

        predictions = model(user, item)
        _, indices = torch.topk(predictions, top_k)
        recommends = torch.take(item, indices).cpu().numpy().tolist()

        ng_item = item[0].item()  # leave one-out evaluation has only one item per user
        HR.append(hit(ng_item, recommends))

    return np.mean(HR)

### Dataset Preprocessing

In [5]:

class NCF_Data(object):
	"""
	Construct Dataset for NCF
	"""
	def __init__(self, ratings):
		self.ratings = ratings
		self.num_ng = args["num_ng"]
		self.num_ng_test = args["num_ng_test"]
		self.batch_size = args["batch_size"]

		self.preprocess_ratings = self._reindex(self.ratings)

		self.user_pool = set(self.ratings['user_id'].unique())
		self.item_pool = set(self.ratings['item_id'].unique())

		self.train_ratings, self.test_ratings = self._leave_one_out(self.preprocess_ratings)
		self.negatives = self._negative_sampling(self.preprocess_ratings)

	
	def _reindex(self, ratings):
		"""
		Process dataset to reindex userID and itemID, also set rating as binary feedback
		"""
		user_list = list(ratings['user_id'].drop_duplicates())
		user2id = {w: i for i, w in enumerate(user_list)}

		item_list = list(ratings['item_id'].drop_duplicates())
		item2id = {w: i for i, w in enumerate(item_list)}

		ratings['user_id'] = ratings['user_id'].apply(lambda x: user2id[x])
		ratings['item_id'] = ratings['item_id'].apply(lambda x: item2id[x])
		ratings['rating'] = ratings['rating'].apply(lambda x: float(x > 0))
		return ratings

	def _leave_one_out(self, ratings):
		"""
		leave-one-out evaluation protocol in paper https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf
		"""
		ratings['rank_latest'] = ratings.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)
		test = ratings.loc[ratings['rank_latest'] == 1]
		train = ratings.loc[ratings['rank_latest'] > 1]
		assert train['user_id'].nunique()==test['user_id'].nunique(), 'Not Match Train User with Test User'
		return train[['user_id', 'item_id', 'rating']], test[['user_id', 'item_id', 'rating']]

	def _negative_sampling(self, ratings):
		interact_status = (
			ratings.groupby('user_id')['item_id']
			.apply(set)
			.reset_index()
			.rename(columns={'item_id': 'interacted_items'}))
		interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
		interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, self.num_ng_test))
		return interact_status[['user_id', 'negative_items', 'negative_samples']]

	def get_train_instance(self):
		users, items, ratings = [], [], []
		train_ratings = pd.merge(self.train_ratings, self.negatives[['user_id', 'negative_items']], on='user_id')
		train_ratings['negatives'] = train_ratings['negative_items'].apply(lambda x: random.sample(x, self.num_ng))
		for row in train_ratings.itertuples():
			users.append(int(row.user_id))
			items.append(int(row.item_id))
			ratings.append(float(row.rating))
			for i in range(self.num_ng):
				users.append(int(row.user_id))
				items.append(int(row.negatives[i]))
				ratings.append(float(0))  # negative samples get 0 rating
		dataset = Rating_Datset(
			user_list=users,
			item_list=items,
			rating_list=ratings)
		return DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=1)

	def get_test_instance(self):
		users, items, ratings = [], [], []
		test_ratings = pd.merge(self.test_ratings, self.negatives[['user_id', 'negative_samples']], on='user_id')
		for row in test_ratings.itertuples():
			users.append(int(row.user_id))
			items.append(int(row.item_id))
			ratings.append(float(row.rating))
			for i in getattr(row, 'negative_samples'):
				users.append(int(row.user_id))
				items.append(int(i))
				ratings.append(float(0))
		dataset = Rating_Datset(
			user_list=users,
			item_list=items,
			rating_list=ratings)
		return DataLoader(dataset, batch_size=self.num_ng_test+1, shuffle=False, num_workers=1)

### Training Pipeline

In [6]:
def train_model(model: nn.Module, train_loader: DataLoader, test_loader: DataLoader):
    model = model.to(device)

    # Loss and optimizer
    loss_function = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=args["lr"])

    best_hr = 0
    
    # Train cycle
    for epoch in range(args["epochs"]+1):
        start_time = time.time()
        
        # Train step
        model.train()

        for user, item, label in train_loader:
            user = user.to(device)
            item = item.to(device)
            label = label.to(device)

            # Zero grad
            optimizer.zero_grad()
            
            # Prediction
            prediction = model(user, item)
            loss = loss_function(prediction, label)
            
            # Backpropagation
            loss.backward()
            optimizer.step()

        # Eval metrics
        model.eval()
        HR = metrics(model, test_loader, args["top_k"], device)

        # Print metrics and time elapsed
        elapsed_time = time.time() - start_time
        print(
            "Epoch {:03d} |".format(epoch)
            + " HR: {:.3f} |".format(np.mean(HR))
            + " time: "
            + time.strftime("%H: %M: %S", time.gmtime(elapsed_time)
            )
        )

        # If best model, save it
        if HR > best_hr:
            best_hr, best_epoch = HR, epoch
            if args["out"]:
                if not os.path.exists(MODEL_PATH):
                    os.mkdir(MODEL_PATH)
                torch.save(
                    model, "{}{}{}.pt".format(MODEL_PATH, model.__class__.__name__, model.num_factors)
                )


### LOAD DATA

In [7]:
# load data
ml_100k = pd.read_csv(
	PATH, 
	sep="\t", 
	names = ['user_id', 'item_id', 'rating', 'timestamp'], 
	engine='python')

# set the num_users, items
num_users = ml_100k['user_id'].nunique()+1
num_items = ml_100k['item_id'].nunique()+1

# construct the train and test datasets
data = NCF_Data(ml_100k)
train_loader = data.get_train_instance()
test_loader = data.get_test_instance()

since Python 3.9 and will be removed in a subsequent version.
  interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, self.num_ng_test))
since Python 3.9 and will be removed in a subsequent version.
  train_ratings['negatives'] = train_ratings['negative_items'].apply(lambda x: random.sample(x, self.num_ng))


## ---
### TASK 1: Train the GMF and MLP separately

In [8]:
# Load the best model from Assignment 1:
best = torch.load("models/NeuMF64_tuned.pt")

### GMF

#### Definition

In [9]:
gmf_factors = best.gmf_user_embed[0].weight.shape[1] # num_factors for GMF model
class GMF(nn.Module):
    def __init__(self, num_users, num_items):
        super(GMF, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.num_factors = gmf_factors

        self.embedding_user = nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.num_factors
        )
        self.embedding_item = nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.num_factors
        )

        self.affine_output = nn.Linear(in_features=self.num_factors, out_features=1)
        self.logistic = nn.Sigmoid()

    def forward(self, user_indices, item_indices):
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        element_product = torch.mul(user_embedding, item_embedding)
        logits = self.affine_output(element_product)
        rating = self.logistic(logits)
        return rating.squeeze()



#### Training and saving the GMF

In [None]:
gmf_model = GMF(num_users, num_items)
train_model(gmf_model, train_loader, test_loader)


### MLP

#### Definition

In [12]:
mlp_factors = best.mlp_user_embed[0].weight.shape[1] # num_factors for GMF model
class MLP(nn.Module):
    def __init__(self, num_users, num_items):
        super(MLP, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.num_factors = mlp_factors

        self.embedding_user = nn.Embedding(
            num_embeddings=num_users, embedding_dim=mlp_factors
        )
        self.embedding_item = nn.Embedding(
            num_embeddings=num_items, embedding_dim=mlp_factors
        )

        layer_sizes = args["layers"]
        layers = []
        layers.append(nn.Linear(mlp_factors * 2, layer_sizes[0]))
        layers.append(nn.ReLU())
        for in_size, out_size in zip(layer_sizes[:-1], layer_sizes[1:]):
            layers.append(nn.Linear(in_size, out_size))
            layers.append(nn.ReLU())
        self.mlp_fc = nn.Sequential(*layers)
        self.mlp_fc.add_module("affine", nn.Linear(layer_sizes[-1], 1))
        self.mlp_fc.add_module("logit", nn.Sigmoid())

    def forward(self, user_indices, item_indices):
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)
        rating = self.mlp_fc(vector)
        return rating.squeeze()


#### Training and saving the MLP

In [13]:
mlp_model = MLP(num_users, num_items)
train_model(mlp_model, train_loader, test_loader) # Stopped after 17 epochs to prevent overfitting

Epoch 000 | HR: 0.401 | time: 00: 00: 21
Epoch 001 | HR: 0.408 | time: 00: 00: 22
Epoch 002 | HR: 0.407 | time: 00: 00: 23
Epoch 003 | HR: 0.424 | time: 00: 00: 22
Epoch 004 | HR: 0.444 | time: 00: 00: 21
Epoch 005 | HR: 0.473 | time: 00: 00: 24
Epoch 006 | HR: 0.496 | time: 00: 00: 22
Epoch 007 | HR: 0.514 | time: 00: 00: 23
Epoch 008 | HR: 0.521 | time: 00: 00: 23
Epoch 009 | HR: 0.546 | time: 00: 00: 21
Epoch 010 | HR: 0.557 | time: 00: 00: 21
Epoch 011 | HR: 0.557 | time: 00: 00: 22
Epoch 012 | HR: 0.558 | time: 00: 00: 22
Epoch 013 | HR: 0.568 | time: 00: 00: 21
Epoch 014 | HR: 0.559 | time: 00: 00: 21
Epoch 015 | HR: 0.566 | time: 00: 00: 21
Epoch 016 | HR: 0.562 | time: 00: 00: 22
Epoch 017 | HR: 0.575 | time: 00: 00: 22
Epoch 018 | HR: 0.564 | time: 00: 00: 23
Epoch 019 | HR: 0.563 | time: 00: 00: 22
Epoch 020 | HR: 0.561 | time: 00: 00: 21
Epoch 021 | HR: 0.565 | time: 00: 00: 22
Epoch 022 | HR: 0.571 | time: 00: 00: 22
Epoch 023 | HR: 0.566 | time: 00: 00: 21
Epoch 024 | HR: 

### NeuMF

#### Definition

In [67]:
class NeuMF_class(nn.Module):
    def __init__(self, num_factors_gmf, num_factors_mlp, num_users, num_items):

        super(NeuMF_class, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.num_factors_gmf = gmf_factors
        self.num_factors_mlp = mlp_factors
        self.num_factors = max(num_factors_mlp, num_factors_gmf) #just for saving model name purpose

        # GMF component
        self.gmf_user_embed = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.num_factors_gmf)
        self.gmf_item_embed = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.num_factors_gmf)
        self.gmf_affine = nn.Linear(
            in_features=self.num_factors_gmf, out_features=self.num_factors_gmf, bias=True
        )

        # MLP component
        self.mlp_user_embed = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.num_factors_mlp)
        self.mlp_item_embed = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.num_factors_mlp)

        self.mlp_fc = nn.Sequential(
            #We have considered as possible num_factors [8, 16, 32, 64] so this structure works
            nn.Linear(2 * self.num_factors_mlp, self.num_factors_mlp),
            nn.ReLU(),

            nn.Linear(self.num_factors_mlp, int(self.num_factors_mlp / 2)),
            nn.ReLU(),

            nn.Linear(int(self.num_factors_mlp / 2), int(self.num_factors_mlp / 4)),  
            nn.ReLU()
        )

        # Combine models
        input_dim = self.num_factors_gmf + int(self.num_factors_mlp / 4)
        self.mixing_layers = nn.Sequential(
            nn.Linear(input_dim, int(input_dim / 2)),
            nn.ReLU(),

            nn.Linear(int(input_dim / 2), int(input_dim / 4)),
            nn.ReLU(),

            nn.Linear(int(input_dim / 4), 1),
            nn.Sigmoid(),
        )

    def forward(self, user_indices, item_indices):
        # GMF forward
        user_embedding_gmf = self.gmf_user_embed(user_indices)
        item_embedding_gmf = self.gmf_item_embed(item_indices)

        element_product = torch.mul(user_embedding_gmf, item_embedding_gmf)
        ratings_gmf = self.gmf_affine(element_product)

        # MLP forward
        user_embedding_mlp = self.mlp_user_embed(user_indices)
        item_embedding_mlp = self.mlp_item_embed(item_indices)

        vector = torch.cat((user_embedding_mlp, item_embedding_mlp), dim=-1)
        ratings_mlp = self.mlp_fc(vector)

        # Combine
        ratings = torch.cat((ratings_gmf, ratings_mlp), dim=1)
        return self.mixing_layers(ratings).squeeze()



In [73]:
neuMF_f = NeuMF_class(gmf_factors, mlp_factors, num_users, num_items)
neuMF_nof = NeuMF_class(gmf_factors, mlp_factors, num_users, num_items)


In [68]:
gmf_model.embedding_user.weight

Parameter containing:
tensor([[ 1.6831,  0.4672,  0.8025,  ..., -0.8291, -0.4639,  0.9350],
        [-0.0033, -0.4051, -0.8245,  ..., -2.2704,  0.6260,  0.5048],
        [ 0.8177, -0.9323, -1.4887,  ..., -0.5728,  1.4165,  0.0176],
        ...,
        [-0.7927,  0.9772,  1.5114,  ..., -1.5120,  1.2837,  0.6163],
        [-0.8549, -1.4636,  0.0792,  ..., -1.8420,  1.5156,  1.4550],
        [-0.1027, -1.4767, -1.4588,  ...,  1.4242,  1.0193,  1.3637]],
       requires_grad=True)

In [81]:
neuMF_f.gmf_user_embed.weight = gmf_model.embedding_user.weight
neuMF_f.gmf_user_embed.requires_grad_(False)
neuMF_f.gmf_item_embed.weight = gmf_model.embedding_item.weight
neuMF_f.gmf_item_embed.requires_grad_(False)


Embedding(1683, 8)

In [87]:
mlp_model.mlp_fc


Sequential(
  (0): Linear(in_features=128, out_features=32, bias=True)
  (1): Linear(in_features=32, out_features=16, bias=True)
  (2): ReLU()
  (3): Linear(in_features=16, out_features=8, bias=True)
  (4): ReLU()
  (affine): Linear(in_features=8, out_features=1, bias=True)
  (logit): Sigmoid()
)

In [84]:
neuMF_f.mlp_user_embed.weight = mlp_model.embedding_user.weight
neuMF_f.mlp_user_embed.requires_grad_(False)
neuMF_f.mlp_item_embed.weight = mlp_model.embedding_item.weight
neuMF_f.mlp_item_embed.requires_grad_(False)

Embedding(1683, 64)