Data Preprocessing

In [1]:
import random
import numpy as np
import pandas as pd
import torch
from sklearn.utils import shuffle

def map_interest_rating(value):
    """Convert a continuous interest rate into a discrete relevance level.

    Mapping:
        value == 0           -> 0
        0   < value <= 1/3   -> 1
        1/3 < value <= 2/3   -> 2
        2/3 < value <= 1     -> 3
    Any value outside [0, 1] (including NaN) is returned unchanged.
    """
    if value == 0:
        return 0
    # Out-of-range inputs (negative, > 1, NaN) fall through untouched.
    if not (0 < value <= 1):
        return value
    # Walk the upper bounds of each bucket in ascending order; the first
    # bound that is not exceeded determines the relevance level.
    for level, upper_bound in enumerate((1/3, 2/3, 1), start=1):
        if value <= upper_bound:
            return level

class NFC_Data(object):
    """Organise a ratings table into train / test / validation sets.

    The input frame must provide the columns ``User_ID``, ``Group_ID`` and
    ``Interest_Rate``. Users and items are re-indexed to consecutive integers
    (in order of first appearance) and users that have between 10 and 20
    ratings (inclusive) are held out completely: half of those users go to the
    validation set and the other half to the test set. Every remaining row is
    used for training.
    """

    def __init__(self, args, ratings):
        """Build the splits.

        Args:
            args: mapping that must contain ``"seed"``, ``"batch_size"`` and
                ``"num_batchSize_test_valid"``.
            ratings: pandas DataFrame with ``User_ID``, ``Group_ID`` and
                ``Interest_Rate`` columns. The frame is copied, so the
                caller's object is left untouched.
        """
        random.seed(args["seed"])
        # Kept for backward compatibility; not used anywhere in this class.
        self.num_batchSize_test_valid = args["num_batchSize_test_valid"]
        self.batch_size = args["batch_size"]
        # Work on a private copy: _reindex rewrites the ID columns in place
        # and the caller's DataFrame should not change as a side effect.
        self.ratings = ratings.copy()
        self.preprocess_ratings = self._reindex(self.ratings)  # re-index users and items
        # Both pools hold the re-indexed ids (self.ratings has been re-indexed).
        self.user_pool = set(self.ratings['User_ID'].unique())
        self.item_pool = set(self.ratings['Group_ID'].unique())
        self.train_ratings, self.test_ratings, self.validation_ratings = self.split_data(self.preprocess_ratings)

    def _reindex(self, ratings):
        """Replace raw user/item ids by consecutive integers, in place.

        Ids are numbered in order of first appearance. The same (mutated)
        frame is returned.
        """
        user2id = {raw_id: new_id for new_id, raw_id in enumerate(ratings['User_ID'].unique())}
        item2id = {raw_id: new_id for new_id, raw_id in enumerate(ratings['Group_ID'].unique())}
        ratings['User_ID'] = ratings['User_ID'].map(user2id)
        ratings['Group_ID'] = ratings['Group_ID'].map(item2id)
        return ratings

    def _partition_by_activity(self, ratings, min_ratings=10, max_ratings=20):
        """Split rows into (train, test, validation) by per-user rating count.

        Users whose number of ratings lies in ``[min_ratings, max_ratings]``
        are held out; the shuffled list of those users is cut in half, the
        first half becoming validation users and the second half test users.
        The shuffle uses NumPy's global RNG, so seed it beforehand for a
        reproducible split. Independent copies are returned.
        """
        ratings_per_user = ratings.groupby('User_ID')['User_ID'].transform('size')
        held_out_mask = ratings_per_user.between(min_ratings, max_ratings)
        held_out = ratings[held_out_mask]

        held_out_users = held_out['User_ID'].unique()
        np.random.shuffle(held_out_users)
        half = len(held_out_users) // 2

        validation_data = held_out[held_out['User_ID'].isin(held_out_users[:half])].copy()
        test_data = held_out[held_out['User_ID'].isin(held_out_users[half:])].copy()
        # Drop the held-out rows instead of arithmetically subtracting the
        # frames: DataFrame.subtract aligns on the index and turns held-out
        # rows into all-zero rows, i.e. fake (user 0, item 0, rating 0)
        # training samples.
        train_data = ratings[~held_out_mask].copy()
        return train_data, test_data, validation_data

    def split_data(self, ratings):
        """Return ``(train, test, validation)`` frames.

        Each frame has the columns ``User_ID``, ``Group_ID`` and
        ``Interest_Rate``. For the test and validation frames the
        ``Interest_Rate`` values are passed through ``map_interest_rating``;
        the training frame keeps the original values.
        """
        train_data, test_data, validation_data = self._partition_by_activity(ratings)

        # re-evaluate rating of test and validation set because of measurement metrics
        test_data['Interest_Rate'] = test_data['Interest_Rate'].apply(map_interest_rating)
        validation_data['Interest_Rate'] = validation_data['Interest_Rate'].apply(map_interest_rating)

        columns = ['User_ID', 'Group_ID', 'Interest_Rate']
        return train_data[columns], test_data[columns], validation_data[columns]

    def get_train_instance(self):
        """Return a shuffling DataLoader over the training ratings."""
        frame = self.train_ratings
        dataset = Rating_Datset(
            user_list=frame['User_ID'].astype(int).tolist(),
            item_list=frame['Group_ID'].astype(int).tolist(),
            rating_list=frame['Interest_Rate'].astype(float).tolist(),
        )
        return torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def get_test_instance(self):
        """Return the test ratings DataFrame."""
        return self.test_ratings

    def get_validation_instance(self):
        """Return the validation ratings DataFrame."""
        return self.validation_ratings

class Rating_Datset(torch.utils.data.Dataset):
    """Map-style dataset over three parallel sequences: users, items, ratings."""

    def __init__(self, user_list, item_list, rating_list):
        super().__init__()
        self.user_list, self.item_list, self.rating_list = user_list, item_list, rating_list

    def __len__(self):
        # The dataset length is taken from the user sequence.
        return len(self.user_list)

    def __getitem__(self, idx):
        """Return ``(user, item, rating)`` at ``idx`` as (long, long, float) tensors."""
        user_tensor = torch.tensor(self.user_list[idx], dtype=torch.long)
        item_tensor = torch.tensor(self.item_list[idx], dtype=torch.long)
        rating_tensor = torch.tensor(self.rating_list[idx], dtype=torch.float)
        return user_tensor, item_tensor, rating_tensor

Neural Network Matrix Factorization Collaborative Filtering Model Building

In [2]:
import torch
import torch.nn as nn


# Custom RMSE Loss Function
class RMSELoss(torch.nn.Module):
    """Root-mean-square-error loss: the square root of ``torch.nn.MSELoss``."""

    def forward(self, y_pred, y_true):
        """Return ``sqrt(MSE(y_pred, y_true))`` as a scalar tensor."""
        mean_squared_error = torch.nn.MSELoss()(y_pred, y_true)
        return mean_squared_error.sqrt()

class Generalized_Matrix_Factorization(nn.Module):
    """Generalized matrix factorization (GMF) rating model.

    User and item embeddings of size ``args["factor_num"]`` are multiplied
    element-wise, projected to a single logit by a linear layer and squashed
    with a sigmoid.
    """

    def __init__(self, args, num_users, num_items):
        """
        Args:
            args: mapping containing ``"factor_num"`` (embedding size).
            num_users: number of rows of the user embedding table.
            num_items: number of rows of the item embedding table.
        """
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.factor_num = args["factor_num"]

        self.embedding_user = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.factor_num)
        self.embedding_item = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.factor_num)

        self.affine_output = nn.Linear(in_features=self.factor_num, out_features=1)
        self.logistic = nn.Sigmoid()

    def forward(self, user_indices, item_indices):
        """Return sigmoid scores with a trailing dimension of size 1.

        The method must be called ``forward`` so that ``model(users, items)``
        works; ``nn.Module.__call__`` dispatches to ``forward``.
        """
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        element_product = torch.mul(user_embedding, item_embedding)
        logits = self.affine_output(element_product)
        rating = self.logistic(logits)
        return rating

    # Backward-compatible alias for callers that used the former method name.
    _forward = forward

    def init_weight(self):
        """No custom initialisation; PyTorch's default initialisers are kept."""
        pass

class Multi_Layer_Perceptron(nn.Module):
    """MLP rating model over concatenated user and item embeddings.

    The concatenated embedding is fed through the fully connected stack
    described by ``args["layers"]`` (ReLU after every layer), projected to one
    logit and squashed with a sigmoid.
    """

    def __init__(self, args, num_users, num_items):
        """
        Args:
            args: mapping containing ``"layers"`` (sizes of the fully
                connected stack; ``layers[0]`` is the input width and must be
                even) and ``"factor_num"``.
            num_users: number of rows of the user embedding table.
            num_items: number of rows of the item embedding table.
        """
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.factor_num = args["factor_num"]
        self.layers = args["layers"]

        # The first Linear layer consumes the concatenation of one user and
        # one item embedding, so each embedding must be half of layers[0]
        # wide (same convention as NeuMF's MLP branch). Sizing them with
        # factor_num only works when factor_num * 2 == layers[0].
        embedding_dim = self.layers[0] // 2
        self.embedding_user = nn.Embedding(num_embeddings=self.num_users, embedding_dim=embedding_dim)
        self.embedding_item = nn.Embedding(num_embeddings=self.num_items, embedding_dim=embedding_dim)

        self.fc_layers = nn.ModuleList(
            nn.Linear(in_size, out_size)
            for in_size, out_size in zip(self.layers[:-1], self.layers[1:])
        )

        self.affine_output = nn.Linear(in_features=self.layers[-1], out_features=1)
        self.logistic = nn.Sigmoid()

    def forward(self, user_indices, item_indices):
        """Return sigmoid scores with a trailing dimension of size 1."""
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)  # the concat latent vector
        for layer in self.fc_layers:
            vector = torch.relu(layer(vector))
        logits = self.affine_output(vector)
        rating = self.logistic(logits)
        return rating

    def init_weight(self):
        """No custom initialisation; PyTorch's default initialisers are kept."""
        pass



class NeuMF(nn.Module):
    """Neural matrix factorization: a GMF branch fused with an MLP branch.

    * GMF branch: element-wise product of user/item embeddings of size
      ``args["factor_num"]``.
    * MLP branch: concatenated user/item embeddings of size
      ``args["layers"][0] // 2`` each, passed through Linear + ReLU layers.

    Both branch outputs are concatenated, projected to one logit and squashed
    with a sigmoid.
    """

    def __init__(self, args, num_users, num_items):
        """
        Args:
            args: mapping containing ``"factor_num"``, ``"layers"`` (sizes of
                the MLP stack; ``layers[0]`` must be even) and ``"dropout"``.
            num_users: number of rows of the user embedding tables.
            num_items: number of rows of the item embedding tables.
        """
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.factor_num_mf = args["factor_num"]
        self.factor_num_mlp = args["layers"][0] // 2
        self.layers = args["layers"]
        # NOTE(review): stored but never applied in forward(); no dropout
        # layer is created by this class.
        self.dropout = args["dropout"]

        self.embedding_user_mlp = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.factor_num_mlp)
        self.embedding_item_mlp = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.factor_num_mlp)

        self.embedding_user_mf = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.factor_num_mf)
        self.embedding_item_mf = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.factor_num_mf)

        # Alternating Linear / ReLU modules, applied sequentially in forward().
        self.fc_layers = nn.ModuleList()
        for in_size, out_size in zip(self.layers[:-1], self.layers[1:]):
            self.fc_layers.append(nn.Linear(in_size, out_size))
            self.fc_layers.append(nn.ReLU())

        self.affine_output = nn.Linear(in_features=self.layers[-1] + self.factor_num_mf, out_features=1)
        self.logistic = nn.Sigmoid()
        self.init_weight()

    def init_weight(self):
        """Initialise embeddings ~ N(0, 0.01), Linear weights with Xavier
        uniform, and every Linear bias with zeros."""
        nn.init.normal_(self.embedding_user_mlp.weight, std=0.01)
        nn.init.normal_(self.embedding_item_mlp.weight, std=0.01)
        nn.init.normal_(self.embedding_user_mf.weight, std=0.01)
        nn.init.normal_(self.embedding_item_mf.weight, std=0.01)

        for module in self.fc_layers:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)

        nn.init.xavier_uniform_(self.affine_output.weight)

        for module in self.modules():
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

    def forward(self, user_indices, item_indices):
        """Return sigmoid scores with the same shape as ``user_indices``."""
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)

        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)

        mlp_vector = torch.cat([user_embedding_mlp, item_embedding_mlp], dim=-1)  # the concat latent vector
        mf_vector = torch.mul(user_embedding_mf, item_embedding_mf)

        for layer in self.fc_layers:
            mlp_vector = layer(mlp_vector)

        vector = torch.cat([mlp_vector, mf_vector], dim=-1)
        logits = self.affine_output(vector)
        rating = self.logistic(logits)
        # Only drop the trailing singleton dimension. A bare squeeze() would
        # also collapse a batch of one into a 0-d tensor, which then no longer
        # matches a label tensor of shape (1,).
        return rating.squeeze(-1)

PyTorch Config

In [3]:
import os
import random
import numpy as np 
import torch

def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # every visible GPU, not only the current one
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick possibly different
    # algorithms from run to run, which defeats the purpose of seeding.
    torch.backends.cudnn.benchmark = False

Evaluation Metrics

In [4]:
import numpy as np
import torch

def RMSE(predictions, ground_truth_reate):
    """Return the root mean squared error between two array-likes.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape).
    """
    return np.sqrt(np.mean((predictions - ground_truth_reate) ** 2))

def dcg_at_k(r, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Computes ``r[0] + sum(r[i] / log2(i + 1) for i >= 1)`` over the first
    ``k`` entries (0-based ``i``), i.e. the first two positions are not
    discounted.

    Args:
        r: relevance scores in ranked order.
        k: number of leading entries to consider.

    Returns:
        The DCG value, or ``0.`` when there is nothing to score.
    """
    # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is the
    # equivalent spelling that works on old and new releases alike.
    gains = np.asarray(r, dtype=float)[:k]
    if not gains.size:
        return 0.
    discounts = np.log2(np.arange(2, gains.size + 1))
    return gains[0] + np.sum(gains[1:] / discounts)

def ndcg_at_k(r, ideal_rate_set, k):
    """Normalised DCG@k: ``DCG(r) / DCG(ideal_rate_set)``.

    ``ideal_rate_set`` is used in the order given (it is not sorted here).
    Returns ``0.`` when the ideal DCG is zero.
    """
    ideal_dcg = dcg_at_k(ideal_rate_set, k)
    return dcg_at_k(r, k) / ideal_dcg if ideal_dcg else 0.

def correlation_rank(predictions):
    """Spearman rank correlation between positions and predicted ranking.

    Position ``i`` of ``predictions`` is compared with the index found at
    rank ``i`` when the scores are sorted in descending order:
    ``1 - 6 * sum(d_i ** 2) / (n * (n ** 2 - 1))``. The result is 1.0 when
    the scores are already in descending order.

    Args:
        predictions: score tensor (any device); it is flattened, so an
            ``(N, 1)`` model output is accepted as well.

    Returns:
        The correlation coefficient, or NaN for fewer than two scores, where
        the coefficient is undefined.
    """
    scores = torch.as_tensor(predictions).detach().reshape(-1)
    n = scores.numel()
    if n < 2:
        # The denominator n * (n**2 - 1) is zero; return NaN explicitly
        # instead of triggering a divide-by-zero warning.
        return float("nan")
    _, indices = torch.topk(scores, n)
    # Move the whole index vector to the host once; building a NumPy array
    # from individual tensor elements fails for CUDA tensors.
    ranked_positions = indices.cpu().numpy()
    displacement = np.arange(n) - ranked_positions
    return 1 - (6 * np.sum(displacement ** 2)) / (n * (n ** 2 - 1))

def precision_at_10(indices, ground_truth_reate, relevance_threshold=2):
    """Fraction of the selected items whose true rating is relevant.

    Despite the name, the cut-off is simply ``len(indices)``.

    Args:
        indices: positions (into ``ground_truth_reate``) of the recommended
            items; a list, NumPy array or torch tensor.
        ground_truth_reate: true ratings, indexable by those positions.
        relevance_threshold: minimum rating counted as relevant (default 2).

    Returns:
        ``hits / len(indices)`` as a float, or ``0.0`` when ``indices`` is
        empty.
    """
    if hasattr(indices, "detach"):
        # torch tensor (possibly on a GPU): bring it to the host first.
        indices = indices.detach().cpu().numpy()
    positions = np.asarray(indices, dtype=int).reshape(-1)
    if positions.size == 0:
        return 0.0
    rates = np.asarray(ground_truth_reate)
    hits = np.count_nonzero(rates[positions] >= relevance_threshold)
    return hits / positions.size
    

def evaluate(model, data, top_k, device):
    """Compute per-user ranking metrics and return their means.

    For every user in ``data`` the model scores all items that user has
    rated; the scores are compared with the true ratings sorted in descending
    order.

    Args:
        model: callable ``model(user_tensor, item_tensor)`` returning one
            score per pair.
        data: DataFrame with ``User_ID``, ``Group_ID`` and ``Interest_Rate``
            columns.
        top_k: number of top-scored items used for NDCG and precision; it is
            clamped to the number of items the user has.
        device: torch device on which the model inputs are created.

    Returns:
        Tuple ``(mean NDCG, mean rank correlation, mean precision, mean RMSE)``.
    """
    NDCG, rank_correlation, p_10, rmse = [], [], [], []

    # Evaluation needs no autograd graph.
    with torch.no_grad():
        # groupby visits every user once instead of re-scanning the whole
        # frame with a boolean mask per user; sort=False keeps the order of
        # first appearance.
        for user_id, user_rows in data.groupby('User_ID', sort=False):
            # user's rows ordered from the highest to the lowest true rating
            ground_truth = user_rows.sort_values(by='Interest_Rate', ascending=False)
            true_rates = ground_truth['Interest_Rate'].to_numpy()

            # one (user, item) pair per item rated by this user
            itemsTensor = torch.as_tensor(ground_truth['Group_ID'].to_numpy(), dtype=torch.long, device=device)
            userTensor = torch.full((len(ground_truth),), int(user_id), dtype=torch.long, device=device)

            # Flatten and move to the host so the NumPy-based metrics below
            # work regardless of the device the model runs on.
            predictions = model(userTensor, itemsTensor).detach().reshape(-1).cpu()

            # A user may have fewer than top_k items; topk would raise.
            k = min(top_k, predictions.numel())
            _, indices = torch.topk(predictions, k)
            recommended = indices.tolist()
            rate_recommends = true_rates[recommended].tolist()

            rank_correlation.append(correlation_rank(predictions))

            # NOTE(review): predictions and Interest_Rate are compared as-is;
            # confirm that both are on the same scale for this metric.
            rmse.append(RMSE(predictions.numpy(), true_rates))

            p_10.append(precision_at_10(recommended, true_rates))

            # Use the same cut-off for the ranked list and the ideal list.
            NDCG.append(ndcg_at_k(rate_recommends, true_rates, top_k))

    return np.mean(NDCG), np.mean(rank_correlation), np.mean(p_10), np.mean(rmse)

Main Implementation

In [None]:
import os
import time
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from tensorboardX import SummaryWriter


# set device and parameters
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter()
args = {
    "seed": 42,
    "lr": 0.001,
    "dropout": 0.2,
    "batch_size": 102400,
    "epochs": 10,
    "top_k": 10,
    "factor_num": 64,
    "layers": [64, 32, 16, 8],
    "num_batchSize_test_valid": 100,
    "out": True,
}
# seed for reproducibility
seed_everything(args["seed"])

# load data
ml_1m = pd.read_csv('/kaggle/input/smart-postchin-data/Smart_Postchin_Data.csv')

# set the num_users, items (sizes of the embedding tables)
num_users = ml_1m['User_ID'].nunique()+1
num_items = ml_1m['Group_ID'].nunique()+1

# construct the train, validation and test datasets
# NOTE: this name shadows the `torch.utils.data as data` import of this cell.
data = NFC_Data(args, ml_1m)

# Only the train split is a DataLoader; validation and test are DataFrames.
train_loader = data.get_train_instance()
validation_loader = data.get_validation_instance()
test_loader = data.get_test_instance()


# set model and loss, optimizer
model = NeuMF(args, num_users, num_items)
model = model.to(device)
loss_function = RMSELoss()  # Using the custom RMSE loss function
optimizer = optim.Adam(model.parameters(), lr=args["lr"])

# train, evaluation
best_hr = 0  # not updated anywhere in this cell
global_step = 0  # counts optimisation steps across all epochs
try:
    for epoch in range(1, args["epochs"]+1):
        model.train()
        start_time = time.time()

        for user, item, label in train_loader:
            user = user.to(device)
            item = item.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            prediction = model(user, item)
            loss = loss_function(prediction, label)
            loss.backward()
            optimizer.step()
            # Log against the running step; using the epoch number would put
            # every batch of an epoch at the same x position.
            writer.add_scalar('loss/Train_loss', loss.item(), global_step)
            global_step += 1

        model.eval()
        NDCG, rank_correlation, p_10, rmse = evaluate(model, validation_loader, args["top_k"], device)

        elapsed_time = time.time() - start_time
        print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
              time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
        print("NDCG: {:.3f}\trank_correlation: {:.3f}\tP@10: {:.3f}\trmse: {:.3f}".format(NDCG, rank_correlation, p_10, rmse))
finally:
    # Flush and close the event file even when training is interrupted.
    writer.close()