# Implementação do modelo NCF: GMF, MLP e NeuMF

In [7]:
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

from sklearn.model_selection import train_test_split

## Importação da base de dados MovieLens 100k

In [8]:
import os
import urllib.request
import zipfile

# Where the MovieLens 100k archive lives, and the local names used for it.
url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
dataset_dir = "ml-100k"
filename = "ml-100k.zip"

# The extracted folder acts as the "already downloaded" marker, so re-running
# this cell does not hit the network again.
if os.path.exists(dataset_dir):
    print(f"O dataset já existe na pasta '{dataset_dir}'.")
else:
    # Fetch the zip archive into the current working directory.
    print("Baixando o dataset MovieLens 100k...")
    urllib.request.urlretrieve(url, filename)

    # Unpack everything next to the notebook.
    print("Descompactando o arquivo...")
    with zipfile.ZipFile(filename, 'r') as archive:
        archive.extractall()

    # The archive is no longer needed once its contents are extracted.
    os.remove(filename)

    print(f"Dataset MovieLens 100k baixado e descompactado na pasta '{dataset_dir}'.")


Baixando o dataset MovieLens 100k...
Descompactando o arquivo...
Dataset MovieLens 100k baixado e descompactado na pasta 'ml-100k'.


## Implementação das classes GMF e MLP

In [44]:
import torch
import torch.nn as nn

class GMF(nn.Module):
    """Generalized Matrix Factorization.

    Looks up one embedding per user and per item, multiplies them
    element-wise, and maps the product through a linear layer followed by a
    sigmoid.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        # Explicit base-class call (not super()): NeuMF in this notebook
        # invokes GMF.__init__ directly under multiple inheritance.
        nn.Module.__init__(self)
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # Prediction head (author's note: comment this out when GMF is used
        # inside NeuMF, which needs the raw element-wise product instead).
        self.output = nn.Linear(embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

        # Replace the default embedding init with a narrow Gaussian.
        for table in (self.user_embedding, self.item_embedding):
            nn.init.normal_(table.weight, std=0.01)

    def forward(self, user, item):
        """Return the sigmoid of the linear head applied to the Hadamard product.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor with a trailing dimension of size 1 holding values in (0, 1).
        """
        # Hadamard (element-wise) product of the two embeddings.
        interaction = self.user_embedding(user) * self.item_embedding(item)

        # Prediction head (author's note: comment this out when using NeuMF).
        return self.sigmoid(self.output(interaction))


class MLP(nn.Module):
    """Multi-layer perceptron over concatenated user/item embeddings.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Size of each embedding vector; the first linear layer
            therefore takes ``2 * embedding_dim`` inputs.
        hidden_layers: Output width of each Linear+ReLU stage, in order.
    """

    def __init__(self, num_users, num_items, embedding_dim, hidden_layers=[64, 32, 16, 8]):
        # Explicit base-class call (not super()): NeuMF in this notebook
        # invokes MLP.__init__ directly under multiple inheritance.
        nn.Module.__init__(self)
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # Replace the default embedding init with a narrow Gaussian.
        for table in (self.user_embedding, self.item_embedding):
            nn.init.normal_(table.weight, std=0.01)

        # Widths of every stage boundary: input width first, then each hidden width.
        widths = [embedding_dim * 2, *hidden_layers]
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        self.mlp_layers = nn.Sequential(*stages)

    def forward(self, user, item):
        """Return the activations of the last hidden stage.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor whose last dimension equals the final entry of
            ``hidden_layers`` (no output/sigmoid head is applied here).
        """
        joined = torch.cat(
            [self.user_embedding(user), self.item_embedding(item)],
            dim=-1,
        )
        return self.mlp_layers(joined)


## Implementação da classe NeuMF

In [40]:
class NeuMF(nn.Module):
    """Neural Matrix Factorization: a GMF branch and an MLP branch fused by one linear head.

    The model owns all of its layers and keeps a separate pair of embedding
    tables for each branch (``gmf_*`` and ``mlp_*``). The earlier version
    inherited from both GMF and MLP, whose constructors each re-ran
    ``nn.Module.__init__`` and wrote to the same ``user_embedding`` /
    ``item_embedding`` attributes; the GMF submodules were therefore wiped and
    the ``gmf_*`` / ``mlp_*`` attributes used for pretrained loading never
    existed.

    Args:
        num_users: Number of rows in each user embedding table.
        num_items: Number of rows in each item embedding table.
        embedding_dim: Size of every embedding vector (both branches).
        hidden_layers: Output width of each Linear+ReLU stage of the MLP
            branch, in order. May be empty, in which case the MLP branch
            passes the concatenated embeddings through unchanged.
        pretrained_gmf: Optional object exposing ``user_embedding`` and
            ``item_embedding`` modules whose weights are copied into the GMF
            branch. Shapes must match this model's tables.
        pretrained_mlp: Optional object exposing ``user_embedding``,
            ``item_embedding`` and ``mlp_layers``; their weights are copied
            into the MLP branch. Shapes/layer layout must match.
    """

    def __init__(self, num_users, num_items, embedding_dim, hidden_layers=[64, 32, 16, 8], pretrained_gmf=None, pretrained_mlp=None):
        super().__init__()
        # Work on a private copy so the shared default list is never aliased.
        hidden_layers = list(hidden_layers)

        # GMF branch embeddings.
        self.gmf_user_embedding = nn.Embedding(num_users, embedding_dim)
        self.gmf_item_embedding = nn.Embedding(num_items, embedding_dim)

        # MLP branch embeddings (deliberately independent of the GMF ones).
        self.mlp_user_embedding = nn.Embedding(num_users, embedding_dim)
        self.mlp_item_embedding = nn.Embedding(num_items, embedding_dim)

        for table in (
            self.gmf_user_embedding,
            self.gmf_item_embedding,
            self.mlp_user_embedding,
            self.mlp_item_embedding,
        ):
            nn.init.normal_(table.weight, std=0.01)

        # MLP tower: Linear+ReLU per requested hidden width.
        layers = []
        input_size = embedding_dim * 2
        for hidden_size in hidden_layers:
            layers.append(nn.Linear(input_size, hidden_size))
            layers.append(nn.ReLU())
            input_size = hidden_size
        self.mlp_layers = nn.Sequential(*layers)

        # Final head over [GMF product | MLP output]. ``input_size`` is the
        # MLP output width, which also covers the empty-hidden_layers case.
        self.output_layer = nn.Linear(embedding_dim + input_size, 1)
        self.sigmoid = nn.Sigmoid()

        # Optional warm start from separately trained branches. ``is not None``
        # avoids relying on the truthiness of an arbitrary module object.
        if pretrained_gmf is not None:
            with torch.no_grad():
                self.gmf_user_embedding.weight.copy_(pretrained_gmf.user_embedding.weight)
                self.gmf_item_embedding.weight.copy_(pretrained_gmf.item_embedding.weight)
        if pretrained_mlp is not None:
            with torch.no_grad():
                self.mlp_user_embedding.weight.copy_(pretrained_mlp.user_embedding.weight)
                self.mlp_item_embedding.weight.copy_(pretrained_mlp.item_embedding.weight)
            self.mlp_layers.load_state_dict(pretrained_mlp.mlp_layers.state_dict())

    def forward(self, user, item):
        """Return the predicted interaction probability for each (user, item) pair.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor with a trailing dimension of size 1 holding values in (0, 1).
        """
        # GMF branch: element-wise (Hadamard) product of its own embeddings.
        gmf_vector = self.gmf_user_embedding(user) * self.gmf_item_embedding(item)

        # MLP branch: concatenated embeddings through the Linear+ReLU tower.
        mlp_input = torch.cat(
            [self.mlp_user_embedding(user), self.mlp_item_embedding(item)],
            dim=-1,
        )
        mlp_vector = self.mlp_layers(mlp_input)

        # Fuse both branches and score.
        combined = torch.cat([gmf_vector, mlp_vector], dim=-1)
        return self.sigmoid(self.output_layer(combined))


## Carregando os dados

In [29]:
# Read the raw interactions (tab-separated, no header row). Adjust the path
# if u.data was saved somewhere else.
df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# The timestamp is not used by any model in this notebook.
df = df.drop(columns='timestamp')

# Re-encode ids as consecutive integer codes so they can index nn.Embedding.
for id_column in ('user_id', 'item_id'):
    df[id_column] = df[id_column].astype('category').cat.codes.values

# Binarize the target: 1 when the rating is 4 or higher, otherwise 0.
df['rating'] = (df['rating'] >= 4).astype(int)

# Hold out 20% of the interactions for testing (fixed split seed).
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


def _as_tensors(frame):
    """Return (users, items, ratings) tensors for one split.

    Ids are converted to int64 (required for embedding lookups) and ratings
    to float32 (required by BCELoss).
    """
    return (
        torch.tensor(frame['user_id'].values, dtype=torch.long),
        torch.tensor(frame['item_id'].values, dtype=torch.long),
        torch.tensor(frame['rating'].values).float(),
    )


train_users, train_items, train_ratings = _as_tensors(train_data)
test_users, test_items, test_ratings = _as_tensors(test_data)

# Mini-batch iterator over the training split, reshuffled every epoch.
train_dataset = data.TensorDataset(train_users, train_items, train_ratings)
train_loader = data.DataLoader(train_dataset, shuffle=True, batch_size=256)

## Treinamento do modelo

In [46]:
# Hyperparameters
num_users = df['user_id'].nunique()
num_items = df['item_id'].nunique()
embedding_dim = 20  # Size of each embedding vector
hidden_layers = [64, 32, 16, 8]  # Hidden widths of the MLP branch
learning_rate = 0.001
num_epochs = 20
seed = 42

# Seed torch so weight initialisation and DataLoader shuffling are
# reproducible across Restart & Run All.
torch.manual_seed(seed)

# To train NeuMF instead, swap the instantiation below for:
#model = NeuMF(num_users, num_items, embedding_dim, hidden_layers)

model = GMF(num_users, num_items, embedding_dim)

# Optimizer and loss. BCELoss expects probabilities, which the model's
# sigmoid output provides.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCELoss()

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0

    for user, item, rating in train_loader:
        optimizer.zero_grad()

        # Forward pass. squeeze(-1) drops only the trailing size-1 dimension;
        # a bare squeeze() would also collapse the batch axis for a batch of
        # size 1 and make the prediction/target shapes disagree.
        prediction = model(user, item)
        loss = criterion(prediction.squeeze(-1), rating)

        # Backpropagation and weight update
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Reported value is the mean of the per-batch losses.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}')


Epoch 1/20, Loss: 0.6917
Epoch 2/20, Loss: 0.6415
Epoch 3/20, Loss: 0.5670
Epoch 4/20, Loss: 0.5319
Epoch 5/20, Loss: 0.5137
Epoch 6/20, Loss: 0.4999
Epoch 7/20, Loss: 0.4876
Epoch 8/20, Loss: 0.4758
Epoch 9/20, Loss: 0.4636
Epoch 10/20, Loss: 0.4509
Epoch 11/20, Loss: 0.4371
Epoch 12/20, Loss: 0.4222
Epoch 13/20, Loss: 0.4062
Epoch 14/20, Loss: 0.3894
Epoch 15/20, Loss: 0.3723
Epoch 16/20, Loss: 0.3551
Epoch 17/20, Loss: 0.3387
Epoch 18/20, Loss: 0.3227
Epoch 19/20, Loss: 0.3082
Epoch 20/20, Loss: 0.2945


## Avaliação no conjunto de teste

In [48]:
# Evaluate on the held-out split in a single forward pass, without tracking
# gradients.
model.eval()
with torch.no_grad():
    test_prediction = model(test_users, test_items)
    # squeeze(-1) removes only the trailing size-1 dimension, so the shapes
    # still match the targets even when the test split has a single row.
    test_loss = criterion(test_prediction.squeeze(-1), test_ratings)
    print(f'Test Loss: {test_loss.item():.4f}')

Test Loss: 0.6902
