In [1]:
import os
import numpy as np
import pandas as pd


# NCF

In [2]:
data_path='../../data/train'
train_df= pd.read_csv(os.path.join(data_path,'train_ratings.csv'))
train_df=train_df.drop(columns='time')
train_df['rating']=1

In [3]:
def sample_negative_items(data, seed, num_negative):
    """
    Parameters:
    - data (DataFrame): 전체 사용자-아이템 상호작용 데이터
    - seed (int): 랜덤 시드 값 (재현성을 위해 사용)
    - num_negative (int): 생성할 부정 샘플의 수

    Returns:
    - data_total (DataFrame): 부정 샘플이 포함된 전체 데이터프레임
    """
    rng = np.random.default_rng(seed)  # 난수 생성기를 시드와 함께 초기화(재현성 보장)
    items = set(data['item'].unique())  # 전체 아이템 집합
    total = []

    for user, group in data.groupby('user'):
        interacted_items = set(group['item'])  # 사용자가 이미 본 아이템 추출
        non_interacted_items = list(items - interacted_items)  # 사용자가 보지 않은 아이템 추출

        if len(non_interacted_items) < num_negative:
            sampled_items = non_interacted_items
        else:
            sampled_items = rng.choice(non_interacted_items, size=num_negative, replace=False)

        # 부정 샘플 데이터프레임 생성
        negative_samples = pd.DataFrame({
            'user': [user] * len(sampled_items),
            'item': sampled_items,
            'rating': [0] * len(sampled_items)  # 부정 샘플은 rating=0
        })

        total.append(negative_samples)

    # 원본 데이터와 부정 샘플을 합침
    data_total = pd.concat([data] + total, ignore_index=True)

    return data_total

In [4]:
data_total = sample_negative_items(train_df, 42, 50)

In [5]:
len(data_total)

6722471

In [6]:
data_path='../../data/train'
genres_df= pd.read_csv(os.path.join(data_path,'genres.tsv'),sep='\t')

In [7]:
genre2idx={}
for idx,genre in enumerate(set(genres_df['genre'])):
    genre2idx[genre]=idx+1

In [8]:
genres_df['genre']=genres_df['genre'].apply(lambda x: [genre2idx[x]])

In [9]:
item_lst=[]
group_lst=[]
for item,group in genres_df.groupby('item',sort=False):
    item_lst.append(item)
    group_lst.append(group['genre'].sum(axis=0))

In [10]:
A=pd.DataFrame(item_lst,columns=['item'])

In [11]:
B = pd.DataFrame({'genre': group_lst})

In [12]:
genre_df=pd.concat([A,B],axis=1)

In [13]:
data_total=pd.merge(data_total,genre_df,how='left',on='item')

In [14]:
data_total

Unnamed: 0,user,item,rating,genre
0,11,4643,1,"[10, 3, 12, 14]"
1,11,170,1,"[10, 3, 15, 2]"
2,11,531,1,"[18, 12]"
3,11,616,1,"[11, 18]"
4,11,2140,1,"[3, 13]"
...,...,...,...,...
6722466,138493,1840,0,[12]
6722467,138493,2533,0,"[10, 14]"
6722468,138493,91690,0,[9]
6722469,138493,8879,0,"[15, 4, 2]"


In [15]:
item2idx={}
user2idx={}
idx2item={}
idx2user={}
for idx,item in enumerate(np.sort(data_total['item'].unique())):
    item2idx[item]=idx
    idx2item[idx]=item
for idx,user in enumerate(np.sort(data_total['user'].unique())):
    user2idx[user]=idx
    idx2user[idx]=user
data_total['user']=data_total['user'].apply(lambda x: user2idx[x])
data_total['item']=data_total['item'].apply(lambda x: item2idx[x])


### NCF (deep) class

In [16]:
import torch
from torch import nn

class NCF_GM(nn.Module):
    def __init__(self, num_user, num_item, num_genres, embedding_dim=32, side_emb_dim=16, dropout_rate=0.2):
        super(NCF_GM, self).__init__()
        
        # Separate embeddings for NCF
        self.ncf_user_embedding = nn.Embedding(num_user, embedding_dim)
        self.ncf_item_embedding = nn.Embedding(num_item, embedding_dim)
        self.genre_embedding = nn.Embedding(num_genres, side_emb_dim, padding_idx=0)
        
        # Separate embeddings for GM
        self.gm_user_embedding = nn.Embedding(num_user, embedding_dim)
        self.gm_item_embedding = nn.Embedding(num_item, embedding_dim)
        
        # Fully connected layers for NCF
        self.fc1 = nn.Linear(2 * embedding_dim + side_emb_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        
        # Activation and dropout
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, user_ids, item_ids, genres):
        # ===== GM Component =====
        # Embed users and items for GM
        gm_user_emb = self.gm_user_embedding(user_ids)  # Shape: (batch_size, embedding_dim)
        gm_item_emb = self.gm_item_embedding(item_ids)  # Shape: (batch_size, embedding_dim)
        
        # Compute the interaction (dot product or generalized interaction)
        gm_output = torch.sum(gm_user_emb * gm_item_emb, dim=1, keepdim=True)  # Shape: (batch_size, 1)

        # ===== NCF Component =====
        # Embed users and items for NCF
        ncf_user_emb = self.ncf_user_embedding(user_ids)  # Shape: (batch_size, embedding_dim)
        ncf_item_emb = self.ncf_item_embedding(item_ids)  # Shape: (batch_size, embedding_dim)
        
        # Embed genres and aggregate (mean)
        genre_emb = self.genre_embedding(genres)  # Shape: (batch_size, num_genres, side_emb_dim)
        genre_emb = genre_emb.mean(dim=1)  # Shape: (batch_size, side_emb_dim)

        # Concatenate embeddings
        ncf_input = torch.cat([ncf_user_emb, ncf_item_emb, genre_emb], dim=-1)  # Shape: (batch_size, 2*embedding_dim + side_emb_dim)
        
        # Pass through MLP layers
        x = self.relu(self.bn1(self.fc1(ncf_input)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        ncf_output = self.fc4(x)  # Shape: (batch_size, 1)

        # ===== Combine GM and NCF Outputs =====
        # Combine GM and NCF predictions
        output = torch.sigmoid(gm_output + ncf_output)  # Combine and apply sigmoid activation
        
        return output



### NCF DataSet

In [17]:
import torch
from torch.utils.data import Dataset

class RecommendationDataset(Dataset):
    def __init__(self, data):
        """
        Args:
            data (pd.DataFrame): A pandas DataFrame containing user, item, rating, and genre columns.
        """
        self.user_ids = torch.tensor(data['user'].values, dtype=torch.long)
        self.item_ids = torch.tensor(data['item'].values, dtype=torch.long)
        self.ratings = torch.tensor(data['rating'].values, dtype=torch.float32)
        self.genres = data['genre'].apply(lambda x: torch.tensor(x, dtype=torch.long)).tolist()  # List of tensors

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        return {
            'user_id': self.user_ids[idx],
            'item_id': self.item_ids[idx],
            'rating': self.ratings[idx],
            'genre': self.genres[idx]  # A tensor of genre indices
        }
        

    

### NCF DataLoader

In [18]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """
    Custom collate function to pad genres and create a batch.
    Args:
        batch (list of dict): List of samples from the dataset.
    
    Returns:
        dict: A batch with padded genres.
    """
    user_ids = torch.stack([sample['user_id'] for sample in batch])
    item_ids = torch.stack([sample['item_id'] for sample in batch])
    ratings = torch.stack([sample['rating'] for sample in batch])

    # Pad the genre sequences
    genres = [sample['genre'] for sample in batch]  # List of genre tensors
    padded_genres = pad_sequence(genres, batch_first=True, padding_value=0)  # Padding with 0

    return {
        'user_id': user_ids,
        'item_id': item_ids,
        'rating': ratings,
        'genre': padded_genres
    }

In [19]:
len(genre2idx)

18

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Assume the dataset and model are defined
print('-----data loading------')
dataset = RecommendationDataset(data_total)  # Your dataset
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True, collate_fn=collate_fn, num_workers=4)

# Define the model
num_users = data_total['user'].nunique()
num_items = data_total['item'].nunique()
num_genres = len(genre2idx) +1   # Example: total number of unique genres + padding 0
embedding_dim = 32
side_emb_dim = 16
print('---model----')
model = NCF_GM(num_user=num_users, num_item=num_items, genre_item=num_genres, 
            embedding_dim=embedding_dim, side_emb_dim=side_emb_dim)

# Define loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for implicit feedback
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
num_epochs = 10
print('----train----')
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0
    
    for batch in tqdm(dataloader):
        # Move data to the same device as the model
        user_ids = batch['user_id'].to(device)
        item_ids = batch['item_id'].to(device)
        ratings = batch['rating'].to(device)  # Target values (1 for positive interaction, 0 for negative)
        genres = batch['genre'].to(device)   # Padded genre indices
    
        # Forward pass
        outputs = model(user_ids, item_ids, genres).squeeze()  # Predicted scores (sigmoid output)

        # Compute loss
        loss = criterion(outputs, ratings)

        # Backward pass
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()        # Compute gradients
        optimizer.step()       # Update weights
        
        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader):.4f}")

-----data loading------
---model----


NameError: name 'NCF' is not defined

In [None]:
negative_df = data_total[data_total['rating'] == 0]
prediction_dataset = RecommendationDataset(negative_df)
prediction_loader = DataLoader(prediction_dataset, batch_size=1024, collate_fn=collate_fn)

In [None]:
model.eval()
predictions = []

with torch.no_grad():
    for user_id in tqdm(data_total['user'].unique()):
        # 유저가 본 영화 제외
        seen_items = set(data_total[data_total['user'] == user_id]['item'])
        all_items = set(data_total['item'].unique())
        unseen_items = list(all_items - seen_items)
        
        # 안본 영화 리스트 만들어놓기기
        predict_df = pd.DataFrame({
            'user': [user_id] * len(unseen_items),
            'item': unseen_items,
            'rating': [0] * len(unseen_items)  # Dummy ratings
        })
        
        # 장르 merge
        predict_df = pd.merge(predict_df, genre_df, how='left', on='item')
        
        # 만약 nan값 있을 경우 빈 리스트로 대체(없을거지만 혹시나해서)
        predict_df['genre'] = predict_df['genre'].apply(lambda x: x if isinstance(x, list) else [])
        
        # dataset, loader 만들기
        prediction_dataset = RecommendationDataset(predict_df)
        prediction_loader = DataLoader(prediction_dataset, batch_size=1024, collate_fn=collate_fn)
        
        user_predictions = []
        for batch in prediction_loader:
            # device에 몽땅 올려놓기
            user_ids = batch['user_id'].to(device)
            item_ids = batch['item_id'].to(device)
            genres = batch['genre'].to(device)
            
            # 여기서 예측함
            outputs = model(user_ids, item_ids, genres).squeeze()  # score가 생김
            
            if outputs.dim() == 0:  # output이 scalar인 경우 예외처리(batch size랑 유저 사이즈가 1차이날때 에러생기는거같음)
                outputs = outputs.unsqueeze(0)
            
            user_predictions.extend(zip(item_ids.cpu().numpy(), outputs.cpu().numpy()))
        
        # 상위 10개 뽑아내서 유저에 붙이기
        user_predictions = sorted(user_predictions, key=lambda x: x[1], reverse=True)[:10]
        predictions.extend([(user_id, item_id, score) for item_id, score in user_predictions])

predictions_df = pd.DataFrame(predictions, columns=['user', 'item', 'score'])

In [None]:
predictions_df[predictions_df['user']==0].sort_values(by='score')

In [None]:
user_lst = predictions_df['user'].tolist()
item_lst = predictions_df['item'].tolist()

In [None]:
mapped_user_lst = list(map(lambda x: idx2user[x], user_lst))
mapped_item_lst = list(map(lambda x :idx2item[x],item_lst))

In [None]:
df = pd.DataFrame(list(zip(mapped_user_lst, mapped_item_lst)), columns=['user', 'item'])

In [None]:
df.to_csv('output.csv', index=False)