In [100]:
import argparse
import os
import time
import re
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import torch
import torch.nn as nn
import torch.nn.functional as F
import bottleneck as bn
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [101]:
# Argument parser 설정
parser = argparse.ArgumentParser(description='PyTorch Conditional Variational Autoencoders for Collaborative Filtering')
parser.add_argument('--lr', type=float, default=1e-5,
                    help='initial learning rate')
parser.add_argument('--wd', type=float, default=1e-5,
                    help='weight decay coefficient')
parser.add_argument('--batch_size', type=int, default=500,
                    help='batch size')
parser.add_argument('--epochs', type=int, default=100,
                    help='upper epoch limit')
parser.add_argument('--total_anneal_steps', type=int, default=100000,
                    help='the total number of gradient updates for annealing')
parser.add_argument('--anneal_cap', type=float, default=0.5,
                    help='largest annealing parameter')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--log_interval', type=int, default=100, metavar='N',
                    help='report interval')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
args = parser.parse_args([])

In [102]:
train_df = pd.read_csv('/data/ephemeral/home/data/train/train_ratings.csv')

titles = pd.read_csv('/data/ephemeral/home/data/train/titles.tsv', sep='\t')
years = pd.read_csv('/data/ephemeral/home/data/train/years.tsv', sep='\t')
directors = pd.read_csv('/data/ephemeral/home/data/train/directors.tsv', sep='\t')
genres = pd.read_csv('/data/ephemeral/home/data/train/genres.tsv', sep='\t')
writers = pd.read_csv('/data/ephemeral/home/data/train/writers.tsv', sep='\t')

In [103]:
# 제목에서 연도 추출 함수 정의
def extract_year(title):
    match = re.search(r'\((\d{4})\)', title)
    if match:
        return int(match.group(1))
    else:
        return np.nan  # 연도를 찾지 못한 경우 NaN 반환

# 제목 데이터에서 연도 추출
titles['year_extracted'] = titles['title'].apply(extract_year)

# 기존 연도 데이터와 추출한 연도 데이터를 병합
years = years.merge(titles[['item', 'year_extracted']], on='item', how='left')
years['year'] = years['year'].fillna(years['year_extracted'])
years = years.drop(columns=['year_extracted'])


In [104]:
def reindex_column(data, column_name):
    """
    Reindex a column in the dataframe to ensure continuous indices starting from 0.

    Parameters:
    - data: pd.DataFrame, the input dataframe.
    - column_name: str, the column to reindex.

    Returns:
    - data: pd.DataFrame, the dataframe with reindexed column.
    - mapping_dict: dict, the original-to-new mapping dictionary.
    """
    # Create the mapping dictionary
    mapping_dict = {original_id: new_id for new_id, original_id in enumerate(data[column_name].unique())}

    # Apply the mapping to the dataframe
    data[column_name] = data[column_name].map(mapping_dict)

    return data, mapping_dict

In [105]:
# train_df에 인덱스 매핑 적용
original_df = train_df.copy()
train_df, usr2idx_dict = reindex_column(train_df, 'user')
train_df, item2idx_dict = reindex_column(train_df, 'item')

# 다른 데이터프레임에 아이템 인덱스 매핑 적용
genres['item'] = genres['item'].map(item2idx_dict)
titles['item'] = titles['item'].map(item2idx_dict)
years['item'] = years['item'].map(item2idx_dict)
directors['item'] = directors['item'].map(item2idx_dict)
writers['item'] = writers['item'].map(item2idx_dict)

# 학습 데이터에 없는 아이템 제거 및 타입 변환
dataframes = [genres, titles, years, directors, writers]
for df in dataframes:
    df.dropna(subset=['item'], inplace=True)
    df['item'] = df['item'].astype(int)

# **3. 상호작용 행렬 생성**

# csr_matrix 생성에 필요한 요소 추출
row = train_df['user'].values
col = train_df['item'].values
data = np.ones_like(row)

# 사용자 수와 아이템 수
num_users = train_df['user'].nunique()
num_items = train_df['item'].nunique()

# 상호작용 행렬 생성
interaction_matrix = csr_matrix((data, (row, col)), shape=(num_users, num_items))


In [106]:
def train_validation_split(interaction_matrix):
    """
    Split the interaction matrix into training and validation sets.
    The validation set will include both sequential and static items in a 1:1 ratio.
    
    Parameters:
    - interaction_matrix: csr_matrix, the full user-item interaction matrix.
    
    Returns:
    - train_matrix: csr_matrix, training set interactions.
    - validation_matrix: csr_matrix, validation set interactions.
    """
    train_rows, train_cols, train_data = [], [], []
    val_rows, val_cols, val_data = [], [], []
    
    for user in range(interaction_matrix.shape[0]):
        item_indices = interaction_matrix[user].nonzero()[1]
        timestamps = interaction_matrix[user].data  # 각 상호작용의 타임스탬프
        
        if len(item_indices) == 0:
            continue  # 상호작용이 없는 사용자 건너뜀
        
        # 아이템과 타임스탬프를 함께 정렬 (타임스탬프 기준)
        sorted_items = [x for _, x in sorted(zip(timestamps, item_indices))]
        
        num_items = len(sorted_items)
        num_val_items = max(2, int(0.2 * num_items))  # 최소 2개의 검증 아이템 (순차적, 정적)
        
        # Sequential 아이템: 마지막 아이템
        sequential_item = sorted_items[-1]
        
        # Static 아이템: 나머지 아이템 중에서 무작위로 선택
        remaining_items = sorted_items[:-1]
        if len(remaining_items) > 0:
            num_static_items = num_val_items // 2
            static_items = np.random.choice(remaining_items, size=num_static_items, replace=False).tolist()
        else:
            static_items = []
        
        # 검증 세트 아이템 (순차적 + 정적)
        val_items = [sequential_item] + static_items
        
        # 학습 세트 아이템
        train_items = list(set(sorted_items) - set(val_items))
        
        # 학습 세트 데이터 추가
        train_rows.extend([user] * len(train_items))
        train_cols.extend(train_items)
        train_data.extend([1] * len(train_items))
        
        # 검증 세트 데이터 추가
        val_rows.extend([user] * len(val_items))
        val_cols.extend(val_items)
        val_data.extend([1] * len(val_items))
    
    # CSR 행렬 생성
    train_matrix = csr_matrix((train_data, (train_rows, train_cols)), shape=interaction_matrix.shape)
    validation_matrix = csr_matrix((val_data, (val_rows, val_cols)), shape=interaction_matrix.shape)
    
    return train_matrix, validation_matrix

In [108]:
train_data, val_data = train_validation_split(interaction_matrix)


In [109]:
# 모든 아이템 ID 리스트 생성
all_items = pd.DataFrame({'item': np.arange(num_items)})

# 장르 처리
item_genres = genres.groupby('item')['genre'].apply(list).reset_index()
item_genres = all_items.merge(item_genres, on='item', how='left')
item_genres['genre'] = item_genres['genre'].apply(lambda x: x if isinstance(x, list) else [])
mlb_genres = MultiLabelBinarizer()
genre_matrix = mlb_genres.fit_transform(item_genres['genre'])

# 감독 처리
item_directors = directors.groupby('item')['director'].apply(list).reset_index()
item_directors = all_items.merge(item_directors, on='item', how='left')
item_directors['director'] = item_directors['director'].apply(lambda x: x if isinstance(x, list) else [])
mlb_directors = MultiLabelBinarizer()
director_matrix = mlb_directors.fit_transform(item_directors['director'])

# 작가 처리
item_writers = writers.groupby('item')['writer'].apply(list).reset_index()
item_writers = all_items.merge(item_writers, on='item', how='left')
item_writers['writer'] = item_writers['writer'].apply(lambda x: x if isinstance(x, list) else [])
mlb_writers = MultiLabelBinarizer()
writer_matrix = mlb_writers.fit_transform(item_writers['writer'])

# 연도 처리
item_years = years[['item', 'year']]
item_years = all_items.merge(item_years, on='item', how='left')
# 결측값을 중간값으로 대체
median_year = item_years['year'].median()
item_years['year'] = item_years['year'].fillna(median_year)

In [110]:
# 스케일링
scaler_year = StandardScaler()
item_years['year_scaled'] = scaler_year.fit_transform(item_years[['year']])
year_matrix = item_years[['year_scaled']].values

# **6. 특징 행렬 결합**

# 특징 행렬 연결
item_feature_matrix = np.hstack([
    genre_matrix,
    director_matrix,
    writer_matrix,
    year_matrix
])

In [111]:
# **7. 조건부 변수 딕셔너리 생성**

# 아이템별 조건부 변수 딕셔너리 생성
item_conditional = {item: torch.FloatTensor(features) for item, features in zip(all_items['item'], item_feature_matrix)}

# 조건부 변수의 차원 및 기본값 설정
cond_dim = item_feature_matrix.shape[1]
default_cond = torch.zeros(cond_dim)


In [112]:
# CVAE 모델 정의
class MultiCVAE(nn.Module):
    """
    Container module for Multi-CVAE.

    Multi-CVAE : Conditional Variational Autoencoder with Multinomial Likelihood

    """
    def __init__(self, p_dims, q_dims=None, cond_dim=0, dropout=0.5):
        super(MultiCVAE, self).__init__()
        self.p_dims = p_dims
        self.cond_dim = cond_dim  # 조건부 변수의 차원

        if q_dims:
            assert q_dims[0] == p_dims[-1], "Input and output dimensions must equal to each other"
            assert q_dims[-1] == p_dims[0], "Latent dimension for p- and q- network mismatches."
            self.q_dims = q_dims
        else:
            self.q_dims = p_dims[::-1]

        # 인코더 레이어 설정
        temp_q_dims = [self.q_dims[0] + cond_dim] + self.q_dims[1:-1] + [self.q_dims[-1] * 2]
        self.q_layers = nn.ModuleList()
        for d_in, d_out in zip(temp_q_dims[:-1], temp_q_dims[1:]):
            self.q_layers.append(nn.Linear(d_in, d_out))

        # 디코더 레이어 설정
        temp_p_dims = [self.p_dims[0]] + self.p_dims[1:]
        self.p_layers = nn.ModuleList()
        for i, (d_in, d_out) in enumerate(zip(temp_p_dims[:-1], temp_p_dims[1:])):
            if i > 0:
                d_in += cond_dim  # 조건부 변수의 차원을 입력 차원에 더해줌
            self.p_layers.append(nn.Linear(d_in, d_out))

        self.drop = nn.Dropout(dropout)
        self.init_weights()

    def forward(self, input, cond):
        mu, logvar = self.encode(input, cond)
        z = self.reparameterize(mu, logvar)
        return self.decode(z, cond), mu, logvar

    def encode(self, input, cond):
        h = F.normalize(input)
        h = self.drop(h)
        h = torch.cat([h, cond], dim=1)  # 조건부 변수와 결합

        for i, layer in enumerate(self.q_layers):
            h = layer(h)
            if i != len(self.q_layers) - 1:
                h = F.tanh(h)
            else:
                mu = h[:, :self.q_dims[-1]]
                logvar = h[:, self.q_dims[-1]:]
        return mu, logvar

    def reparameterize(self, mu, logvar):
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return mu + eps * std
        else:
            return mu

    def decode(self, z, cond):
        h = z
        for i, layer in enumerate(self.p_layers):
            if i > 0:
                h = torch.cat([h, cond], dim=1)  # 조건부 변수와 결합
            h = layer(h)
            if i != len(self.p_layers) - 1:
                h = F.tanh(h)
        return h

    def init_weights(self):
        for layer in self.q_layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0 / (fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)

        for layer in self.p_layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0 / (fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)

In [113]:
def loss_function_vae(recon_x, x, mu, logvar, anneal=1.0):
    BCE = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
    KLD = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))

    return BCE + anneal * KLD

In [114]:
def naive_sparse2tensor(data):
    return torch.FloatTensor(data.toarray())

In [115]:
def get_conditional_variable(data_batch):
    """
    각 사용자에 대한 조건부 변수를 생성합니다.
    사용자별로 본 아이템들의 조건부 변수 평균을 계산합니다.
    """
    cond_list = []
    for user_interactions in data_batch:
        # user_interactions는 scipy.sparse의 행 벡터입니다.
        item_indices = user_interactions.nonzero()[1]  # 열 인덱스를 가져옵니다.
        item_indices = item_indices.tolist()
        item_conds = [item_conditional.get(item, default_cond) for item in item_indices]
        if item_conds:
            cond = torch.stack(item_conds).mean(dim=0)
        else:
            cond = default_cond
        cond_list.append(cond)
    cond_tensor = torch.stack(cond_list)
    return cond_tensor

In [116]:
def train(model, criterion, optimizer, train_data, device, args):
    """
    Train the model on the given training data.

    Parameters:
    - model: PyTorch model
    - criterion: Loss function
    - optimizer: Optimizer for training
    - train_data: csr_matrix, training data
    - device: PyTorch device (e.g., 'cuda' or 'cpu')
    - args: Arguments containing hyperparameters

    Returns:
    - train_loss: Average training loss over all batches
    """
    # Turn on training mode
    model.train()
    train_loss = 0.0
    start_time = time.time()
    global update_count

    for batch_idx, start_idx in enumerate(range(0, train_data.shape[0], args.batch_size)):
        end_idx = min(start_idx + args.batch_size, train_data.shape[0])
        data = train_data[start_idx:end_idx]
        data_tensor = naive_sparse2tensor(data).to(device)
        optimizer.zero_grad()

        # 조건부 변수 생성
        cond = get_conditional_variable(data)
        cond = cond.to(device)

        if args.total_anneal_steps > 0:
            anneal = min(args.anneal_cap, 1. * update_count / args.total_anneal_steps)
        else:
            anneal = args.anneal_cap

        # Forward pass and loss computation
        recon_batch, mu, logvar = model(data_tensor, cond)
        loss = loss_function_vae(recon_batch, data_tensor, mu, logvar, anneal)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        update_count += 1

        # Log progress
        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:4d}/{:4d} batches | ms/batch {:4.2f} | '
                  'loss {:4.2f}'.format(
                      epoch, batch_idx, len(range(0, train_data.shape[0], args.batch_size)),
                      elapsed * 1000 / args.log_interval,
                      train_loss / args.log_interval))
            start_time = time.time()
            train_loss = 0.0

    # Return average training loss
    return train_loss / len(range(0, train_data.shape[0], args.batch_size))

In [117]:
def evaluate(model, train_data, validation_data, loss_function, device, top_k=[10], anneal_cap=0.2, total_anneal_steps=20000):
    """
    Evaluate the Multi-CVAE model on validation data.

    Parameters:
    - model: Multi-CVAE model instance
    - train_data: csr_matrix, user-item interactions in the training set
    - validation_data: csr_matrix, user-item interactions in the validation set
    - loss_function: Loss function for VAE (e.g., loss_function_vae)
    - device: PyTorch device (e.g., 'cuda' or 'cpu')
    - top_k: List of k values for Recall@k and NDCG@k
    - anneal_cap: Maximum annealing factor for KL divergence
    - total_anneal_steps: Total steps for annealing KL divergence weight

    Returns:
    - avg_loss: Average evaluation loss
    - metrics: Dictionary containing Recall@k and NDCG@k for each k
    """
    model.eval()
    total_loss = 0.0
    update_count = 0
    anneal = 0.0
    recall_results = {k: [] for k in top_k}
    ndcg_results = {k: [] for k in top_k}

    num_users = validation_data.shape[0]

    with torch.no_grad():
        for start_idx in range(0, num_users, args.batch_size):
            end_idx = min(start_idx + args.batch_size, num_users)
            data_batch = train_data[start_idx:end_idx]
            data_tensor = naive_sparse2tensor(data_batch).to(device)
            val_batch = validation_data[start_idx:end_idx]
            val_tensor = naive_sparse2tensor(val_batch).to(device)

            # 조건부 변수 생성
            cond = get_conditional_variable(data_batch)
            cond = cond.to(device)

            # Annealing factor for KL divergence
            if total_anneal_steps > 0:
                anneal = min(anneal_cap, 1.0 * update_count / total_anneal_steps)

            # Forward pass
            recon_batch, mu, logvar = model(data_tensor, cond)

            # Compute loss (using validation data)
            loss = loss_function(recon_batch, val_tensor, mu, logvar, anneal)
            total_loss += loss.item()

            # Exclude training interactions from recommendations
            recon_batch = recon_batch.cpu().numpy()
            data_batch = data_batch.toarray()
            recon_batch[data_batch.nonzero()] = -np.inf  # Mask training items

            val_tensor = val_tensor.cpu().numpy()

            # Compute metrics for top_k
            for k in top_k:
                recall = Recall_at_k_batch(recon_batch, val_tensor, k)
                ndcg = NDCG_binary_at_k_batch(recon_batch, val_tensor, k)
                recall_results[k].extend(recall)
                ndcg_results[k].extend(ndcg)

            update_count += 1

    # Compute average loss and metrics
    avg_loss = total_loss / num_users
    metrics = {
        f"Recall@{k}": np.mean(recall_results[k]) for k in top_k
    }
    metrics.update({
        f"NDCG@{k}": np.mean(ndcg_results[k]) for k in top_k
    })

    print(f"Evaluation Loss: {avg_loss:.4f}")
    for k in top_k:
        print(f"Recall@{k}: {metrics[f'Recall@{k}']:.4f}, NDCG@{k}: {metrics[f'NDCG@{k}']:.4f}")

    return avg_loss, metrics

In [118]:
def Recall_at_k_batch(X_pred, heldout_batch, k=10):
    """
    Compute Recall@k for binary relevance.

    Parameters:
    - X_pred: numpy.ndarray, predicted scores for all items
    - heldout_batch: numpy.ndarray or csr_matrix, true interactions for each user
    - k: int, cutoff for Recall@k

    Returns:
    - recall: numpy.ndarray, Recall@k for each user in the batch
    """
    batch_users = X_pred.shape[0]
    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    # Convert heldout_batch to dense array if it's a sparse matrix
    if isinstance(heldout_batch, np.ndarray):
        X_true_binary = heldout_batch > 0
    else:
        X_true_binary = (heldout_batch > 0).toarray()

    # Compute Recall@k
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall

In [119]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=10):
    """
    Compute Normalized Discounted Cumulative Gain@k for binary relevance.

    Parameters:
    - X_pred: numpy.ndarray, predicted scores for all items
    - heldout_batch: numpy.ndarray or csr_matrix, true interactions for each user
    - k: int, cutoff for NDCG@k

    Returns:
    - ndcg: numpy.ndarray, NDCG@k for each user in the batch
    """
    batch_users = X_pred.shape[0]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)

    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    # Convert heldout_batch to dense array if it's a sparse matrix
    if isinstance(heldout_batch, np.ndarray):
        X_true_binary = heldout_batch > 0
    else:
        X_true_binary = (heldout_batch > 0).toarray()

    # Compute DCG
    tp = 1. / np.log2(np.arange(2, k + 2))
    DCG = (X_true_binary[np.arange(batch_users)[:, np.newaxis], idx_topk] * tp).sum(axis=1)

    # Compute IDCG
    IDCG = np.array([tp[:min(n, k)].sum() for n in X_true_binary.sum(axis=1)])
    ndcg = DCG / IDCG
    ndcg[np.isnan(ndcg)] = 0.0  # Handle NaN for users with no interactions
    return ndcg

In [120]:
N = train_data.shape[0]

# Hyperparameters
learning_rate = 1e-3  # 학습률 증가
anneal_cap = 0.5
total_anneal_steps = 10000
epochs = args.epochs

# args 객체에 하이퍼파라미터 업데이트
args.lr = learning_rate
args.anneal_cap = anneal_cap
args.total_anneal_steps = total_anneal_steps
args.batch_size = 500  # 필요한 경우 배치 크기 설정

# Initialize model, optimizer, and loss function
input_dim = train_data.shape[1]
latent_dim = 800
p_dims = [latent_dim, 800, input_dim]
cond_dim = item_feature_matrix.shape[1]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultiCVAE(p_dims, cond_dim=cond_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# 학습률 스케줄러 설정 (중복 정의 제거)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)  # 5 에포크마다 학습률 50% 감소

# Early Stopping 관련 변수 초기화
best_val_loss = float('inf')
patience = 3
trigger_times = 0

update_count = 0  # 업데이트 횟수 초기화


In [121]:
for epoch in range(1, epochs + 1):
    print(f"Epoch {epoch}/{epochs}")
    
    # 훈련 단계
    train_loss = train(model, loss_function_vae, optimizer, train_data, device, args)
    print(f"Training Loss: {train_loss:.4f}")

    # 검증 단계
    val_loss, metrics = evaluate(model, train_data, val_data, loss_function_vae, device, top_k=[10])
    print(f"Validation Loss: {val_loss:.4f}")
    for k in [10]:
        print(f"Recall@{k}: {metrics[f'Recall@{k}']:.4f}, NDCG@{k}: {metrics[f'NDCG@{k}']:.4f}")
    
    # 학습률 스케줄러 업데이트
    scheduler.step()

    # Early Stopping 체크
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print("Best model saved.")
    else:
        trigger_times += 1
        print(f"Validation loss did not improve for {trigger_times} epoch(s).")
        
        if trigger_times >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

Epoch 1/100
Training Loss: 1155.6392
Evaluation Loss: 0.2658
Recall@10: 0.1431, NDCG@10: 0.1577
Validation Loss: 0.2658
Recall@10: 0.1431, NDCG@10: 0.1577
Best model saved.
Epoch 2/100
Training Loss: 1097.0309
Evaluation Loss: 0.2588
Recall@10: 0.1695, NDCG@10: 0.1844
Validation Loss: 0.2588
Recall@10: 0.1695, NDCG@10: 0.1844
Best model saved.
Epoch 3/100
Training Loss: 1074.0266
Evaluation Loss: 0.2546
Recall@10: 0.1906, NDCG@10: 0.2062
Validation Loss: 0.2546
Recall@10: 0.1906, NDCG@10: 0.2062
Best model saved.
Epoch 4/100
Training Loss: 1059.8747
Evaluation Loss: 0.2524
Recall@10: 0.1959, NDCG@10: 0.2094
Validation Loss: 0.2524
Recall@10: 0.1959, NDCG@10: 0.2094
Best model saved.
Epoch 5/100
Training Loss: 1051.1633
Evaluation Loss: 0.2511
Recall@10: 0.2018, NDCG@10: 0.2147
Validation Loss: 0.2511
Recall@10: 0.2018, NDCG@10: 0.2147
Best model saved.
Epoch 6/100
Training Loss: 1045.7224
Evaluation Loss: 0.2502
Recall@10: 0.2120, NDCG@10: 0.2284
Validation Loss: 0.2502
Recall@10: 0.21

In [122]:
# 1. 최적 모델 로드
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

# 2. 전체 사용자 데이터 준비
num_users = train_df['user'].nunique()
num_items = train_df['item'].nunique()

row = train_df['user'].values
col = train_df['item'].values
data = np.ones_like(row)
full_interaction_matrix = csr_matrix((data, (row, col)), shape=(num_users, num_items))

# 3. 조건부 변수 생성
def get_conditional_variable_all_users(data_matrix):
    cond_list = []
    num_users = data_matrix.shape[0]
    for idx in range(num_users):
        user_interactions = data_matrix.getrow(idx)
        item_indices = user_interactions.nonzero()[1].tolist()
        item_conds = [item_conditional.get(item, default_cond) for item in item_indices]
        if item_conds:
            cond = torch.stack(item_conds).mean(dim=0)
        else:
            cond = default_cond
        cond_list.append(cond.numpy())
    cond_tensor = torch.FloatTensor(np.array(cond_list))
    return cond_tensor

user_conditional = get_conditional_variable_all_users(full_interaction_matrix)
user_conditional = user_conditional.to(device)

# 4. 예측 생성
model.eval()
with torch.no_grad():
    data_tensor = naive_sparse2tensor(full_interaction_matrix).to(device)
    batch_size = args.batch_size
    num_users = data_tensor.shape[0]
    all_reconstructions = []

    for start_idx in range(0, num_users, batch_size):
        end_idx = min(start_idx + batch_size, num_users)
        batch_data = data_tensor[start_idx:end_idx]
        batch_cond = user_conditional[start_idx:end_idx]
        recon_batch, mu, logvar = model(batch_data, batch_cond)
        recon_batch = recon_batch.cpu()
        all_reconstructions.append(recon_batch)

    reconstructions = torch.cat(all_reconstructions, dim=0)

# 5. 이미 본 아이템 제외
reconstructions = reconstructions.numpy()
data_array = full_interaction_matrix.toarray()
reconstructions[data_array.nonzero()] = -np.inf

# 6. 상위 N개 아이템 선택
top_N = 10
recommendations = np.argpartition(-reconstructions, top_N, axis=1)[:, :top_N]
sorted_recommendations = np.argsort(-reconstructions[np.arange(num_users)[:, None], recommendations], axis=1)
top_items = recommendations[np.arange(num_users)[:, None], sorted_recommendations]

# 7. 제출 파일 작성
users = np.repeat(np.arange(num_users), top_N)
items = top_items.flatten()

idx2usr_dict = {v: k for k, v in usr2idx_dict.items()}
idx2item_dict = {v: k for k, v in item2idx_dict.items()}

users = [idx2usr_dict[u] for u in users]
items = [idx2item_dict[i] for i in items]

submission_df = pd.DataFrame({'user': users, 'item': items})
submission_df.to_csv("/data/ephemeral/home/code/output/CVAE_1:1_submission.csv", index=False)