In [90]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader,Sampler
from torch.sparse import mm
from scipy.sparse import csr_matrix
import torch.nn.functional as F
import time
import argparse
import bottleneck as bn

In [4]:
torch.manual_seed(42)

<torch._C.Generator at 0x7f4f7a55fcf0>

In [5]:
!wget https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000339/data/data.tar.gz
!tar -xf data.tar.gz

--2024-11-15 14:04:40--  https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000339/data/data.tar.gz
Resolving aistages-api-public-prod.s3.amazonaws.com (aistages-api-public-prod.s3.amazonaws.com)... 52.219.202.15, 3.5.186.37, 52.219.146.2, ...
Connecting to aistages-api-public-prod.s3.amazonaws.com (aistages-api-public-prod.s3.amazonaws.com)|52.219.202.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33425907 (32M) [binary/octet-stream]
Saving to: ‘data.tar.gz’


2024-11-15 14:04:45 (9.53 MB/s) - ‘data.tar.gz’ saved [33425907/33425907]

tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.d

In [39]:
parser = argparse.ArgumentParser(description='PyTorch Variational Autoencoders for Collaborative Filtering')
parser.add_argument('--lr', type=float, default=1e-4,
                    help='initial learning rate')
parser.add_argument('--wd', type=float, default=0.00,
                    help='weight decay coefficient')
parser.add_argument('--batch_size', type=int, default=500,
                    help='batch size')
parser.add_argument('--epochs', type=int, default=20,
                    help='upper epoch limit')
parser.add_argument('--total_anneal_steps', type=int, default=200000,
                    help='the total number of gradient updates for annealing')
parser.add_argument('--anneal_cap', type=float, default=0.2,
                    help='largest annealing parameter')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--log_interval', type=int, default=100, metavar='N',
                    help='report interval')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
args = parser.parse_args([])


### 데이터셋 불러오기

In [6]:
data_path = ('data/')
train_df = pd.read_csv(os.path.join(data_path, 'train/train_ratings.csv'))


### 기본적인 정보 확인

In [7]:
train_df.shape

(5154471, 3)

In [8]:
train_df.head(5)

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563


In [9]:
train_df.isnull().sum()

Unnamed: 0,0
user,0
item,0
time,0


In [10]:
train_df['user'].nunique()

31360

In [11]:
train_df['item'].nunique()

6807

### 데이터 전처리

ALS 에 input으로 넣어줄 수 있게 csr matrix 형태로 바꿔줍니다

In [12]:
# reindexing function

def reindex_column(data, column_name):
    """
    Reindex a column in the dataframe to ensure continuous indices starting from 0.

    Parameters:
    - data: pd.DataFrame, the input dataframe.
    - column_name: str, the column to reindex.

    Returns:
    - data: pd.DataFrame, the dataframe with reindexed column.
    - mapping_dict: dict, the original-to-new mapping dictionary.
    """
    # Create the mapping dictionary
    mapping_dict = {original_id: new_id for new_id, original_id in enumerate(data[column_name].unique())}

    # Apply the mapping to the dataframe
    data[column_name] = data[column_name].map(mapping_dict)

    return data, mapping_dict


In [13]:
# index 조정
original_df = train_df.copy()
train_df,usr2idx_dict = reindex_column(train_df,'user')
train_df,item2idx_dict = reindex_column(train_df,'item')

In [43]:
# csr 구성하는 row column define
row,column = train_df['user'],train_df['item']
feedback = np.ones_like(row)

In [59]:
# csr matrix 생성
num_users,num_items = train_df['user'].nunique(),train_df['item'].nunique()

interaction_matrix = csr_matrix((feedback,(row,column)),dtype = 'float64',shape = (num_users,num_items))

In [63]:
interaction_matrix

<31360x6807 sparse matrix of type '<class 'numpy.float64'>'
	with 5154471 stored elements in Compressed Sparse Row format>

Train/val split

In [None]:
def train_validation_split(interaction_matrix, valid_ratio=0.1):
    """
    Split a CSR interaction matrix into training and validation sets with specific rules:
    - If valid_ratio == 0, all interactions are included in the training set.
    - Otherwise:
        - Last interaction is always included in the validation set.
        - Additional interactions are randomly selected (if available).
        - Validation set size is capped at valid_ratio of the user's total interactions.

    Parameters:
    - interaction_matrix: csr_matrix, the full user-item interaction matrix.
    - valid_ratio: float, fraction of interactions to include in the validation set.

    Returns:
    - train_matrix: csr_matrix, training set interactions.
    - validation_matrix: csr_matrix, validation set interactions.
    """
    if valid_ratio == 0:
        # Use the entire matrix as the training set and return an empty validation set
        train_matrix = interaction_matrix.copy()
        return train_matrix

    train_rows, train_cols, train_data = [], [], []
    val_rows, val_cols, val_data = [], [], []

    # Iterate over each user in the interaction matrix
    for user in range(interaction_matrix.shape[0]):
        # Get the non-zero interactions (item indices) for this user
        item_indices = interaction_matrix[user].nonzero()[1]
        num_items = len(item_indices)

        if num_items == 0:
            continue  # Skip users with no interactions

        # Determine the maximum validation set size (at least 1 item)
        max_val_items = max(1, int(valid_ratio * num_items))

        # Last interaction is always included in the validation set
        val_items = [item_indices[-1]]

        # Randomly sample additional items from the remaining interactions
        remaining_items = item_indices[:-1]
        if len(remaining_items) > 0:
            num_random_items = min(max_val_items - 1, len(remaining_items))
            random_items = np.random.choice(remaining_items, size=num_random_items, replace=False)
            val_items.extend(random_items)

        # Add the remaining items to the training set
        train_items = list(set(item_indices) - set(val_items))

        # Add training interactions
        train_rows.extend([user] * len(train_items))
        train_cols.extend(train_items)
        train_data.extend([1] * len(train_items))

        # Add validation interactions
        val_rows.extend([user] * len(val_items))
        val_cols.extend(val_items)
        val_data.extend([1] * len(val_items))

    # Create CSR matrices for training and validation
    train_matrix = csr_matrix((train_data, (train_rows, train_cols)), dtype='float64', shape=interaction_matrix.shape)
    validation_matrix = csr_matrix((val_data, (val_rows, val_cols)), dtype='float64', shape=interaction_matrix.shape)

    return train_matrix, validation_matrix

In [65]:
train_data,val_data = train_validation_split(interaction_matrix,valid_ratio=0.2)

In [67]:
val_data

<31360x6807 sparse matrix of type '<class 'numpy.float64'>'
	with 94080 stored elements in Compressed Sparse Row format>

### 모델정의

In [19]:
class MultiVAE(nn.Module):
    """
    Container module for Multi-VAE.

    Multi-VAE : Variational Autoencoder with Multinomial Likelihood

    """

    def __init__(self, p_dims, q_dims=None, dropout=0.5):
        super(MultiVAE, self).__init__()
        self.p_dims = p_dims
        if q_dims:
            assert q_dims[0] == p_dims[-1], "In and Out dimensions must equal to each other"
            assert q_dims[-1] == p_dims[0], "Latent dimension for p- and q- network mismatches."
            self.q_dims = q_dims
        else:
            self.q_dims = p_dims[::-1]

        # Last dimension of q- network is for mean and variance
        temp_q_dims = self.q_dims[:-1] + [self.q_dims[-1] * 2]
        self.q_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(temp_q_dims[:-1], temp_q_dims[1:])])
        self.p_layers = nn.ModuleList([nn.Linear(d_in, d_out) for
            d_in, d_out in zip(self.p_dims[:-1], self.p_dims[1:])])

        self.drop = nn.Dropout(dropout)
        self.init_weights()

    def forward(self, input):
        mu, logvar = self.encode(input)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    def encode(self, input):
        h = F.normalize(input)
        h = self.drop(h)

        for i, layer in enumerate(self.q_layers):
            h = layer(h)
            if i != len(self.q_layers) - 1:
                h = F.tanh(h)
            else:
                mu = h[:, :self.q_dims[-1]]
                logvar = h[:, self.q_dims[-1]:]





        return mu, logvar

    def reparameterize(self, mu, logvar):
        if self.training:
            std = torch.exp(0.5 * logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu

    def decode(self, z):
        h = z
        for i, layer in enumerate(self.p_layers):
            h = layer(h)
            if i != len(self.p_layers) - 1:
                h = F.tanh(h)
        return h

    def init_weights(self):
        for layer in self.q_layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)

        for layer in self.p_layers:
            # Xavier Initialization for weights
            size = layer.weight.size()
            fan_out = size[0]
            fan_in = size[1]
            std = np.sqrt(2.0/(fan_in + fan_out))
            layer.weight.data.normal_(0.0, std)

            # Normal Initialization for Biases
            layer.bias.data.normal_(0.0, 0.001)

### Loss funtion 정의

In [20]:
def loss_function_vae(recon_x, x, mu, logvar, anneal=1.0):
    BCE = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
    KLD = -0.5 * torch.mean(torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1))

    return BCE + anneal * KLD


Train 함수 정의

In [36]:
def naive_sparse2tensor(data):
    return torch.FloatTensor(data.toarray())

In [83]:
def train(model, criterion, optimizer, train_data, device, args, is_VAE=True):
    """
    Train the model on the given training data.

    Parameters:
    - model: PyTorch model
    - criterion: Loss function
    - optimizer: Optimizer for training
    - train_data: csr_matrix, training data
    - device: PyTorch device (e.g., 'cuda' or 'cpu')
    - args: Arguments containing hyperparameters
    - is_VAE: Whether the model is a VAE (default: True)

    Returns:
    - train_loss: Average training loss over all batches
    """
    # Turn on training mode
    model.train()
    train_loss = 0.0
    start_time = time.time()
    global update_count

    for batch_idx, start_idx in enumerate(range(0, train_data.shape[0], args.batch_size)):
        end_idx = min(start_idx + args.batch_size, train_data.shape[0])
        data = train_data[start_idx:end_idx]
        data = naive_sparse2tensor(data).to(device)
        optimizer.zero_grad()

        if is_VAE:
            if args.total_anneal_steps > 0:
                anneal = min(args.anneal_cap, 1. * update_count / args.total_anneal_steps)
            else:
                anneal = args.anneal_cap

        # Forward pass and loss computation
        if is_VAE:
            recon_batch, mu, logvar = model(data)
            loss = loss_function_vae(recon_batch, data, mu, logvar, anneal)
        else:
            recon_batch = model(data)
            loss = criterion(recon_batch, data)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        update_count += 1

        # Log progress
        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:4d}/{:4d} batches | ms/batch {:4.2f} | '
                  'loss {:4.2f}'.format(
                      epoch, batch_idx, len(range(0, train_data.shape[0], args.batch_size)),
                      elapsed * 1000 / args.log_interval,
                      train_loss / args.log_interval))
            start_time = time.time()

    # Return average training loss
    return train_loss / len(range(0, train_data.shape[0], args.batch_size))


Evaluate 함수 정의

In [92]:
def evaluate(model, train_data, validation_data, loss_function, device, top_k=[10], anneal_cap=0.2, total_anneal_steps=20000):
    """
    Evaluate the Multi-VAE model on validation data.

    Parameters:
    - model: Multi-VAE model instance
    - train_data: csr_matrix, user-item interactions in the training set
    - validation_data: csr_matrix, user-item interactions in the validation set
    - loss_function: Loss function for VAE (e.g., loss_function_vae)
    - device: PyTorch device (e.g., 'cuda' or 'cpu')
    - top_k: List of k values for Recall@k and NDCG@k
    - anneal_cap: Maximum annealing factor for KL divergence
    - total_anneal_steps: Total steps for annealing KL divergence weight

    Returns:
    - avg_loss: Average evaluation loss
    - metrics: Dictionary containing Recall@k and NDCG@k for each k
    """
    model.eval()
    total_loss = 0.0
    update_count = 0
    anneal = 0.0
    recall_results = {k: [] for k in top_k}
    ndcg_results = {k: [] for k in top_k}

    num_users = validation_data.shape[0]

    with torch.no_grad():
        for user in range(num_users):  # User-wise evaluation
            # Get training and validation data for the current user
            train_row = train_data[user].toarray()
            val_row = validation_data[user].toarray()

            # Convert to PyTorch tensors
            train_tensor = torch.FloatTensor(train_row).to(device)
            val_tensor = torch.FloatTensor(val_row).to(device)

            # Annealing factor for KL divergence
            if total_anneal_steps > 0:
                anneal = min(anneal_cap, 1.0 * update_count / total_anneal_steps)

            # Forward pass
            recon_batch, mu, logvar = model(train_tensor)

            # Compute loss (using validation data)
            loss = loss_function(recon_batch, val_tensor, mu, logvar, anneal)
            total_loss += loss.item()

            # Exclude training interactions from recommendations
            recon_batch = recon_batch.cpu().numpy()
            recon_batch[train_row.nonzero()] = -np.inf  # Mask training items

            # Compute metrics for top_k
            for k in top_k:
                recall_results[k].append(Recall_at_k_batch(recon_batch, val_row, k))
                ndcg_results[k].append(NDCG_binary_at_k_batch(recon_batch, val_row, k))

            update_count += 1

    # Compute average loss and metrics
    avg_loss = total_loss / num_users
    metrics = {
        f"Recall@{k}": np.mean(recall_results[k]) for k in top_k
    }
    metrics.update({
        f"NDCG@{k}": np.mean(ndcg_results[k]) for k in top_k
    })

    print(f"Evaluation Loss: {avg_loss:.4f}")
    for k in top_k:
        print(f"Recall@{k}: {metrics[f'Recall@{k}']:.4f}, NDCG@{k}: {metrics[f'NDCG@{k}']:.4f}")

    return avg_loss, metrics


Metric 정의

In [93]:
def Recall_at_k_batch(X_pred, heldout_batch, k=10):
    """
    Compute Recall@k for binary relevance.

    Parameters:
    - X_pred: numpy.ndarray, predicted scores for all items
    - heldout_batch: numpy.ndarray or csr_matrix, true interactions for each user
    - k: int, cutoff for Recall@k

    Returns:
    - recall: numpy.ndarray, Recall@k for each user in the batch
    """
    batch_users = X_pred.shape[0]
    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    # Convert heldout_batch to dense array if it's a sparse matrix
    if isinstance(heldout_batch, np.ndarray):
        X_true_binary = heldout_batch > 0
    else:
        X_true_binary = (heldout_batch > 0).toarray()

    # Compute Recall@k
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall

def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=10):
    """
    Compute Normalized Discounted Cumulative Gain@k for binary relevance.

    Parameters:
    - X_pred: numpy.ndarray, predicted scores for all items
    - heldout_batch: numpy.ndarray or csr_matrix, true interactions for each user
    - k: int, cutoff for NDCG@k

    Returns:
    - ndcg: numpy.ndarray, NDCG@k for each user in the batch
    """
    batch_users = X_pred.shape[0]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)

    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    # Convert heldout_batch to dense array if it's a sparse matrix
    if isinstance(heldout_batch, np.ndarray):
        X_true_binary = heldout_batch > 0
    else:
        X_true_binary = (heldout_batch > 0).toarray()

    # Compute DCG
    tp = 1. / np.log2(np.arange(2, k + 2))
    DCG = (X_true_binary[np.arange(batch_users)[:, np.newaxis], idx_topk] * tp).sum(axis=1)

    # Compute IDCG
    IDCG = np.array([tp[:min(n, k)].sum() for n in X_true_binary.sum(axis=1)])
    ndcg = DCG / IDCG
    ndcg[np.isnan(ndcg)] = 0.0  # Handle NaN for users with no interactions
    return ndcg


학습 진행

In [94]:
N = train_data.shape[0]

# Hyperparameters
learning_rate = 1e-3
anneal_cap = 0.2
total_anneal_steps = 20000
epochs = 10

# Initialize model, optimizer, and loss function
input_dim = train_data.shape[1]
p_dims = [200,600,input_dim]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultiVAE(p_dims).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
update_count = 0

In [None]:
for epoch in range(1, args.epochs + 1):
    print(f"Epoch {epoch}/{args.epochs}")
    train_loss = train(model, loss_function_vae, optimizer, train_data, device, args, is_VAE=True)
    print(f"Training Loss: {train_loss:.4f}")

    val_loss, metrics = evaluate(model, train_data, val_data, loss_function_vae, device, top_k=[10])
    print(f"Validation Loss: {val_loss:.4f}")
    for k in [10]:
        print(f"Recall@{k}: {metrics[f'Recall@{k}']:.4f}, NDCG@{k}: {metrics[f'NDCG@{k}']:.4f}")


Epoch 1/20
Training Loss: 1270.9060
Evaluation Loss: 57.6237
Recall@10: 0.0724, NDCG@10: 0.0545
Validation Loss: 57.6237
Recall@10: 0.0724, NDCG@10: 0.0545
Epoch 2/20
Training Loss: 1199.4623
Evaluation Loss: 89.8114
Recall@10: 0.0870, NDCG@10: 0.0650
Validation Loss: 89.8114
Recall@10: 0.0870, NDCG@10: 0.0650
Epoch 3/20
Training Loss: 1176.4330
Evaluation Loss: 100.2679
Recall@10: 0.0924, NDCG@10: 0.0709
Validation Loss: 100.2679
Recall@10: 0.0924, NDCG@10: 0.0709
Epoch 4/20
Training Loss: 1163.9082
Evaluation Loss: 104.3858
Recall@10: 0.1019, NDCG@10: 0.0782
Validation Loss: 104.3858
Recall@10: 0.1019, NDCG@10: 0.0782
Epoch 5/20
Training Loss: 1152.4798
Evaluation Loss: 107.7782
Recall@10: 0.1081, NDCG@10: 0.0822
Validation Loss: 107.7782
Recall@10: 0.1081, NDCG@10: 0.0822
Epoch 6/20
Training Loss: 1145.5294
Evaluation Loss: 109.0502
Recall@10: 0.1099, NDCG@10: 0.0833
Validation Loss: 109.0502
Recall@10: 0.1099, NDCG@10: 0.0833
Epoch 7/20
Training Loss: 1141.6498
Evaluation Loss: 105

### Submission 용 학습

train/valid set으로 나누지 않은 전체 데이터를 포함하고 있는 interaction matrix를 학습시킵니다

In [None]:
N = interaction_matrix.shape[0]

# Hyperparameters
learning_rate = 1e-3
anneal_cap = 0.2
total_anneal_steps = 20000
epochs = 10

# Initialize model, optimizer, and loss function
input_dim = train_data.shape[1]
p_dims = [200,600,input_dim]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultiVAE(p_dims).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
update_count = 0




In [None]:
for epoch in range(1, args.epochs + 1):
    print(f"Epoch {epoch}/{args.epochs}")
    train_loss = train(model, loss_function_vae, optimizer, interaction_matrix, device, args, is_VAE=True)
    print(f"Training Loss: {train_loss:.4f}")

### Submission 만들기

전체 데이터셋으로 학습된 모델을 이용해 interaction이 진행된 아이템을 제외하고 나머지를 model을 사용해 10개의 아이템을 추천합니다

In [None]:
def generate_recommendations(model, interaction_matrix, user_mapping, item_mapping, top_k=10):
    """
    Generate top-k recommendations for each user after masking existing interactions.

    Parameters:
    - model: Trained VAE model
    - interaction_matrix: Original interaction matrix (csr_matrix)
    - user_mapping: Mapping from original to reindexed user IDs
    - item_mapping: Mapping from original to reindexed item IDs
    - top_k: Number of recommendations to generate per user

    Returns:
    - recommendations: pd.DataFrame, user-to-top-k recommended items
    """
    # Reconstruct the interaction matrix using the trained model
    interaction_tensor = torch.FloatTensor(interaction_matrix.toarray()).to(device)
    model.eval()
    with torch.no_grad():
        reconstructed = model(interaction_tensor)[0].cpu().numpy()

    # Mask already interacted items
    masked_reconstructed = reconstructed * (interaction_matrix.toarray() == 0)

    # Get top-k recommendations for each user
    recommendations = []
    for user_id, scores in enumerate(masked_reconstructed):
        top_items = np.argsort(scores)[-top_k:][::-1]  # Get top-k items sorted by score
        for item_id in top_items:
            recommendations.append([user_id, item_id])

    # Map back to original user and item IDs
    reverse_user_mapping = {v: k for k, v in user_mapping.items()}
    reverse_item_mapping = {v: k for k, v in item_mapping.items()}
    recommendations_df = pd.DataFrame(recommendations, columns=["user", "item"])
    recommendations_df["user"] = recommendations_df["user"].map(reverse_user_mapping)
    recommendations_df["item"] = recommendations_df["item"].map(reverse_item_mapping)

    return recommendations_df

In [None]:
recommendations = generate_recommendations(model, interaction_matrix, usr2idx_dict, item2idx_dict, top_k=10)


In [None]:
recommendations

In [None]:
recommendations.to_csv("submission.csv", index=False)