In [153]:
import os

from typing import Any, Mapping, List, Tuple, Dict

import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from loguru import logger
from datetime import datetime

import torch
from torch import nn
from torch.nn.init import constant_, kaiming_normal_
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.nn.utils.rnn import pad_sequence

In [154]:
# Project root directory. Set the YANDEX_CUP_BASE_DIR environment variable to run the notebook
# on another machine; when it is unset or empty the original hard-coded location is used.
BASE_DIR = os.environ.get("YANDEX_CUP_BASE_DIR") or "/Users/artemvopilov/Programming/yandex_cup_2023"

In [155]:
# Location of the CSV tables, relative to the project root.
DATA_DIR = BASE_DIR + "/data"

TRAIN_DF_PATH = DATA_DIR + "/train.csv"
TEST_DF_PATH = DATA_DIR + "/test.csv"

# Embedding directories under the project root; this notebook loads from VAE_EMBEDDINGS_DIR.
NORMED_EMBEDDINGS_DIR = BASE_DIR + "/normed_embeddings"
PCA_EMBEDDINGS_DIR = BASE_DIR + "/pca_embeddings"
VAE_EMBEDDINGS_DIR = BASE_DIR + "/vae_embeddings"

In [156]:
# Device that batches and the model are moved to.
DEVICE = "cpu"

# Fixed seed for torch's global generator so that weight initialisation and DataLoader
# shuffling are repeatable after "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Length of the multi-hot target vector built by the dataset (one slot per tag).
TAGS_N = 256

# Sizes handed to the LSTM (input features, hidden state) and to the output layer.
INPUT_DIM = 64
HIDDEN_DIM = 128
OUTPUT_DIM = TAGS_N

# Number of passes over the training loader.
EPOCHS = 10

### Read data

In [157]:
# Read the train and test tables (train first, then test) from their CSV files.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_DF_PATH, TEST_DF_PATH))

In [158]:
# Maps track id (file name up to the first '.') -> embedding array cast to float32.
track_id_to_embeddings = {}
for fn in tqdm(os.listdir(VAE_EMBEDDINGS_DIR)):
    # os.listdir also returns hidden entries (e.g. ".DS_Store", ".ipynb_checkpoints").
    # They are not embedding files: their track id would be the empty string and
    # np.load would be called on them, so skip them.
    if fn.startswith('.'):
        continue

    fp = f"{VAE_EMBEDDINGS_DIR}/{fn}"

    track_id = fn.split('.')[0]
    embeddings = np.load(fp).astype(np.float32)
    track_id_to_embeddings[track_id] = embeddings

  0%|          | 0/76714 [00:00<?, ?it/s]

### Dataset

In [159]:
class LSTMDataset(Dataset):
    """Dataset yielding ``(track_id, embeddings, target)`` for one row of ``df``.

    Args:
        df: table with a ``track`` column and, unless ``is_testing`` is set, a ``tags``
            column holding comma-separated integer tag indices.
        track_id_to_embeddings: embedding array per track, keyed by ``str(track_id)``.
        tags_n: length of the multi-hot target vector.
        is_testing: when true no ``tags`` column is read and an empty target is returned.
    """

    def __init__(self, df: pd.DataFrame, track_id_to_embeddings: Dict[str, np.ndarray], tags_n: int, is_testing=False):
        self._df = df
        self._track_id_to_embeddings = track_id_to_embeddings
        self._tags_n = tags_n
        self._is_testing = is_testing

    def __len__(self) -> int:
        """Return the number of rows in the underlying frame."""
        return self._df.shape[0]

    def __getitem__(self, index: int) -> Tuple[Any, np.ndarray, np.ndarray]:
        """Return the sample at positional ``index``.

        Returns:
            ``(track_id, embeddings, target)`` where ``track_id`` is the value stored in the
            ``track`` column, ``embeddings`` is the array registered for that track and
            ``target`` is a float64 vector of length ``tags_n`` with ones at the row's tag
            indices (an empty array in testing mode).

        Raises:
            KeyError: if no embeddings are registered under ``str(track_id)``.
            IndexError: if a tag index does not fit into ``tags_n`` slots.
        """
        row = self._df.iloc[index]
        track_id = row["track"]
        # The embeddings mapping is keyed by strings, while the column may hold integers.
        embeddings = self._track_id_to_embeddings[str(track_id)]
        if self._is_testing:
            return track_id, embeddings, np.array([])

        tags = [int(x) for x in row["tags"].split(',')]
        target = np.zeros(self._tags_n)
        target[tags] = 1
        return track_id, embeddings, target

In [160]:
def collate_fn(b):
    """Merge a list of ``(track_id, embeddings, target)`` samples into one batch.

    Track ids and targets are stacked row-wise into tensors; the embedding arrays are
    converted to tensors one by one and returned as a list, without stacking.
    """
    columns = list(zip(*b))
    ids, sequences, labels = columns[0], columns[1], columns[2]

    track_ids = torch.from_numpy(np.vstack(ids))
    embeddings = list(map(torch.from_numpy, sequences))
    targets = torch.from_numpy(np.vstack(labels))
    return track_ids, embeddings, targets

### Model

In [161]:
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a linear layer applied to the final hidden state.

    Args:
        input_dim: feature size of every sequence step.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of values produced per sequence.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Only nn.Linear layers are touched here; the LSTM keeps PyTorch's default init.
        self.apply(self._init_layer)

    def forward(self, embeddings):
        """Return the linear layer's output for a ``(batch, steps, input_dim)`` tensor.

        No sequence lengths are passed to the LSTM, so every step of the input is
        processed; for a zero-padded batch the final hidden state therefore includes
        the padding steps.
        """
        _, (hn, _) = self.rnn(embeddings)
        # hn is (num_layers, batch, hidden_dim); with one layer, index 0 is the last state.
        return self.fc(hn[0, :, :])

    def reset(self) -> None:
        """Re-initialise all trainable parameters.

        The LSTM is reset explicitly because ``_init_layer`` only handles ``nn.Linear``;
        without it a "reset" model would keep its trained recurrent weights.
        """
        self.rnn.reset_parameters()
        self.apply(self._init_layer)

    @staticmethod
    def _init_layer(layer: nn.Module) -> None:
        """Kaiming-normal weights and zero bias for ``nn.Linear``; other layers are left alone."""
        if isinstance(layer, nn.Linear):
            kaiming_normal_(layer.weight.data)
            if layer.bias is not None:
                constant_(layer.bias.data, 0)

### Trainer

In [162]:
def train_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and log a smoothed loss.

    Args:
        model: module called on the zero-padded ``(batch, steps, features)`` tensor.
        loader: iterable of ``(track_ids, embeddings, target)`` batches, where
            ``embeddings`` is a list of per-track tensors.
        criterion: callable ``criterion(model_outputs, target)`` returning the loss tensor.
        optimizer: optimizer that updates the parameters of ``model``.

    Returns:
        The exponentially smoothed loss after the last batch, or ``None`` when the
        loader yields no batches.
    """
    model.train()
    running_loss = None
    alpha = 0.8  # weight of the loss history in the moving average
    for iteration, data in enumerate(loader):
        optimizer.zero_grad()
        _, embeddings, target = data

        # Sequences differ in length: move each to the device, then zero-pad to one tensor.
        embeddings = pad_sequence([x.to(DEVICE) for x in embeddings], batch_first=True)
        target = target.to(DEVICE)

        model_outputs = model(embeddings)

        loss = criterion(model_outputs, target)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        if running_loss is None:
            running_loss = batch_loss
        else:
            # Blend the history with the new batch; mixing the batch loss with itself
            # would just report the raw, noisy per-batch value.
            running_loss = alpha * running_loss + (1 - alpha) * batch_loss
        if iteration % 100 == 0:
            logger.info("{} batch {} loss {}".format(datetime.now(), iteration + 1, running_loss))
    return running_loss

### Predictor

In [163]:
def predict(model, loader):
    """Run ``model`` in eval mode over ``loader`` and gather its raw outputs.

    Each batch is ``(track_ids, embeddings, _)``; the embedding tensors are moved to
    ``DEVICE`` and zero-padded into one tensor before the forward pass.

    Returns:
        ``(track_ids, predictions)``: the flattened track ids of all batches and the
        row-wise stacked model outputs, both as NumPy arrays.
    """
    model.eval()
    id_chunks, output_chunks = [], []
    with torch.no_grad():
        for batch_ids, batch_embeddings, _ in loader:
            on_device = [sequence.to(DEVICE) for sequence in batch_embeddings]
            padded = pad_sequence(on_device, batch_first=True)

            batch_outputs = model(padded)

            id_chunks.append(batch_ids.numpy())
            output_chunks.append(batch_outputs.detach().cpu().numpy())
    stacked_outputs = np.vstack(output_chunks)
    flat_ids = np.vstack(id_chunks).ravel()
    return flat_ids, stacked_outputs

### Train

In [164]:
# Labelled training samples, reshuffled on every pass, 64 tracks per batch.
train_dataset = LSTMDataset(
    df=train_df,
    track_id_to_embeddings=track_id_to_embeddings,
    tags_n=TAGS_N,
    is_testing=False,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Build the network and the loss directly on the target device.
model = LSTMModel(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM).to(DEVICE)
# NOTE(review): the dataset produces multi-hot targets, which CrossEntropyLoss combines with
# a softmax over all tags; confirm this is intended rather than a per-tag loss such as
# BCEWithLogitsLoss.
criterion = nn.CrossEntropyLoss().to(DEVICE)
# Adam with its default hyper-parameters.
optimizer = Adam(model.parameters())

# One train_epoch call per epoch; the smoothed loss is logged inside train_epoch.
for epoch in tqdm(range(EPOCHS)):
    train_epoch(model, train_loader, criterion, optimizer)

  0%|          | 0/10 [00:00<?, ?it/s]

[32m2023-11-06 20:11:29.418[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-06 20:11:29.418800 batch 1 loss 20.898002065718174[0m
[32m2023-11-06 20:11:40.061[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-06 20:11:40.061096 batch 101 loss 19.238709829747677[0m
[32m2023-11-06 20:11:49.440[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-06 20:11:49.440857 batch 201 loss 17.855387296527624[0m
[32m2023-11-06 20:12:01.276[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-06 20:12:01.276574 batch 301 loss 16.847806312143803[0m
[32m2023-11-06 20:12:10.337[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-06 20:12:10.337045 batch 401 loss 19.947432152926922[0m
[32m2023-11-06 20:12:19.272[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-06 

### Inference

In [None]:
# Test rows have no tags: the dataset runs in testing mode and the order is kept (no shuffle).
test_dataset = LSTMDataset(
    df=test_df,
    track_id_to_embeddings=track_id_to_embeddings,
    tags_n=TAGS_N,
    is_testing=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
# Second, unshuffled view of the training table, used only to produce predictions for it.
train_inference_dataset = LSTMDataset(
    df=train_df,
    track_id_to_embeddings=track_id_to_embeddings,
    tags_n=TAGS_N,
    is_testing=False,
)
train_inference_loader = DataLoader(
    dataset=train_inference_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
# Model outputs for every test track, in the order of test_df.
test_track_ids, test_predictions = predict(model=model, loader=test_loader)

In [None]:
# Model outputs for every training track, in the order of train_df.
train_inference_track_ids, train_inference_predictions = predict(
    model=model,
    loader=train_inference_loader,
)

In [None]:
# Total number of track ids across the test and train splits.
len(test_track_ids) + len(train_inference_track_ids)

In [None]:
# Total number of prediction rows; should equal the number of track ids counted above it.
len(test_predictions) + len(train_inference_predictions)

In [None]:
# One record per track (test tracks first, then train tracks); the model outputs of a
# track are serialised as a single comma-separated string.
predictions_df = pd.DataFrame([
    {'track': track, 'prediction': ','.join(map(str, scores))}
    for split_ids, split_outputs in (
        (test_track_ids, test_predictions),
        (train_inference_track_ids, train_inference_predictions),
    )
    for track, scores in zip(split_ids, split_outputs)
])

In [None]:
# Preview the first rows of the prediction table.
predictions_df.head()

In [None]:
# (rows, columns) of the prediction table: one row per test or train track.
predictions_df.shape

In [None]:
# Write the predictions to the current working directory, without the index column.
predictions_df.to_csv('prediction_lstm_vae.csv', index=False)