In [153]:
import os

from typing import Any, Mapping, List, Tuple, Dict

import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from loguru import logger
from datetime import datetime
from itertools import chain

import torch
from torch import nn
from torch.nn.init import constant_, kaiming_normal_
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.nn.utils.rnn import pack_sequence, unpack_sequence

In [16]:
# Project root. Defaults to the original author's checkout; set the
# YANDEX_CUP_BASE_DIR environment variable to run the notebook elsewhere
# without editing this cell.
BASE_DIR = os.environ.get(
    "YANDEX_CUP_BASE_DIR",
    "/Users/artemvopilov/Programming/yandex_cup_2023",
)

In [None]:
# Tabular inputs: the train and test CSV files live under DATA_DIR.
DATA_DIR = f"{BASE_DIR}/data"
TRAIN_DF_PATH, TEST_DF_PATH = f"{DATA_DIR}/train.csv", f"{DATA_DIR}/test.csv"

# Embedding directories, all rooted at BASE_DIR. This notebook reads from
# NORMED_EMBEDDINGS_DIR and writes to NORMED_LSTM_EMBEDDINGS_DIR.
(
    NORMED_EMBEDDINGS_DIR,
    PCA_EMBEDDINGS_DIR,
    VAE_EMBEDDINGS_DIR,
    NORMED_LSTM_EMBEDDINGS_DIR,
) = (
    f"{BASE_DIR}/normed_embeddings",
    f"{BASE_DIR}/pca_embeddings",
    f"{BASE_DIR}/vae_embeddings",
    f"{BASE_DIR}/normed_lstm_embeddings",
)

In [18]:
# Device that tensors and the model are moved to.
DEVICE = "cpu"

# Length of the multi-hot target vector built by the dataset.
TAGS_N = 256

# LSTM input size, LSTM hidden size, and size of the linear head's output.
INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM = 768, 256, TAGS_N

# Number of full passes over the training loader.
EPOCHS = 10

### Read data

In [19]:
# Read the train and test tables from their CSV paths.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DF_PATH, TEST_DF_PATH)
)

In [20]:
# Load every embedding file into memory as float32, keyed by the part of the
# file name before the first dot (used as the track id string).
track_id_to_embeddings = {}
for fn in tqdm(os.listdir(NORMED_EMBEDDINGS_DIR)):
    # Dot-prefixed entries (e.g. macOS ".DS_Store") are not embedding files:
    # they would produce an empty track id and make np.load fail.
    if fn.startswith('.'):
        continue

    fp = f"{NORMED_EMBEDDINGS_DIR}/{fn}"

    track_id = fn.split('.')[0]
    embeddings = np.load(fp).astype(np.float32)
    track_id_to_embeddings[track_id] = embeddings

  0%|          | 0/76714 [00:00<?, ?it/s]

### Dataset

In [21]:
class LSTMDataset(Dataset):
    """Dataset pairing each track's embedding array with a multi-hot tag vector.

    Args:
        df: Frame with a "track" column and, unless ``is_testing`` is set, a
            "tags" column holding comma-separated integer tag indices.
        track_id_to_embeddings: Maps ``str(track_id)`` to that track's
            embedding array.
        tags_n: Length of the multi-hot target vector.
        is_testing: When true the "tags" column is never read and every item
            carries an empty target array.
    """

    def __init__(self, df: pd.DataFrame, track_id_to_embeddings: Dict[str, np.ndarray], tags_n: int, is_testing: bool = False):
        self._df = df
        self._track_id_to_embeddings = track_id_to_embeddings
        self._tags_n = tags_n
        self._is_testing = is_testing

    def __len__(self) -> int:
        """Return the number of rows in the underlying frame."""
        return self._df.shape[0]

    def __getitem__(self, index: int) -> Tuple[Any, np.ndarray, np.ndarray]:
        """Return ``(track_id, embeddings, target)`` for the row at ``index``.

        ``target`` is a float array of length ``tags_n`` with ones at the
        row's tag indices, or an empty array in testing mode.

        Raises:
            KeyError: If the track has no entry in the embeddings mapping.
        """
        row = self._df.iloc[index]
        track_id = row["track"]
        # The mapping is keyed by strings, while the frame may hold integers.
        embeddings = self._track_id_to_embeddings[str(track_id)]
        if self._is_testing:
            return track_id, embeddings, np.array([])
        tags = [int(x) for x in row["tags"].split(',')]
        target = np.zeros(self._tags_n)
        target[tags] = 1
        return track_id, embeddings, target

In [22]:
def collate_fn(b):
    """Collate ``(track_id, embeddings, target)`` samples into one batch.

    Track ids and targets are stacked row-wise into tensors. Embeddings stay
    a list of per-sample tensors, which allows their lengths to differ.
    """
    id_rows, sequences, target_rows = [], [], []
    for sample in b:
        id_rows.append(sample[0])
        sequences.append(torch.from_numpy(sample[1]))
        target_rows.append(sample[2])

    stacked_ids = torch.from_numpy(np.vstack(id_rows))
    stacked_targets = torch.from_numpy(np.vstack(target_rows))
    return stacked_ids, sequences, stacked_targets

### Model

In [184]:
class LSTMModel(nn.Module):
    """Single-layer LSTM over variable-length sequences with a linear head.

    Args:
        input_dim: Feature size of every sequence step.
        hidden_dim: LSTM hidden size.
        output_dim: Size of the linear head's output.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        self.apply(self._init_layer)

    def forward(self, embeddings):
        """Run the LSTM over a list of ``(length, input_dim)`` tensors.

        Returns:
            A pair ``(logits, step_outputs)``. ``logits`` is the linear head
            applied to each sequence's final hidden state. ``step_outputs``
            is a list with one ``(length, hidden_dim)`` tensor per input
            sequence holding the LSTM output at every step (these are hidden
            states, not the LSTM's internal cell states).
        """
        # Packing lets sequences of different lengths share one LSTM call;
        # enforce_sorted=False keeps results in the caller's order.
        packed = pack_sequence(embeddings, enforce_sorted=False)
        step_outputs, (final_hidden_state, _) = self.lstm(packed)
        # Only one layer exists, so index 0 is its final hidden state.
        last_hidden = final_hidden_state[0, :, :]
        return self.fc(last_hidden), unpack_sequence(step_outputs)

    def reset(self) -> None:
        """Re-initialise every trainable layer so the model can be retrained.

        Recurrent layers are reset with their own default initialiser first;
        ``_init_layer`` only handles ``nn.Linear`` and would otherwise leave
        previously trained LSTM weights in place.
        """
        for module in self.modules():
            if isinstance(module, nn.RNNBase):
                module.reset_parameters()
        self.apply(self._init_layer)

    @staticmethod
    def _init_layer(layer: nn.Module) -> None:
        """Kaiming-normal weights and zero bias for ``nn.Linear`` layers."""
        if isinstance(layer, nn.Linear):
            kaiming_normal_(layer.weight.data)
            if layer.bias is not None:
                constant_(layer.bias.data, 0)

### Trainer

In [185]:
def train_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and log a smoothed loss.

    Args:
        model: Module called as ``model(embeddings)``; the first element of
            its return value is passed to ``criterion``.
        loader: Iterable of ``(track_ids, embeddings, target)`` batches where
            ``embeddings`` is a list of tensors and ``target`` is a tensor.
        criterion: Callable ``criterion(predictions, target)`` returning a
            scalar loss tensor.
        optimizer: Optimizer holding the model's parameters.
    """
    model.train()
    running_loss = None
    alpha = 0.8  # weight kept on the previous running value
    for iteration, data in enumerate(loader):
        optimizer.zero_grad()
        _, embeddings, target = data

        embeddings = [x.to(DEVICE) for x in embeddings]
        target = target.to(DEVICE)

        predictions, _ = model(embeddings)

        loss = criterion(predictions, target)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        if running_loss is None:
            running_loss = batch_loss
        else:
            # Exponential moving average: blend the previous running value
            # with the new batch loss instead of overwriting it.
            running_loss = alpha * running_loss + (1 - alpha) * batch_loss
        if iteration % 100 == 0:
            logger.info("{} batch {} loss {}".format(datetime.now(), iteration + 1, running_loss))

### Predictor

In [186]:
def predict(model, loader):
    """Run ``model`` over ``loader`` in eval mode without gradients.

    Returns:
        ``(track_ids, predictions, cell_states)``: a flat array of track ids,
        the row-stacked model outputs, and a flat list with one numpy array
        per sample taken from the model's second return value.
    """
    model.eval()
    id_chunks, output_chunks, per_track_states = [], [], []
    with torch.no_grad():
        for batch_ids, batch_embeddings, _ in loader:
            on_device = [x.to(DEVICE) for x in batch_embeddings]
            batch_outputs, batch_states = model(on_device)

            id_chunks.append(batch_ids.numpy())
            output_chunks.append(batch_outputs.detach().cpu().numpy())
            # Extend rather than append so the result stays one flat list.
            per_track_states.extend(
                state.detach().cpu().numpy() for state in batch_states
            )
    flat_ids = np.vstack(id_chunks).ravel()
    stacked_outputs = np.vstack(output_chunks)
    return flat_ids, stacked_outputs, per_track_states

### Train

In [187]:
# Labelled training set, shuffled and batched with the custom collate function.
train_dataset = LSTMDataset(
    df=train_df,
    track_id_to_embeddings=track_id_to_embeddings,
    tags_n=TAGS_N,
    is_testing=False,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Build the model and the loss directly on the target device.
model = LSTMModel(
    input_dim=INPUT_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
).to(DEVICE)
criterion = nn.CrossEntropyLoss().to(DEVICE)

# Adam with its default hyper-parameters.
optimizer = Adam(model.parameters())

for epoch in tqdm(range(EPOCHS)):
    train_epoch(model, train_loader, criterion, optimizer)

  0%|          | 0/10 [00:00<?, ?it/s]

[32m2023-11-10 00:45:37.072[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-10 00:45:37.072460 batch 1 loss 24.6972738802433[0m
[32m2023-11-10 00:47:19.991[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-10 00:47:19.991564 batch 101 loss 19.912205085158348[0m
[32m2023-11-10 00:49:04.949[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-10 00:49:04.949913 batch 201 loss 16.165155679918826[0m
[32m2023-11-10 00:50:50.134[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-10 00:50:50.134544 batch 301 loss 13.82649103924632[0m
[32m2023-11-10 00:52:33.497[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-10 00:52:33.497937 batch 401 loss 16.763167725875974[0m
[32m2023-11-10 00:54:22.987[0m | [1mINFO    [0m | [36m__main__[0m:[36mtrain_epoch[0m:[36m23[0m - [1m2023-11-10 00:

### Inference

In [None]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
inference_df = pd.concat(
    objs=[train_df, test_df],
    ignore_index=True,
)

In [None]:
# Preview the first rows of the combined train + test frame.
inference_df.head()

In [None]:
# (rows, columns) of the combined train + test frame.
inference_df.shape

In [None]:
# Inference runs in testing mode (no tags are read) and keeps the frame's row order.
inference_dataset = LSTMDataset(
    df=inference_df,
    track_id_to_embeddings=track_id_to_embeddings,
    tags_n=TAGS_N,
    is_testing=True,
)
inference_loader = DataLoader(
    dataset=inference_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
# Score every track in the inference loader with the trained model.
(
    inference_track_ids,
    inference_predictions,
    inference_cell_states,
) = predict(model=model, loader=inference_loader)

In [None]:
# Number of track ids, prediction rows and per-track state arrays returned by predict().
len(inference_track_ids), len(inference_predictions), len(inference_cell_states)

### Save predictions

In [None]:
# One row per track; the model's output vector is stored as a single
# comma-separated string.
predictions_df = pd.DataFrame([
    {'track': track_id_value, 'prediction': ','.join(map(str, scores))}
    for track_id_value, scores in zip(inference_track_ids, inference_predictions)
])

In [None]:
# Preview the first rows of the serialised predictions.
predictions_df.head()

In [None]:
# (rows, columns) of the predictions frame.
predictions_df.shape

In [None]:
# Write the predictions next to the notebook, without the index column.
predictions_df.to_csv(
    path_or_buf='prediction_lstm_normed_2.csv',
    index=False,
)

### Save embeddings

In [None]:
# Starts empty; intended to map a track id to that track's LSTM output array.
track_id_to_lstm_embedding = dict()

In [None]:
# Create the output directory that the save loop writes to. exist_ok=True
# keeps the cell re-runnable when the directory already exists.
os.makedirs(NORMED_LSTM_EMBEDDINGS_DIR, exist_ok=True)

In [None]:
# Save each track's per-step LSTM outputs (as returned by predict) to
# "<track id>.npy", recording them in the in-memory mapping as well.
for ti, embeddings in tqdm(
    zip(inference_track_ids, inference_cell_states),
    total=len(inference_cell_states),
):
    track_id_to_lstm_embedding[ti] = embeddings
    fn = f"{ti}.npy"
    fp = f"{NORMED_LSTM_EMBEDDINGS_DIR}/{fn}"
    np.save(fp, embeddings)