In [126]:
import os

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from itertools import chain

import hnswlib

In [127]:
BASE_DIR = "/Users/artemvopilov/Programming/yandex_cup_2023"

In [128]:
DATA_DIR = f"{BASE_DIR}/data"

TRAIN_DF_PATH = f"{DATA_DIR}/train.csv"
TEST_DF_PATH = f"{DATA_DIR}/test.csv"

NORMED_EMBEDDINGS_DIR = f"{BASE_DIR}/normed_embeddings"
PCA_EMBEDDINGS_DIR = f"{BASE_DIR}/pca_embeddings"
VAE_EMBEDDINGS_DIR = f"{BASE_DIR}/vae_embeddings"

In [129]:
TAGS_N = 256

DIM = 64

### Read data

In [130]:
train_df = pd.read_csv(TRAIN_DF_PATH)
test_df = pd.read_csv(TEST_DF_PATH)

In [131]:
train_track_ids = set(train_df["track"].unique())
test_track_ids = set(test_df["track"].unique())

In [132]:
train_track_id_to_tags = {}
for _, row in tqdm(train_df.iterrows()):
    track_id = row["track"]
    tags = list(map(int, row["tags"].split(",")))
    train_track_id_to_tags[track_id] = tags

0it [00:00, ?it/s]

### Read embeddings

In [133]:
track_id_to_embeddings = {}
for fn in tqdm(os.listdir(VAE_EMBEDDINGS_DIR)):
    fp = f"{VAE_EMBEDDINGS_DIR}/{fn}"

    track_id = fn.split('.')[0]
    embeddings = np.load(fp)
    track_id_to_embeddings[track_id] = embeddings

  0%|          | 0/76714 [00:00<?, ?it/s]

In [134]:
len(track_id_to_embeddings)

76714

In [146]:
track_ids = []
embeddings = []
for ti, embeds in tqdm(track_id_to_embeddings.items()):
    if int(ti) not in train_track_ids:
        continue
    track_ids.append(ti)
    embeddings.append(embeds[-1])

  0%|          | 0/76714 [00:00<?, ?it/s]

In [147]:
len(embeddings)

51134

### KNN index

In [148]:
knn_index = hnswlib.Index(space='cosine', dim=DIM)

In [149]:
knn_index.init_index(len(embeddings))

In [150]:
knn_index.add_items(embeddings)

### Predict

In [151]:
all_track_ids = []
all_embeddings = []
for ti, embeds in tqdm(track_id_to_embeddings.items()):
    all_track_ids.append(ti)
    all_embeddings.append(np.mean(embeds, axis=0))

  0%|          | 0/76714 [00:00<?, ?it/s]

In [152]:
track_id_to_scores = {}
for ti, embed in tqdm(zip(all_track_ids, all_embeddings)):
    labels, distances = knn_index.knn_query(embed, k=21)

    if ti in train_track_ids:
        labels = labels[0][1:]
        distances = distances[0][1:]
    else:
        labels = labels[0][:-1]
        distances = distances[0][:-1]

    distances_sum = np.sum(distances)
    scores = np.zeros(TAGS_N)
    for l, d in zip(labels, distances):
        track_id = track_ids[l]
        tags = train_track_id_to_tags[int(track_id)]

        target = np.zeros(TAGS_N)
        target[tags] = 1
        scores += target * d / distances_sum
    track_id_to_scores[ti] = scores

0it [00:00, ?it/s]

### Save scores 

In [153]:
predictions_df = pd.DataFrame([
    {'track': track, 'prediction': ','.join([str(p) for p in probs])}
    for track, probs in track_id_to_scores.items()
])

In [154]:
predictions_df.head()

Unnamed: 0,track,prediction
0,531,"0.8447252453619272,0.15648790450836597,0.0,0.0..."
1,33632,"0.4966496658542636,0.2927575639649057,0.098693..."
2,75667,"0.7020615925624607,0.45871455760668917,0.05477..."
3,65474,"0.8042526504184774,0.35165927924100365,0.05420..."
4,23421,"0.3020745519927329,0.44304889087254423,0.14041..."


In [155]:
predictions_df.shape

(76714, 2)

In [156]:
predictions_df.to_csv('prediction_knn_last_e_vae.csv', index=False)