In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import glob
import pandas as pd
import polars as pl

import gc
import logging

import hydra
import numpy as np
import copy
import torch
import torchinfo
from dotenv import load_dotenv
from hydra.utils import call, instantiate
from omegaconf import DictConfig, OmegaConf
from torch.cuda.amp import GradScaler

from csi.base.utils import init_model, seed_everything
from csi.submission import make_submission
from csi.training.data.dataset import filter_tracks
from csi.training.loop.loop import train_one_epoch
from csi.training.loop.utils import (
    clean_old_content,
    freeze_layers,
    load_fold_checkpoint,
    save_checkpoint,
)
from csi.training.metrics.ndcg import compute_ndcg
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader

from csi.base.model.predict import predict
from csi.base.utils import batch_to_device
from csi.training.loop.utils import split_by_batch_size
from tqdm import tqdm
from collections import defaultdict

logger = logging.getLogger(__name__)
logger.info = print

from tqdm import tqdm

from mlflow import MlflowClient
import logging
import mlflow
from pathlib import Path

sys.path.append("../")

In [3]:
FINAL_ARTIFACTS = "final_artifacts_v2"
RESULTS = "results_v2"
POWER = 500

In [5]:
client = MlflowClient()
experiment_id = client.get_experiment_by_name("test-test").experiment_id

In [6]:
runs = mlflow.search_runs(experiment_ids=[experiment_id])

In [7]:
good_runs = runs[runs["metrics.AvgNDCG_at_100"] > 0.536]

In [8]:
base_path = Path(os.path.join("/home/yskhnykov/yandex_cup/", FINAL_ARTIFACTS))

In [10]:
name2metric = {}
for run_id, metric, name in zip(good_runs["run_id"], good_runs["metrics.AvgNDCG_at_100"], good_runs["tags.mlflow.runName"]):
    local_path = mlflow.artifacts.download_artifacts(run_id=run_id, dst_path=base_path / name)
    name2metric[name] = metric

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [11]:
def make_fold_submission(cfg, model, test_loader, candidates_from_fold=300):
    embeddings = []
    track_ids = []
    for batch in test_loader:
        batch = batch_to_device(batch, cfg.environment.device)
        outs = predict(model, batch)
        embs = outs["embedding"]
        track_ids.append(outs["track_id"].reshape(-1, 1))
        embeddings.append(embs)

    embeddings = torch.vstack(embeddings).detach().cpu().numpy()
    track_ids = torch.vstack(track_ids).detach().cpu().numpy()

    if cfg.nearest_neighbors_search.normalize_embeddings:
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    res = {}

    emb_indices = np.arange(len(embeddings))
    mini_batch_size = 5000
    embeddingsT = embeddings.T
    for ind in tqdm(split_by_batch_size(emb_indices, mini_batch_size)):
        track_id_batch = track_ids[ind]
        emb_batch = embeddings[ind]
        similarities = np.dot(emb_batch, embeddingsT)
        top_k_indices = np.argsort(-similarities, axis=1)[:, : candidates_from_fold + 1]
        top_k_indices = top_k_indices[top_k_indices != ind.reshape(-1, 1)]
        top_tracks_similarities = np.take_along_axis(similarities, top_k_indices.reshape(len(ind), candidates_from_fold), axis=1)
        top_tracks = track_ids[top_k_indices].reshape(len(ind), candidates_from_fold)
        for track_id, tracks, sims in zip(track_id_batch.flatten(), top_tracks, top_tracks_similarities):
            res[int(track_id)] = [(int(t), float(s)) for t, s in zip(tracks, sims)]
    return res

In [12]:
def init_cfg(cfg_path):
    from hydra import compose, initialize
    from omegaconf import OmegaConf

    with initialize(version_base=None, config_path=str(Path(cfg_path).parent)):
        try:
            cfg = compose(config_name="config", overrides=["+read_filtered_clique2versions=null"])
        except:
            cfg = compose(config_name="config", overrides=["read_filtered_clique2versions=null"])

    checkpoints_folder = "artifacts_" + str(Path(cfg_path).parent.parent.name) + "/model_checkpoints"
    cfg["path_to_fold_checkpoints"] = checkpoints_folder
    cfg["read_filtered_clique2versions"] = None
    cfg["environment"]["device"] = "cuda:1"
    return cfg

In [13]:
def get_predictions(cfg_path, candidates_from_fold=300):
    
    cfg = init_cfg(cfg_path)
    seed_everything(cfg.environment.seed)
    
    logger.info("Reading clique2tracks")
    clique2tracks = call(
        cfg.read_clique2versions,
        _convert_="partial",
    )
    
    filtered_clique2tracks = call(
        cfg.read_filtered_clique2versions,
        _convert_="partial",
    )
    
    cliques_splits = call(cfg.split_cliques, clique2tracks, _convert_="partial")
    
    test_dataset = call(
        cfg.test_data.dataset,
        tracks_ids=np.load(cfg.test_data.test_ids_path),
        track2clique=None,
        clique2tracks=None,
        _convert_="partial",
    )
    test_loader = instantiate(cfg.test_data.dataloader, test_dataset, _convert_="partial")
    
    
    fold_results = []
    for fold, (
        (train_track2clique, train_clique2tracks),
        (val_track2clique, val_clique2tracks),
    ) in enumerate(cliques_splits):   
        model = init_model(cfg).to(cfg.environment.device)
    
        if cfg.path_to_fold_checkpoints is not None:
            model = load_fold_checkpoint(model, cfg.path_to_fold_checkpoints, fold)
    
        fold_results.append(
            make_fold_submission(cfg, model, test_loader, candidates_from_fold)
        )
        torch.cuda.empty_cache()
        gc.collect()
    return fold_results

In [14]:
def pack_to_df(predictions, metric):
    track_ids = []
    candidates_ids = []
    candidates_scores = []
    fold_indices = []
    for fold_idx, fold in enumerate(predictions):
        for track_id, cands_with_scores in fold.items():
            for cand, score in cands_with_scores:
                track_ids.append(int(track_id))
                candidates_ids.append(int(cand))
                candidates_scores.append(float(score))
                fold_indices.append(int(fold_idx))
    df = pd.DataFrame({
        "track_id": track_ids, 
        "candidate_id": candidates_ids, 
        "candidate_score": candidates_scores,
        "fold_idx": fold_indices
    }).assign(metric=float(metric))
    return df

In [15]:
for model_name in os.listdir(FINAL_ARTIFACTS):
    print(model_name)
    if os.path.exists(f"{RESULTS}/{model_name}.pq"):
        print(f"Skipping existing {model_name}")
        continue
    try:
        metric = name2metric[model_name]
        predictions = get_predictions(f"{FINAL_ARTIFACTS}/{model_name}/hydra/config", POWER)
        pack_to_df(predictions, metric).to_parquet(f"{RESULTS}/{model_name}.pq", index=False)
    except Exception as e:
        print(e)
        print("-" * 180)

hgnetv2_b5_metric_learning_pairce_n_6folds04_13_04
Skipping existing hgnetv2_b5_metric_learning_pairce_n_6folds04_13_04
hgnetv2_b5_metric_learning_pairce_test_0_58_6folds03_17_43
Skipping existing hgnetv2_b5_metric_learning_pairce_test_0_58_6folds03_17_43
hgnetv2_b5_drop_cliques_test_0_6_6folds01_11_06
Skipping existing hgnetv2_b5_drop_cliques_test_0_6_6folds01_11_06
hgnetv2_b5_metric_learning_drop_cliques_test_0_6_6folds02_18_27
Skipping existing hgnetv2_b5_metric_learning_drop_cliques_test_0_6_6folds02_18_27
hgnetv2_b5_metric_learning_cosineemb_test_0_58_6folds04_04_11
Skipping existing hgnetv2_b5_metric_learning_cosineemb_test_0_58_6folds04_04_11
hgnetv2_b5_metric_learning_big_margin_drop_cliques_test_0_6_6folds02_19_23
Skipping existing hgnetv2_b5_metric_learning_big_margin_drop_cliques_test_0_6_6folds02_19_23
hgnetv2_b5_metric_learning_pairce_n_6folds04_18_31
Reading clique2tracks


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:26<00:00,  7.18s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:26<00:00,  7.21s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:25<00:00,  7.12s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:26<00:00,  7.23s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:2

hgnetv2_b5_metric_learning_big_margin_drop_cliques_test_0_58_6folds03_15_20
Skipping existing hgnetv2_b5_metric_learning_big_margin_drop_cliques_test_0_58_6folds03_15_20
hgnetv2_b5_metric_learning_pairce_test_0_58_6folds04_04_21
Skipping existing hgnetv2_b5_metric_learning_pairce_test_0_58_6folds04_04_21
hgnetv2_b5_metric_learning_pairce_test_0_58_6folds03_17_35
Skipping existing hgnetv2_b5_metric_learning_pairce_test_0_58_6folds03_17_35
hgnetv2_b5_metric_learning_pairce_6folds04_13_03
Skipping existing hgnetv2_b5_metric_learning_pairce_6folds04_13_03
hgnetv2_b5_metric_learning_pairce_n_6folds04_18_26
Reading clique2tracks


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:24<00:00,  7.06s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:22<00:00,  6.90s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:21<00:00,  6.81s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:21<00:00,  6.82s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:2

hgnetv2_b5_metric_learning_even_bigger_margin_drop_cliques_test_0_6_6folds02_21_14
Skipping existing hgnetv2_b5_metric_learning_even_bigger_margin_drop_cliques_test_0_6_6folds02_21_14
hgnetv2_b5_metric_learning_pairce_6folds04_04_29
Skipping existing hgnetv2_b5_metric_learning_pairce_6folds04_04_29


In [20]:
data_raw_paths = glob.glob("results/*")

In [21]:
def save_tracks_to_file(data, output_path):
    df = pd.DataFrame(
        {
            'query_trackid': list(data.keys()),
            'track_ids': [
                ' '.join(map(lambda x: str(int(x)), track_ids)) for track_ids in data.values()
            ],
        }
    )
    df['output'] = df['query_trackid'].astype(str) + ' ' + df['track_ids'].astype(str)
    df.sort_values("query_trackid", inplace=True)
    df[['output']].to_csv(output_path, index=False, header=False)

In [47]:
def adjust_score(df):
    assert 165_510_000 == df.shape[0], f"{df.shape[0]}"
    return df.with_columns(
        (pl.col("metric") * (pl.col("candidate_score") + 1) / 2).alias("candidate_score")
    )

def groupby_score(df, rank_fn=None):
    res = (
        df.group_by(["track_id", "candidate_id"])
        .agg([
            pl.sum("candidate_score").alias("candidate_score_sum"), 
            pl.count("candidate_score").alias("candidate_cnt"),
        ])
        .with_columns(
            ((pl.col("candidate_score_sum").rank("dense", descending=True).over("track_id") - 1) / 6).alias("rank_sum")
        )  
        .drop(["candidate_score_sum"])
    )
    res = res.with_columns(
        pl.lit(1).alias("candidate_cnt"),
    )
    if rank_fn is not None:
        res = res.with_columns(
            rank_fn(pl.col("rank_sum")).alias("rank_sum")
        )
    return res.select(["track_id", "candidate_id", "rank_sum", "candidate_cnt"])


def update_score(df1, df2):
    return (
        pl.concat([df1, df2])
        .group_by(["track_id", "candidate_id"])
        .agg([
            pl.sum("rank_sum").alias("rank_sum"),
            pl.sum("candidate_cnt").alias("candidate_cnt"),
        ])
        .select(["track_id", "candidate_id", "rank_sum", "candidate_cnt"])
    )

In [91]:
def take_top(name2metric, topk=1):
    return dict(sorted(name2metric.items(), key=lambda x: -x[1])[:topk])

In [93]:
name2metric_copy = copy.deepcopy(name2metric)

In [94]:
TOP = 12

In [95]:
total_df = None
for model_name, score in tqdm(take_top(name2metric_copy, TOP).items()):
    data = groupby_score(
        adjust_score(pl.read_parquet(f"{RESULTS}/{model_name}.pq")), 
        lambda x: x,
    )
    if total_df is None:
        total_df = data
    else:
        total_df = update_score(total_df, data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [05:43<00:00, 28.64s/it]


In [96]:
total_df = total_df.with_columns(
    ((pl.col("rank_sum") + POWER * (len(name2metric_copy) - pl.col("candidate_cnt"))) / len(name2metric_copy)).alias("mean_rank")
)

In [97]:
total_df_sorted = total_df.sort(["track_id", "mean_rank"])

top_100_df = (
    total_df_sorted
    .group_by("track_id")
    .head(100)
)

In [98]:
top_100_df

track_id,candidate_id,rank_sum,candidate_cnt,mean_rank
i64,i64,f64,i32,f64
1,44929,20.833333,12,185.307018
1,27655,25.666667,12,185.561404
1,12945,35.666667,12,186.087719
1,39321,40.833333,12,186.359649
1,47143,40.833333,12,186.359649
…,…,…,…,…
55170,12795,411.666667,11,232.192982
55170,3535,431.5,11,233.236842
55170,13038,448.5,11,234.131579
55170,51742,451.666667,11,234.298246


In [99]:
data = defaultdict(list)
for track_id, candidate_id in top_100_df.select(["track_id", "candidate_id"]).iter_rows():
    data[track_id].append(candidate_id)

data = dict(data)

In [100]:
save_tracks_to_file(data, "winner_solutions/blend_v25.csv")

In [39]:
def get_neighbors(df_path):
    df = pd.read_csv(df_path, header=None)
    track2sourceneighbors = {}
    for row in df.itertuples(index=False):
        track, *original_neighbors = map(int, row[0].split())
        track2sourceneighbors[track] = original_neighbors
    return track2sourceneighbors

CANDIDATES_FROM_FOLDS = TOP_K = 100

def reduce_by_subm(fold_top_tracks, importance_ranks=None):
    if importance_ranks is None:
        importance_ranks = [0] * len(fold_top_tracks)
    ranks = {}
    counter = {}
    for ir, fold in zip(importance_ranks, fold_top_tracks):
        for track_id, top_tracks in fold.items():
            if track_id not in ranks:
                ranks[track_id] = defaultdict(int)
                counter[track_id] = defaultdict(int)
            for rank, recommended_track in enumerate(top_tracks):
                ranks[track_id][recommended_track] += ir + rank
                counter[track_id][recommended_track] += 1

    candidates_with_score = {}
    for track_id, candidates_with_rank in ranks.items():
        if track_id not in candidates_with_score:
            candidates_with_score[track_id] = defaultdict(float)
        for recommended_track, total_rank in candidates_with_rank.items():
            cnt = counter[track_id][recommended_track]
            total_rank += CANDIDATES_FROM_FOLDS * (len(fold_top_tracks) - cnt)
            candidates_with_score[track_id][recommended_track] = total_rank

    res = {}
    for track_id, cws in candidates_with_score.items():
        best_tracks = [x[0] for x in sorted(cws.items(), key=lambda x: x[1])[:TOP_K]]
        res[track_id] = best_tracks

    return res

In [89]:
data = reduce_by_subm([
    get_neighbors("winner_solutions/blend_v24.csv"),
    get_neighbors("winner_solutions/true_blend_v21.csv"),
    get_neighbors("winner_solutions/true_blend_v23.csv"),
    get_neighbors("winner_solutions/true_blend_v16_500.csv"),
    get_neighbors("winner_solutions/true_blend_v15.csv"),
    get_neighbors("winner_solutions/true_blend_v22.csv"), # best
])

In [105]:
data = reduce_by_subm([
    get_neighbors("winner_solutions/blend_v24.csv"),
    get_neighbors("winner_solutions/true_blend_v21.csv"),
    get_neighbors("winner_solutions/true_blend_v23.csv"),
    get_neighbors("winner_solutions/true_blend_v16_500.csv"),
    get_neighbors("winner_solutions/true_blend_v15.csv"),
    get_neighbors("winner_solutions/blend_v25.csv"),
    get_neighbors("winner_solutions/true_blend_v22.csv"), # best
])

In [107]:
data = reduce_by_subm([
    get_neighbors("winner_solutions/true_blend_v25.csv"),
    get_neighbors("winner_solutions/true_blend_v24.csv"), # best
])

In [108]:
save_tracks_to_file(data, "winner_solutions/true_blend_v25.csv")

In [120]:
pd.read_csv("winner_solutions/true_blend_v25.csv", header=None).head(6)

Unnamed: 0,0
0,1 44929 47143 27655 12945 39321 45217 18617 23...
1,2 20243 25683 4190 27920 43923 267 45008 18663...
2,3 38279 36812 13587 23722 1180 566 15846 27249...
3,4 25335 21167 21241 30752 47806 25663 9738 208...
4,5 51064 16196 40623 3449 13022 53839 22657 294...
5,6 44178 39897 47611 37213 27362 46376 23198 51...
