In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
import sys
from pathlib import Path
sys.path.append("../")

In [6]:
def init_cfg(cfg_path):
    from hydra import compose, initialize
    from omegaconf import OmegaConf

    with initialize(version_base=None, config_path=str(Path(cfg_path).parent)):
        try:
            cfg = compose(config_name="config", overrides=["+read_filtered_clique2versions=null"])
        except:
            cfg = compose(config_name="config", overrides=["read_filtered_clique2versions=null"])

    checkpoints_folder = "artifacts_" + str(Path(cfg_path).parent.parent.name) + "/model_checkpoints"
    cfg["path_to_fold_checkpoints"] = checkpoints_folder
    cfg["read_filtered_clique2versions"] = None
    cfg["environment"]["device"] = "cuda:2"
    return cfg

In [7]:
init_cfg("final_artifacts/hgnetv2_b5_drop_cliques_test_0_6_6folds01_11_06/hydra/config/")

MissingConfigException: Primary config directory not found.
Check that the config directory '/home/yskhnykov/yandex_cup/final_artifacts/hgnetv2_b5_drop_cliques_test_0_6_6folds01_11_06/hydra/config' exists and readable

In [5]:
import gc
import logging

import hydra
import numpy as np
import torch
import torchinfo
from dotenv import load_dotenv
from hydra.utils import call, instantiate
from omegaconf import DictConfig, OmegaConf
from torch.cuda.amp import GradScaler

from csi.base.utils import init_model, seed_everything
from csi.submission import make_submission
from csi.training.data.dataset import filter_tracks
from csi.training.loop.loop import train_one_epoch
from csi.training.loop.utils import (
    clean_old_content,
    freeze_layers,
    load_fold_checkpoint,
    save_checkpoint,
)
from csi.training.metrics.ndcg import compute_ndcg
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader

from csi.base.model.predict import predict
from csi.base.utils import batch_to_device
from csi.training.loop.utils import split_by_batch_size
from tqdm import tqdm
from collections import defaultdict

logger = logging.getLogger(__name__)
logger.info = print

In [6]:
seed_everything(cfg.environment.seed)

logger.info("Reading clique2tracks")
clique2tracks = call(
    cfg.read_clique2versions,
    _convert_="partial",
)

filtered_clique2tracks = call(
    cfg.read_filtered_clique2versions,
    _convert_="partial",
)

cliques_splits = call(cfg.split_cliques, clique2tracks, _convert_="partial")
metrics = []
# clean_old_content(cfg.training.checkpoint_dir)
clique2similarities = {}
for fold, (
    (train_track2clique, train_clique2tracks),
    (val_track2clique, val_clique2tracks),
) in enumerate(cliques_splits):
    if filtered_clique2tracks is not None:
        train_track2clique, train_clique2tracks = filter_tracks(
            filtered_clique2tracks, train_clique2tracks
        )
    val_dataset = call(
        cfg.val_data.dataset,
        tracks_ids=list(val_track2clique),
        track2clique=val_track2clique,
        clique2tracks=val_clique2tracks,
        _convert_="partial",
    )

    val_loader = instantiate(cfg.val_data.dataloader, val_dataset, _convert_="partial")

    model = init_model(cfg).to(cfg.environment.device)
    if fold == 0:
        logger.info(f"Model arhitecture:\n{model}")
        logger.info(f"Model summary:\n{torchinfo.summary(model)}")

    if cfg.path_to_fold_checkpoints is not None:
        model = load_fold_checkpoint(model, cfg.path_to_fold_checkpoints, fold)

    if cfg.freeze_backbone_num_layers is not None:
        model = freeze_layers(model, cfg.freeze_backbone_num_layers)


    embeddings, cliques, track_ids, num_other_tracks_in_clique = [], [], [], []

    for batch in tqdm(val_loader):
        batch = batch_to_device(batch, cfg.environment.device)
        outs = predict(model, batch)
        embs = outs["embedding"]
        embeddings.append(embs)
        cliques.append(batch["clique"].reshape(-1, 1))
        num_other_tracks_in_clique.append(batch["num_other_tracks_in_clique"].reshape(-1, 1))
    
    embeddings = torch.vstack(embeddings).detach().cpu().numpy()
    cliques = torch.vstack(cliques).detach().cpu().numpy()
    num_other_tracks_in_clique = torch.vstack(num_other_tracks_in_clique).detach().cpu().numpy()

    clique2tracks = defaultdict(list)
    for i, clique in enumerate(cliques.flatten()):
        clique2tracks[int(clique)].append(i)
    clique2tracks = {k: torch.LongTensor(sorted(v)) for k, v in clique2tracks.items()}

    
    for clique, tracks in clique2tracks.items():
        e = embeddings[tracks] / np.linalg.norm(embeddings[tracks], ord=2, axis=1, keepdims=True)
        similarities = e @ e.T
        np.fill_diagonal(similarities, 0.0)
        clique2similarities[clique] = similarities


# test_dataset = call(
#     cfg.test_data.dataset,
#     tracks_ids=np.load(cfg.test_data.test_ids_path),
#     track2clique=None,
#     clique2tracks=None,
#     _convert_="partial",
# )
# test_loader = instantiate(cfg.test_data.dataloader, test_dataset, _convert_="partial")

# make_submission(cfg, model, test_loader)

# logger.info("Pushing to mlflow")
# call(cfg.mlflow.push, cfg, metrics)

Reading clique2tracks
Model arhitecture:
TimmModelWOLayerNorm(
  (model): HighPerfGpuNet(
    (stem): StemV2(
      (stem1): ConvBNAct(
        (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): ReLU()
        (lab): Identity()
      )
      (stem2a): ConvBNAct(
        (conv): Conv2d(32, 16, kernel_size=(2, 2), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): ReLU()
        (lab): Identity()
      )
      (stem2b): ConvBNAct(
        (conv): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act): ReLU()
        (lab): Identity()
      )
      (stem3): ConvBNAct(
        (conv): Conv2d(64, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bia

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 270/270 [01:04<00:00,  4.17it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:54<00:00,  4.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 264/264 [00:56<00:00,  4.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 277/277 [00:58<00:00,  4.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 262/262 [00:5

In [7]:
clique2similarities

41616

In [50]:
clique2similarities[14955].nondiag()

AttributeError: 'numpy.ndarray' object has no attribute 'nondiag'

In [85]:
clique2min_sim = {}
for clique, similarities in clique2similarities.items():
    clique2min_sim[clique] = similarities.min(), similarities.mean(), similarities.max()

bad_cliques = []
for clique, st in clique2min_sim.items():
    min_, mean, max_ = st
    if mean < 0.38:
        bad_cliques.append(clique)

In [86]:
len(bad_cliques)

2427

In [87]:
import pandas as pd
import ast
def read_clique2versions(clique2versions_path: str) -> dict[int, list[int]]:
    import ast
    df = pd.read_csv(clique2versions_path, sep="\t", converters={"versions": ast.literal_eval})
    clique2tracks = df.set_index("clique")["versions"].to_dict()
    return clique2tracks

In [88]:
df = pd.read_csv("/home/yskhnykov/yandex_cup/data/raw/cliques2versions_cleaned_axis_0_1_3.tsv", sep="\t", converters={"versions": ast.literal_eval})

In [89]:
df[~df["clique"].isin(bad_cliques)].to_csv("/home/yskhnykov/yandex_cup/data/raw/cliques2versions_drop_cliques_2_5k.tsv", sep="\t", index=False)

In [5]:
import pandas as pd
import ast

In [6]:
f = pd.read_csv("/home/yskhnykov/yandex_cup/data/raw/cliques2versions_drop_cliques_2_5k.tsv", sep="\t", converters={"versions": ast.literal_eval})

In [7]:
f

Unnamed: 0,clique,versions
0,39475,"[343223, 361210, 114472, 134744, 271362, 30747..."
1,20077,"[343224, 350590, 170706, 266043, 314556, 30764..."
2,22290,"[343225, 343986, 344624, 345116, 345312, 33796..."
3,17098,"[343226, 220430]"
4,41075,"[343228, 182973]"
...,...,...
39177,20120,"[103390, 71338]"
39178,16898,"[70624, 76088]"
39179,31616,"[70632, 76025]"
39180,40137,"[71481, 73797]"


In [60]:
len(clique2min_sim)

41616