In [1]:
import pandas as pd
import os
import numpy as np
from glob import glob
import zipfile
import tqdm
import gc
import nmslib
from collections import defaultdict, Counter
import copy
import joblib
import pickle

Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [2]:
# Number of tags per track: read_net_predictions labels every track's
# prediction values with the tag ids "0" .. str(NUM_TAGS - 1).
NUM_TAGS = 256

In [3]:
def seed_everything(seed=0):
    """Seed every RNG used by the notebook and make cuDNN deterministic.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random
    import os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects subprocesses / future interpreters: the hash seed of the
    # running interpreter is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

In [4]:
def extract_name(f):
    """Return the part of ``f.name`` after the last "/" and before the first "."."""
    # Drop any directory prefix, keeping only the final path component.
    base = f.name.rpartition("/")[2]
    # Keep everything before the first dot (strips all extensions).
    stem = base.partition(".")[0]
    return stem


def s3_objects(s3_client, bucket_name, keys, paths):
    """Yield zip-archive sources, either local paths or in-memory S3 payloads.

    Args:
        s3_client: Object exposing ``get_object(Bucket=..., Key=...)`` whose
            result has a readable ``"Body"``; only used when ``paths`` is None.
        bucket_name: Bucket passed to ``get_object``.
        keys: Iterable of object keys to download when ``paths`` is None.
        paths: Iterable of local paths; when given, it takes precedence and
            its items are yielded unchanged.

    Yields:
        Each item of ``paths``, or one ``io.BytesIO`` per downloaded key.
    """
    # Imported locally: the notebook's import cell does not import ``io``,
    # so the S3 branch would otherwise fail with NameError.
    import io

    if paths is not None:
        yield from paths
    else:
        for key in keys:
            payload = s3_client.get_object(Bucket=bucket_name, Key=key)["Body"].read()
            yield io.BytesIO(payload)
            

def load_tag_data(s3_client=None, bucket_name=None, keys=None, paths=None):
    """Load the train/test CSV members of one or more zip archives.

    Archives come from ``s3_objects`` (local ``paths`` if given, otherwise S3
    ``keys``). Every member whose name ends in "train.csv" or "test.csv" is
    parsed with ``pd.read_csv``.

    Returns:
        dict mapping ``extract_name(member)`` to the parsed DataFrame; a later
        member with the same extracted name overwrites an earlier one.
    """
    wanted_suffixes = ("train.csv", "test.csv")
    frames = {}
    for archive_source in s3_objects(s3_client, bucket_name, keys, paths):
        with zipfile.ZipFile(archive_source) as archive:
            for member in archive.namelist():
                if not member.endswith(wanted_suffixes):
                    continue
                with archive.open(member) as handle:
                    frames[extract_name(handle)] = pd.read_csv(handle)
    return frames

In [5]:
# Notebook configuration.
cfg = {
    # Directory containing data.zip. Override with the YCUP_DATA_PATH
    # environment variable to run on another machine; the default is the
    # original author's local path.
    "data_path": os.environ.get(
        "YCUP_DATA_PATH",
        "/Users/yaroslav.hnykov/Desktop/Study/VCS/YandexCUP2023/ML/RecSys/input_data/",
    ),
    # Number of validation folds whose prediction files are read.
    "n_splits": 3,
    # Seed passed to seed_everything.
    "seed": 11,
    # Network config version used to select prediction files (cfg=<version>).
    "version": 6,
    # Only prediction files with a score strictly above this are used.
    "score_threshold": 0.0
}

In [6]:
# Parse the *train.csv / *test.csv members of data.zip into a dict of
# DataFrames keyed by the name returned from extract_name.
tag_data = load_tag_data(paths=[os.path.join(cfg["data_path"], "data.zip")])

In [7]:
# Seed the Python, NumPy and torch RNGs with the configured seed.
seed_everything(cfg["seed"])

In [8]:
def get_info(name):
    """Parse metadata out of a prediction file name.

    The name is split on "__" and fields are read by position, e.g.
    ``prediction__cfg=6__fold_idx=0__epoch=11__score=0.23483.csv``:
    field 1 holds the config version, field 2 the fold index and field 4 the
    score followed by a file extension.

    Returns:
        dict with keys "cfg_version" (int), "score" (float), "fold_idx" (int).

    Raises:
        IndexError: if a field is missing or has no "=".
        ValueError: if a value cannot be converted to a number.
    """
    fields = name.split("__")

    def _value(position):
        # Text after the first "=" of the given "__"-separated field.
        return fields[position].split("=")[1]

    return {
        "cfg_version": int(_value(1)),
        # Everything after the last "." is treated as the file extension.
        "score": float(_value(4).rsplit(".", 1)[0]),
        "fold_idx": int(_value(2)),
    }


def read_net_predictions(cfg_version, score_threshold=0, fold_idx=tuple(range(10000)), testing=False):
    """Average saved network predictions and return them in long format.

    Scans "../predictions_test/" (``testing=True``) or "../predictions_val/"
    for files whose name, as parsed by ``get_info``, matches ``cfg_version``,
    has a score strictly greater than ``score_threshold`` and a fold index
    contained in ``fold_idx``. Each file is read as a CSV with a "track"
    column and a "prediction" column holding a textual sequence of numbers;
    the per-track vectors are averaged over all selected files.

    Args:
        cfg_version: Config version the file name must carry.
        score_threshold: Exclusive lower bound on the score in the file name.
        fold_idx: Container of accepted fold indices.
        testing: Read test predictions instead of validation predictions.

    Returns:
        DataFrame with columns "track" (int), "prediction" (float) and "tag"
        (str, "0" .. str(NUM_TAGS - 1)), one row per (track, tag), ordered by
        track. The index repeats each track's position NUM_TAGS times.

    Raises:
        ValueError: if no file could be used, if the selected files do not
            cover identical tracks, or if the prediction vectors do not have
            NUM_TAGS entries.
    """
    path = "../predictions_test/" if testing else "../predictions_val/"
    predictions = None
    tracks = None
    total = 0
    # Sorted so the accumulation order (and hence the floating-point result)
    # does not depend on the arbitrary order returned by os.listdir.
    for name in sorted(os.listdir(path)):
        try:
            info = get_info(name)
        except (IndexError, ValueError):
            # Not a prediction file (e.g. .DS_Store, .ipynb_checkpoints).
            continue
        if not (
            info["cfg_version"] == cfg_version
            and info["score"] > score_threshold
            and info["fold_idx"] in fold_idx
        ):
            continue

        print(f"Processing {path + name}")
        try:
            frame = pd.read_csv(path + name).sort_values(by=["track"])
        except pd.errors.ParserError:
            # Best effort: an unparsable file is skipped, not fatal.
            print(f"Skipping unparsable file {path + name}")
            continue

        # NOTE(review): eval executes arbitrary code found in the CSV. Only
        # run this on prediction files you produced yourself; consider
        # ast.literal_eval if the stored format is a plain Python literal.
        values = np.asarray(frame["prediction"].apply(eval).apply(list).tolist(), dtype=float)
        file_tracks = frame["track"].to_numpy()

        if predictions is None:
            predictions = values
            tracks = file_tracks
        else:
            # Averaging is only meaningful row-by-row over the same tracks.
            if not np.array_equal(tracks, file_tracks):
                raise ValueError(f"Tracks in {path + name} differ from previously read files")
            predictions += values
        total += 1

    if total == 0:
        raise ValueError(
            f"No usable prediction files in {path!r} for cfg_version={cfg_version}, "
            f"score_threshold={score_threshold}"
        )

    mean_predictions = predictions / total
    if mean_predictions.ndim != 2 or mean_predictions.shape[1] != NUM_TAGS:
        raise ValueError(
            f"Expected {NUM_TAGS} predictions per track, got array of shape {mean_predictions.shape}"
        )

    n_tracks = mean_predictions.shape[0]
    # Build the long table directly: row-major ravel keeps each track's
    # NUM_TAGS values contiguous, matching the repeated track / tag columns.
    res = pd.DataFrame(
        {
            "track": np.repeat(tracks, NUM_TAGS),
            "prediction": mean_predictions.ravel(),
            "tag": list(map(str, range(NUM_TAGS))) * n_tracks,
        },
        index=np.repeat(np.arange(n_tracks), NUM_TAGS),
    )
    return res.astype({"track": int, "tag": str, "prediction": float})

In [17]:
def get_net_features(tag_data, cfg):
    """Build the train and test network-prediction feature tables.

    Train features are the validation predictions of each fold
    ``0 .. cfg["n_splits"] - 1`` stacked vertically; test features are the
    test predictions read with the default fold selection. Both use
    ``cfg["version"]`` and ``cfg["score_threshold"]`` as file filters.

    Args:
        tag_data: Not used by this function.
        cfg: Configuration dict with "version", "score_threshold", "n_splits".

    Returns:
        Tuple ``(train_net_features, test_net_features)`` of DataFrames, each
        with a fresh 0..n-1 index.
    """
    print("Train features processing ...")
    per_fold_frames = []
    for fold in range(cfg["n_splits"]):
        fold_frame = read_net_predictions(
            cfg["version"], cfg["score_threshold"], fold_idx=[fold], testing=False
        )
        per_fold_frames.append(fold_frame)
    train_net_features = pd.concat(per_fold_frames, axis=0).reset_index(drop=True)

    print("Test features processing ...")
    test_predictions = read_net_predictions(cfg["version"], cfg["score_threshold"], testing=True)
    test_net_features = test_predictions.reset_index(drop=True)

    return train_net_features, test_net_features

In [18]:
# Averaged network predictions: per-fold validation files stacked for train,
# test files averaged with the default fold selection for test.
net_features_train, net_features_test = get_net_features(tag_data, cfg)

Train features processing ...
Processing ../predictions_val/prediction__cfg=6__fold_idx=0__epoch=11__score=0.23483.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=0__epoch=12__score=0.23398.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=0__epoch=9__score=0.23714.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=0__epoch=10__score=0.23640.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=1__epoch=11__score=0.23857.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=1__epoch=10__score=0.23721.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=1__epoch=9__score=0.23753.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=1__epoch=8__score=0.23623.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=2__epoch=11__score=0.24056.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=2__epoch=9__score=0.24045.csv
Processing ../predictions_val/prediction__cfg=6__fold_idx=2__epoch=10__score=0.24262

In [21]:
# Persist both feature tables as parquet without the DataFrame index.
# The target file names carry the config version and no file extension.
net_features_train.to_parquet(f"../second_stage/net_features/train_net_features_cfg={cfg['version']}", index=False)
net_features_test.to_parquet(f"../second_stage/net_features/test_net_features_cfg={cfg['version']}", index=False)