In [1]:
import pandas as pd
import os
import numpy as np
from glob import glob
import zipfile
import tqdm
import gc
import nmslib
from collections import defaultdict, Counter
import copy
import joblib
import pickle

Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [2]:
def seed_everything(seed=0):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to `random`, NumPy and PyTorch (CPU and CUDA).

    Side effects:
        Sets ``os.environ["PYTHONHASHSEED"]`` and the global cuDNN flags.
    """
    import random
    import os
    import numpy as np
    import torch

    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # influences child processes started after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current device (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick possibly different algorithms
    # from run to run, which undermines reproducibility; it must be disabled.
    torch.backends.cudnn.benchmark = False

In [3]:
def extract_name(f):
    """Return the base name of `f.name` up to (not including) its first dot.

    Anything before the last "/" in `f.name` is discarded first.
    """
    basename = f.name.rpartition("/")[2]
    stem, _, _ = basename.partition(".")
    return stem


def s3_objects(s3_client, bucket_name, keys, paths):
    """Yield archive sources either from local paths or from S3.

    Args:
        s3_client: Client exposing ``get_object(Bucket=..., Key=...)``; only
            used when `paths` is None.
        bucket_name: Bucket passed to ``get_object``.
        keys: Iterable of object keys to download when `paths` is None.
        paths: Iterable of local paths. When given, it takes precedence and the
            S3 arguments are ignored.

    Yields:
        Each entry of `paths` unchanged, or an in-memory ``io.BytesIO`` holding
        the full body of each S3 object.
    """
    # `io` is not imported at the top of the notebook; import it here so the
    # S3 branch does not fail with NameError.
    import io

    if paths is not None:
        yield from paths
    else:
        for key in keys:
            body = s3_client.get_object(Bucket=bucket_name, Key=key)["Body"].read()
            yield io.BytesIO(body)
            

def load_track_embeddings(s3_client=None, bucket_name=None, keys=None, paths=None):
    """Read every ``.npy`` member of the given zip archives into a dict.

    The archives come from `s3_objects` (local `paths` or S3 `keys`). Each
    member's base name, converted to int, is used as the dict key and the
    loaded array as the value.
    """
    embeds_by_track = {}
    for archive_source in s3_objects(s3_client, bucket_name, keys, paths):
        with zipfile.ZipFile(archive_source) as archive:
            npy_members = [m for m in archive.namelist() if m.endswith(".npy")]
            for member in npy_members:
                with archive.open(member) as member_file:
                    track_id = int(extract_name(member_file))
                    embeds_by_track[track_id] = np.load(member_file)
    return embeds_by_track


def load_tag_data(s3_client=None, bucket_name=None, keys=None, paths=None):
    """Load the train/test CSV tables found inside the given zip archives.

    Returns a dict keyed by each CSV member's base name (as produced by
    `extract_name`) whose values are the parsed DataFrames. Only members whose
    path ends with "train.csv" or "test.csv" are read.
    """
    wanted_suffixes = ("train.csv", "test.csv")
    tables = {}
    for archive_source in s3_objects(s3_client, bucket_name, keys, paths):
        with zipfile.ZipFile(archive_source) as archive:
            for member in archive.namelist():
                if not member.endswith(wanted_suffixes):
                    continue
                with archive.open(member) as member_file:
                    tables[extract_name(member_file)] = pd.read_csv(member_file)
    return tables


def load_data(base_path):
    """Load tag tables and track embeddings from a local data directory.

    Reads ``data.zip`` and ``track_embeddings/dir_001.zip`` through
    ``dir_008.zip`` below `base_path`.

    Returns:
        ``(tag_data, track_idx2embeds)`` as produced by `load_tag_data` and
        `load_track_embeddings`.
    """
    tags_archive = os.path.join(base_path, "data.zip")
    embedding_archives = []
    for i in range(1, 9):
        embedding_archives.append(
            os.path.join(base_path, "track_embeddings", f"dir_00{i}.zip")
        )

    tag_data = load_tag_data(paths=[tags_archive])
    track_idx2embeds = load_track_embeddings(paths=embedding_archives)
    return tag_data, track_idx2embeds

In [4]:
# Number of candidate tag ids; get_flatten_dataset enumerates them as
# str(0) .. str(NUM_TAGS - 1) for every track.
NUM_TAGS = 256
# Prefix lengths (in embedding rows) over which a track's embeddings are
# averaged. Slicing clamps to the track length, so 10000 presumably means
# "the whole track" — TODO confirm no track is longer than that.
LENGTHS = (16, 32, 40, 64, 128, 10000)
# Neighbourhood sizes for the kNN tag-share features; the largest value
# matches the k=300 used by the kNN query.
N_NEIGHBORS = (8, 16, 32, 64, 128, 300)

In [5]:
# Seed all RNGs (default seed 0) before any data is loaded or processed.
seed_everything()

In [6]:
# Directory holding data.zip and track_embeddings/. Override it with the
# INPUT_DATA_DIR environment variable; the default keeps the original author's
# local path so existing runs behave exactly as before.
INPUT_DATA_DIR = os.environ.get(
    "INPUT_DATA_DIR",
    "/Users/yaroslav.hnykov/Desktop/Study/VCS/YandexCUP2023/ML/RecSys/input_data/",
)
tag_data, track_idx2embeds = load_data(INPUT_DATA_DIR)

In [7]:
def normalize(a):
    """Divide `a` by its norm as computed by ``np.linalg.norm`` with defaults."""
    magnitude = np.linalg.norm(a)
    return a / magnitude


def get_embed_features(track_idx2embeds, track_idx):
    """Compute prefix-averaged embeddings and the length for a single track.

    For every prefix length in LENGTHS the first rows of the track's embedding
    array are averaged along axis 0 and normalised with `normalize`.

    Returns:
        dict with "truncated_embeds" (prefix length -> normalised mean vector)
        and "length" (``len`` of the track's embedding array).
    """
    frames = track_idx2embeds[track_idx]
    prefix_means = {}
    for prefix_len in LENGTHS:
        prefix_means[prefix_len] = normalize(frames[:prefix_len].mean(axis=0))
    return {"truncated_embeds": prefix_means, "length": len(frames)}

In [8]:
def get_track_embed_features(tag_data, embeds_by_track=None):
    """Compute `get_embed_features` for every track listed in `tag_data`.

    Args:
        tag_data: DataFrame with a "track" column of track ids.
        embeds_by_track: Mapping track id -> embedding array. When omitted, the
            module-level `track_idx2embeds` is used, as before; passing it
            explicitly removes the hidden dependency on that global (which the
            notebook deletes later to free memory).

    Returns:
        dict mapping track id -> feature dict from `get_embed_features`.
    """
    if embeds_by_track is None:
        # Backward-compatible fallback: resolved at call time, so it raises
        # NameError if the global has already been deleted.
        embeds_by_track = track_idx2embeds
    return {
        track_idx: get_embed_features(embeds_by_track, track_idx)
        for track_idx in tqdm.tqdm(tag_data["track"])
    }

In [9]:
# Per-track embedding features for both splits. This must run while the
# global `track_idx2embeds` still exists: get_track_embed_features reads it.
train_embed_features = get_track_embed_features(tag_data["train"])
test_embed_features = get_track_embed_features(tag_data["test"])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51134/51134 [02:00<00:00, 423.20it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25580/25580 [02:09<00:00, 196.78it/s]


In [10]:
# The raw embedding arrays are not used after feature extraction; drop the
# reference and trigger a collection to release memory.
del track_idx2embeds
gc.collect()

0

In [11]:
def get_flatten_dataset(df, testing=False):
    """Expand each row of `df` into one record per candidate tag.

    Every input row is repeated NUM_TAGS times with a "tag" column holding
    str(0) .. str(NUM_TAGS - 1). Unless `testing` is true, the comma-separated
    "tags" column is turned into a binary "target" (1 when the candidate tag is
    among the row's tags) and then dropped.

    Returns:
        list of dicts (``DataFrame.to_dict(orient="records")``).
    """
    candidate_tags = [str(tag_id) for tag_id in range(NUM_TAGS)]

    flat = df.copy()
    flat["tag"] = [candidate_tags] * len(flat)
    flat = flat.explode("tag")

    if testing:
        return flat.to_dict(orient="records")

    def _is_positive(row):
        return str(row["tag"]) in row["tags"]

    flat["tags"] = flat["tags"].str.split(",")
    flat["target"] = flat.apply(_is_positive, axis=1).astype(int)
    flat = flat.drop(columns=["tags"])
    return flat.to_dict(orient="records")

In [12]:
def get_tags_features(df):
    """Compute per-tag popularity and co-occurrence statistics.

    Args:
        df: DataFrame with a "tags" column of comma-separated tag ids.

    Returns:
        dict feature name -> {tag: value} with these features:
        - "pop_rank": rank of the tag by frequency (1 = most frequent).
        - "pop_rank_upper_{k}" / "pop_rank_lower_{k}" for k in 1..3: ratio of
          the tag's count to the count of the tag k positions earlier in the
          ascending-by-count order, and its reciprocal. Tags without such a
          neighbour are omitted.
        - "individuality_{mean,std,max,min}": statistics of the number of tags
          on the tracks that carry the tag.
    """
    exploded = df.copy()
    exploded["tags"] = exploded["tags"].str.split(",")
    exploded["n_tags"] = exploded["tags"].str.len()
    exploded = exploded.explode("tags")

    # Ascending tag frequencies; shift(k) aligns each tag with the tag that is
    # k positions less popular.
    tag_cnt = exploded.groupby("tags").size().sort_values()

    tags_features = {"pop_rank": tag_cnt.rank(ascending=False).to_dict()}
    for step in (1, 2, 3):
        # The original computed "pop_rank_upper_2" with shift(3), making it a
        # duplicate of "pop_rank_upper_3"; every pair now uses its own step.
        shifted_cnt = tag_cnt.shift(step)
        tags_features[f"pop_rank_upper_{step}"] = (tag_cnt / shifted_cnt).dropna().to_dict()
        tags_features[f"pop_rank_lower_{step}"] = (shifted_cnt / tag_cnt).dropna().to_dict()

    n_tags_by_tag = exploded.groupby("tags")["n_tags"]
    tags_features["individuality_mean"] = n_tags_by_tag.mean().to_dict()
    tags_features["individuality_std"] = n_tags_by_tag.std().to_dict()
    tags_features["individuality_max"] = n_tags_by_tag.max().to_dict()
    tags_features["individuality_min"] = n_tags_by_tag.min().to_dict()
    return tags_features

In [13]:
def build_knn_indexes(train_embed_features):
    """Build one nmslib HNSW index per truncation length.

    Args:
        train_embed_features: dict track id -> feature dict containing
            "truncated_embeds" (length -> vector).

    Returns:
        ``(knn_indexes, simpleidx2trackidx)``: the indexes keyed by truncation
        length, and the mapping from a vector's row position in the index to
        its track id (iteration order of `train_embed_features`).
    """
    simpleidx2trackidx = dict(enumerate(train_embed_features))

    vectors_by_length = defaultdict(list)
    for track_features in train_embed_features.values():
        for length, vector in track_features["truncated_embeds"].items():
            vectors_by_length[length].append(vector)

    # Stack every space up front, before any index is built.
    matrices = {}
    for length, vectors in vectors_by_length.items():
        matrices[length] = np.vstack(vectors)

    index_params = {"M": 16,  "efConstruction": 100, "post": 2}
    knn_indexes = {}
    for length, matrix in matrices.items():
        hnsw = nmslib.init(method="hnsw", space="negdotprod")
        hnsw.addDataPointBatch(matrix)
        hnsw.createIndex(index_params, print_progress=True)
        hnsw.setQueryTimeParams({"ef": 90})
        knn_indexes[length] = hnsw

    return knn_indexes, simpleidx2trackidx

In [14]:
def find_nearest_neighbors(track_embed_features, knn_indexes, simpleidx2trackidx, k=300):
    """Query every kNN index for the neighbours of each track.

    Args:
        track_embed_features: dict track id -> feature dict containing
            "truncated_embeds" (length -> query vector).
        knn_indexes: dict truncation length -> index exposing ``knnQuery``.
        simpleidx2trackidx: mapping from index row position to track id.
        k: Number of neighbours requested from each index. Defaults to 300,
            the value that was previously hard-coded.

    Returns:
        dict track id -> {length: [neighbour track ids]}, in the order returned
        by the index. The query track itself is removed, so a track that is
        part of the index can end up with fewer than `k` neighbours.
    """
    nearest_neighbors = {}
    for track_idx, vfeatures in tqdm.tqdm(track_embed_features.items()):
        neighbors_by_length = {}
        for length, knn_index in knn_indexes.items():
            query_vector = vfeatures["truncated_embeds"][length]
            # Element 0 of the knnQuery result holds the neighbour positions.
            neighbor_positions = knn_index.knnQuery(query_vector, k=k)[0]
            # One mapping lookup per neighbour instead of two.
            candidate_tracks = (simpleidx2trackidx[pos] for pos in neighbor_positions)
            neighbors_by_length[length] = [
                candidate for candidate in candidate_tracks if candidate != track_idx
            ]
        nearest_neighbors[track_idx] = neighbors_by_length
    return nearest_neighbors

In [15]:
def get_track2tags(tag_data):
    """Map each track id to the set of its tag ids (as strings).

    `tag_data` needs a "track" column and a comma-separated "tags" column; the
    input frame is left untouched.
    """
    tag_sets = tag_data["tags"].str.split(",").map(set)
    return dict(zip(tag_data["track"], tag_sets))

In [16]:
def get_knn_features(nearest_neighbors, track2tags):
    """Turn neighbour lists into per-track tag-share features.

    For every neighbourhood size in N_NEIGHBORS and truncation length, two
    mappings track id -> {tag: value} are produced:
    - "nn_{i}_l_{l}_tag_sh": tag count divided by the total number of tag
      occurrences among the first i neighbours;
    - "nn_{i}_l_{l}_tracks_tag_sh": tag count divided by i (the nominal
      neighbourhood size, even when fewer neighbours are available).
    """
    knn_features = {}
    for top_n in N_NEIGHBORS:
        for length in LENGTHS:
            for feature_name in (
                f"nn_{top_n}_l_{length}_tag_sh",
                f"nn_{top_n}_l_{length}_tracks_tag_sh",
            ):
                knn_features[feature_name] = {}

    for track_idx, neighbors_by_length in tqdm.tqdm(nearest_neighbors.items()):
        for length, neighbors in neighbors_by_length.items():
            for top_n in N_NEIGHBORS:
                tag_counts = Counter()
                for neighbor_track in neighbors[:top_n]:
                    tag_counts.update(track2tags[neighbor_track])
                n_tag_occurrences = sum(tag_counts.values())

                share_of_tags = {}
                share_of_tracks = {}
                for tag, count in tag_counts.items():
                    share_of_tags[tag] = count / n_tag_occurrences
                    share_of_tracks[tag] = count / top_n

                knn_features[f"nn_{top_n}_l_{length}_tag_sh"][track_idx] = share_of_tags
                knn_features[f"nn_{top_n}_l_{length}_tracks_tag_sh"][track_idx] = share_of_tracks
    return knn_features

In [17]:
# Build one HNSW index per truncation length from the train tracks only.
knn_indexes, simpleidx2trackidx = build_knn_indexes(train_embed_features)


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
****************************************************


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************
0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************
0%   10   20   30   40   50   60   70  

In [18]:
# Train track id -> set of tag ids; neighbours always come from the train set.
track2tags = get_track2tags(tag_data["train"])

In [19]:
# kNN tag-share features for the train tracks. Each train track is itself in
# the index; find_nearest_neighbors filters it out of its own neighbour list.
train_nearest_neighbors = find_nearest_neighbors(train_embed_features, knn_indexes, simpleidx2trackidx)
train_knn_features = get_knn_features(train_nearest_neighbors, track2tags)

# The raw neighbour lists are only needed to derive the features above.
del train_nearest_neighbors
gc.collect()

In [20]:
# kNN tag-share features for the test tracks, using the train index and train tags.
test_nearest_neighbors = find_nearest_neighbors(test_embed_features, knn_indexes, simpleidx2trackidx)
test_knn_features = get_knn_features(test_nearest_neighbors, track2tags)

# The raw neighbour lists are only needed to derive the features above.
del test_nearest_neighbors
gc.collect()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25580/25580 [04:21<00:00, 97.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25580/25580 [04:59<00:00, 85.41it/s]


0

In [21]:
# Indexes and lookup tables are not used by the feature-assembly cells below.
del knn_indexes, track2tags, simpleidx2trackidx

gc.collect()

0

In [22]:
# "feature_names.pkl" must already exist in the working directory. It was
# presumably written once with the disabled line below; `feats` is not defined
# in the visible cells, so re-enabling it as-is would fail.
# joblib.dump(feats, "feature_names.pkl")

# Feature names that make_features() iterates over to fill each record.
FEATURE_NAMES = joblib.load("feature_names.pkl")

In [23]:
def make_features(flattened_dataset, tag_data, knn_features, embed_features):
    """Attach the features named in FEATURE_NAMES to every (track, tag) record.

    Args:
        flattened_dataset: iterable of record dicts with "track" and "tag".
        tag_data: frame passed to `get_tags_features` for per-tag statistics.
        knn_features: feature name -> {track: {tag: value}}.
        embed_features: track -> dict that may contain "length".

    Returns:
        list of copies of the input records with the resolved features added.
        A value missing from a lookup is filled with -1. "net_prediction" and
        any name not found in the sources are skipped.
    """
    tags_features = get_tags_features(tag_data)
    enriched_rows = []
    for row in tqdm.tqdm(flattened_dataset):
        enriched = row.copy()
        for name in FEATURE_NAMES:
            if name in tags_features:
                value = tags_features[name].get(row["tag"], -1)
            elif name in knn_features:
                value = knn_features[name][row["track"]].get(row["tag"], -1)
            elif name == "length":
                value = embed_features[row["track"]].get("length", -1)
            else:
                # Covers "net_prediction" and unknown names: nothing is set.
                continue
            enriched[name] = value
        enriched_rows.append(enriched)
    return enriched_rows

In [24]:
def make_chunked_features(tag_data, knn_features, embed_features, n_chunks=10, testing=False):
    """Build features chunk by chunk and pickle each chunk to disk.

    Args:
        tag_data: dict with "train" and "test" DataFrames.
        knn_features: kNN features for the split being processed.
        embed_features: embedding features for the split being processed.
        n_chunks: Number of row chunks the split is divided into.
        testing: Process the "test" split (no target) instead of "train".

    Side effects:
        Writes ``../second_stage/base_features/{X_test|Xy_train}__chunk_idx={i}.pkl``
        for every chunk, creating the directory if necessary, and prints the
        index of each chunk as it starts.
    """
    data = tag_data["test"] if testing else tag_data["train"]
    namebase = 'X_test' if testing else 'Xy_train'

    output_dir = "../second_stage/base_features"
    # Do not fail after minutes of computation because the folder is missing.
    os.makedirs(output_dir, exist_ok=True)

    # Split row positions rather than the DataFrame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes. The chunk
    # boundaries are the same as before.
    position_chunks = np.array_split(np.arange(len(data)), n_chunks)
    for chunk_idx, row_positions in enumerate(position_chunks):
        chunk_data = data.iloc[row_positions]
        print(f"Proccessing chunk_idx = {chunk_idx}")
        features = make_features(
            get_flatten_dataset(chunk_data, testing=testing),
            # Tag statistics always come from the train split.
            tag_data["train"],
            knn_features,
            embed_features
        )
        with open(f"{output_dir}/{namebase}__chunk_idx={chunk_idx}.pkl", mode="wb") as f:
            pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
%%time

# Train split: features with targets, written in the default 10 chunks.
make_chunked_features(tag_data, train_knn_features, train_embed_features, testing=False)

In [26]:
# Train features are already on disk; free them before processing the test split.
del train_knn_features, train_embed_features
gc.collect()

In [None]:
%%time

# Test split: features without targets, written in 5 chunks.
make_chunked_features(tag_data, test_knn_features, test_embed_features, n_chunks=5, testing=True)

Proccessing chunk_idx = 0


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1309696/1309696 [01:00<00:00, 21545.67it/s]


Proccessing chunk_idx = 1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1309696/1309696 [02:00<00:00, 10911.96it/s]


Proccessing chunk_idx = 2


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1309696/1309696 [02:35<00:00, 8440.08it/s]


Proccessing chunk_idx = 3


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1309696/1309696 [01:10<00:00, 18538.24it/s]


Proccessing chunk_idx = 4


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1309696/1309696 [01:15<00:00, 17398.50it/s]


In [None]:
# Test features are already on disk; release the in-memory copies.
del test_knn_features, test_embed_features
gc.collect()