In [1]:
import numpy as np
import pandas as pd
import cudf
from collections import Counter
import itertools
import gc
from multiprocessing import Pool

from pathlib import Path
from contextlib import contextmanager
import math
import os
import subprocess
import sys
import time
import psutil
import torch

import polars as pl
from gensim.models import Word2Vec, KeyedVectors
import hashlib
from cuml.neighbors import NearestNeighbors
import cupy as cp

cudf.set_option("default_integer_bitwidth", 32)
cudf.set_option("default_float_bitwidth", 32)
os.environ["PYTHONHASHSEED"] = str(42)

In [2]:
class Config:
    EXP_ID = "012"
    VALIDATION = True
    INPUT_DIR = Path("../../input/train_valid") if VALIDATION else Path("../../input")
    OUTPUT_DIR = Path(f"../../output/exp{EXP_ID}")
    CANDIDATE_FEATURE_DIR = OUTPUT_DIR / "cv_candidate_feature" if VALIDATION else OUTPUT_DIR / "candidate_feature"
    RERANKING_FEATURE_DIR = OUTPUT_DIR / "cv_reranking_feature" if VALIDATION else OUTPUT_DIR / "reranking_feature"
    DISK_PIECES = 4


Path.mkdir(Config.OUTPUT_DIR, exist_ok=True)
Path.mkdir(Config.CANDIDATE_FEATURE_DIR, exist_ok=True)
Path.mkdir(Config.RERANKING_FEATURE_DIR, exist_ok=True)

In [3]:
# =================================
# Utils
# =================================
def get_input_data(input_dir: Path, phase: str):
    type_labels = {"clicks": 0, "carts": 1, "orders": 2}
    
    dfs = []
    for path in sorted(list(input_dir.glob(f"{phase}_parquet/*.parquet"))):
        chunk = cudf.read_parquet(path)
        chunk["session"] = chunk["session"].astype("int32")
        chunk["aid"] = chunk["aid"].astype("int32")
        chunk["ts"] = (chunk["ts"] / 1000).astype("int32")
        chunk["type"] = chunk["type"].map(type_labels).astype("int8")
        dfs.append(chunk)
        
    del chunk
    gc.collect()
    
    return cudf.concat(dfs, axis=0, ignore_index=True)


def get_pl_input_data(input_dir: Path, phase: str):
    type_labels = {"clicks": 0, "carts": 1, "orders": 2}
    
    dfs = []
    for path in sorted(list(input_dir.glob(f"{phase}_parquet/*.parquet"))):
        chunk = pl.read_parquet(path)
        chunk = chunk.with_columns([
            pl.col("session").cast(pl.Int32),
            pl.col("aid").cast(pl.Int32),
            (pl.col("ts") / 1000).cast(pl.Int32),
            pl.col("type").apply(lambda x: type_labels[x]).cast(pl.Int32)
        ])
        dfs.append(chunk)
    
    return pl.concat(dfs)


def get_gpu_memory(cmd_path="nvidia-smi",
                   target_properties=("memory.total", "memory.used")):
    """
    ref: https://www.12-technology.com/2022/01/pythongpu.html
    Returns
    -------
    gpu_total : ndarray,  "memory.total"
    gpu_used: ndarray, "memory.used"
    """

    # format option
    format_option = "--format=csv,noheader,nounits"

    cmd = '%s --query-gpu=%s %s' % (cmd_path, ','.join(target_properties), format_option)

    # Command execution in sub-processes
    cmd_res = subprocess.check_output(cmd, shell=True)

    gpu_lines = cmd_res.decode().split('\n')[0].split(', ')

    gpu_total = int(gpu_lines[0]) / 1024
    gpu_used = int(gpu_lines[1]) / 1024

    gpu_total = np.round(gpu_used, 1)
    gpu_used = np.round(gpu_used, 1)
    return gpu_total, gpu_used


class Trace():
    cuda = torch.cuda.is_available()

    @contextmanager
    def timer(self, title):
        t0 = time.time()
        p = psutil.Process(os.getpid())
        cpu_m0 = p.memory_info().rss / 2. ** 30
        if self.cuda: gpu_m0 = get_gpu_memory()[0]
        yield
        cpu_m1 = p.memory_info().rss / 2. ** 30
        if self.cuda: gpu_m1 = get_gpu_memory()[0]

        cpu_delta = cpu_m1 - cpu_m0
        if self.cuda: gpu_delta = gpu_m1 - gpu_m0

        cpu_sign = '+' if cpu_delta >= 0 else '-'
        cpu_delta = math.fabs(cpu_delta)

        if self.cuda: gpu_sign = '+' if gpu_delta >= 0 else '-'
        if self.cuda: gpu_delta = math.fabs(gpu_delta)

        cpu_message = f'{cpu_m1:.1f}GB({cpu_sign}{cpu_delta:.1f}GB)'
        if self.cuda: gpu_message = f'{gpu_m1:.1f}GB({gpu_sign}{gpu_delta:.1f}GB)'

        if self.cuda:
            message = f"[cpu: {cpu_message}, gpu: {gpu_message}: {time.time() - t0:.1f}sec] {title} "
        else:
            message = f"[cpu: {cpu_message}: {time.time() - t0:.1f}sec] {title} "

        print(message, file=sys.stderr)
        
trace = Trace()

In [4]:
# =================================
# Candidate features
# =================================
def read_file(f):
    type_labels = {"clicks": 0, "carts": 1, "orders": 2}
    df = cudf.read_parquet(f)
    df["session"] = df["session"].astype("int32")
    df["aid"] = df["aid"].astype("int32")
    df["ts"] = (df["ts"] / 1000).astype("int32")
    df["type"] = df["type"].map(type_labels).astype("int8")
    
    return df


def get_type_weighted_co_visitation_matrix():
    files = sorted(list(Config.INPUT_DIR.glob("*_parquet/*.parquet"))) # test set leakを使う
    CHUNK = int(np.ceil(len(files)/6))
    READ_CT = 5
    # ref: https://www.kaggle.com/code/cdeotte/compute-validation-score-cv-565
    type_weight = {0: 1, 1: 6, 2: 3}
    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    # OOM回避のため共起行列を分割して計算・保存
    DISK_PIECES = 4
    SIZE = 1.86e6 / DISK_PIECES
    # COMPUTE IN PARTS FOR MEMORY MANGEMENT
    for part in range(DISK_PIECES):
        print('### DISK PART', part+1)
        
        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # 共起行列を2層のチャンクで計算
        # => OUTER CHUNKS
        for j in range(6):
            a = j * CHUNK
            b = min((j+1) * CHUNK, len(files))
            print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
            
            # => INNER CHUNKS
            # CHUNKを更に分割して計算
            for k in range(a, b, READ_CT):
                # READ FILE
                dfs = [read_file(files[k])]
                for i in range(1, READ_CT):
                    if k+i < b:
                        dfs.append(read_file(files[k+i]))
                
                df = cudf.concat(dfs, ignore_index=True, axis=0)
                df = df.sort_values(by=["session", "ts"], ascending=[True, False]) # cumcountで最新30行を抽出するためにsessionを降順にしている
                # USE TAIL OF SESSION
                df = df.reset_index(drop=True)
                df["n"] = df.groupby("session").cumcount()
                df = df.loc[df["n"] < 30].drop("n",axis=1)
                # CREATE PAIRS
                df = df.merge(df, on="session")
                df = df.loc[((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y)]
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= part * SIZE) & (df.aid_x < (part+1) * SIZE)]
                # ASSIGN WEIGHTS
                df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', "type_y"])
                df['wgt'] = df.type_y.map(type_weight)
                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()
                # COMBINE INNER CHUNKS
                if k==a:
                    tmp2 = df
                else:
                    tmp2 = tmp2.add(df, fill_value=0)
                    
             # COMBINE OUTER CHUNKS
            if a==0:
                tmp = tmp2
            else:
                tmp = tmp.add(tmp2, fill_value=0)
            
            del tmp2, df
            gc.collect()

        # CONVERT MATRIX TO DICTIONARY
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(["aid_x", "wgt"], ascending=[True, False])
        # SAVE TOP 15
        tmp = tmp.reset_index(drop=True)
        tmp["n"] = tmp.groupby("aid_x").aid_y.cumcount()
        tmp = tmp.loc[tmp["n"] < 15].drop("n", axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)
        tmp.to_pandas().to_parquet(Config.CANDIDATE_FEATURE_DIR / f"top_15_carts_orders_{part}.pqt")
        
    del tmp
    gc.collect()
    
    
def get_buy2buy_co_visitation_matrix():
    files = sorted(list(Config.INPUT_DIR.glob("*_parquet/*.parquet"))) # test set leakを使う
    CHUNK = int(np.ceil(len(files)/6))
    READ_CT = 5
    # ref: https://www.kaggle.com/code/cdeotte/compute-validation-score-cv-565
    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    # OOM回避のため共起行列を分割して計算・保存
    DISK_PIECES = 1
    SIZE = 1.86e6 / DISK_PIECES
    # COMPUTE IN PARTS FOR MEMORY MANGEMENT
    for part in range(DISK_PIECES):
        print('### DISK PART', part+1)
        
        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # 共起行列を2層のチャンクで計算
        # => OUTER CHUNKS
        for j in range(6):
            a = j * CHUNK
            b = min((j+1) * CHUNK, len(files))
            print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
            
            # => INNER CHUNKS
            # CHUNKを更に分割して計算
            for k in range(a, b, READ_CT):
                # READ FILE
                dfs = [read_file(files[k])]
                for i in range(1, READ_CT):
                    if k+i < b:
                        dfs.append(read_file(files[k+i]))
                
                df = cudf.concat(dfs, ignore_index=True, axis=0)
                df = df.loc[df["type"].isin([1, 2])] # ONLY WANT CARTS AND ORDERS
                df = df.sort_values(by=["session", "ts"], ascending=[True, False]) # cumcountで最新30行を抽出するためにsessionを降順にしている
                # USE TAIL OF SESSION
                df = df.reset_index(drop=True)
                df['n'] = df.groupby('session').cumcount()
                df = df.loc[df.n<30].drop('n',axis=1)
                # CREATE PAIRS
                df = df.merge(df, on='session')
                df = df.loc[((df.ts_x - df.ts_y).abs() < 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y)] # 14days
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= part*SIZE) & (df.aid_x < (part+1)*SIZE)]
                # ASSIGN WEIGHTS
                df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', "type_y"])
                df['wgt'] = 1
                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()
                # COMBINE INNER CHUNKS
                if k==a:
                    tmp2 = df
                else:
                    tmp2 = tmp2.add(df, fill_value=0)
                    
             # COMBINE OUTER CHUNKS
            if a==0:
                tmp = tmp2
            else:
                tmp = tmp.add(tmp2, fill_value=0)
            
            del tmp2, df
            gc.collect()

        # CONVERT MATRIX TO DICTIONARY
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
        # SAVE TOP 15
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)
        tmp.to_pandas().to_parquet(Config.CANDIDATE_FEATURE_DIR / f"top_15_buy2buy_{part}.pqt")
        
    del tmp
    gc.collect()
    
    
def get_clicks_co_visitation_matrix():
    files = sorted(list(Config.INPUT_DIR.glob("*_parquet/*.parquet"))) # test set leakを使う
    CHUNK = int(np.ceil(len(files)/6))
    READ_CT = 5
    # ref: https://www.kaggle.com/code/cdeotte/compute-validation-score-cv-565
    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    # OOM回避のため共起行列を分割して計算・保存
    DISK_PIECES = 4
    SIZE = 1.86e6 / DISK_PIECES
    # COMPUTE IN PARTS FOR MEMORY MANGEMENT
    for part in range(DISK_PIECES):
        print('### DISK PART', part+1)
        
        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # 共起行列を2層のチャンクで計算
        # => OUTER CHUNKS
        for j in range(6):
            a = j * CHUNK
            b = min((j+1) * CHUNK, len(files))
            print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
            
            # => INNER CHUNKS
            # CHUNKを更に分割して計算
            for k in range(a, b, READ_CT):
                # READ FILE
                dfs = [read_file(files[k])]
                for i in range(1, READ_CT):
                    if k+i < b:
                        dfs.append(read_file(files[k+i]))
                
                df = cudf.concat(dfs, ignore_index=True, axis=0)
                df = df.sort_values(by=["session", "ts"], ascending=[True, False]) # cumcountで最新30行を抽出するためにsessionを降順にしている
                # USE TAIL OF SESSION
                df = df.reset_index(drop=True)
                df['n'] = df.groupby('session').cumcount()
                df = df.loc[df.n<30].drop('n',axis=1)
                # CREATE PAIRS
                df = df.merge(df, on='session')
                df = df.loc[((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y)]
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= part*SIZE) & (df.aid_x < (part+1)*SIZE)]
                # ASSIGN WEIGHTS
                df = df[['session', "ts_x", 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', "type_y"])
                df["wgt"] = 1 + 3 * (df["ts_x"] - 1659304800) / (1662328791 - 1659304800)
                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()
                # COMBINE INNER CHUNKS
                if k==a:
                    tmp2 = df
                else:
                    tmp2 = tmp2.add(df, fill_value=0)
                    
             # COMBINE OUTER CHUNKS
            if a==0:
                tmp = tmp2
            else:
                tmp = tmp.add(tmp2, fill_value=0)
            
            del tmp2, df
            gc.collect()

        # CONVERT MATRIX TO DICTIONARY
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
        # SAVE TOP 15
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        tmp = tmp.loc[tmp["n"] < 20].drop('n',axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)
        tmp.to_pandas().to_parquet(Config.CANDIDATE_FEATURE_DIR / f"top_20_clicks_{part}.pqt")
        
    del tmp
    gc.collect()

In [None]:
with trace.timer("load all data"):
    train_df = get_pl_input_data(Config.INPUT_DIR, "train")
    test_df = get_pl_input_data(Config.INPUT_DIR, "test")
    whole_df = pl.concat([train_df, test_df])
    # whole_df = whole_df.with_column((pl.col("aid").cast(str) + "_" +  pl.col("type").cast(str)).alias("aid_type"))
    
    del train_df, test_df
    gc.collect()

In [None]:
def hashfxn(x):
    return int(hashlib.md5(str(x).encode()).hexdigest(), 16)


def get_item2vec(input_df: pl.DataFrame, type_label: int):
    corpus = input_df.filter(pl.col("type") == type_label).groupby(["session"]).agg(pl.col("aid"))["aid"].to_list()
    item2vec = Word2Vec(
        sentences=corpus,
        vector_size=100,
        window=20,
        min_count=1,
        sg=1,
        hs=0,
        hashfxn=hashfxn,
        epochs=300,
        seed=42,
        workers=-1
    )
    item2vec.wv.save(str(Config.CANDIDATE_FEATURE_DIR / f"item2vec_{type_label}.wordvectors"))
    
    del corpus, item2vec
    gc.collect()
    
    
# def get_item2vec(input_df: pl.DataFrame):
#     corpus = input_df.groupby(["session"]).agg(pl.col("aid_type"))["aid_type"].to_list()
#     item2vec = Word2Vec(
#         sentences=corpus,
#         vector_size=100,
#         window=20,
#         min_count=1,
#         sg=1,
#         hs=0,
#         hashfxn=hashfxn,
#         epochs=100,
#         seed=42,
#         workers=-1
#     )
#     item2vec.wv.save(str(Config.CANDIDATE_FEATURE_DIR / f"item2vec.wordvectors"))
    
#     del corpus, item2vec
#     gc.collect()
    
    
# with trace.timer("item2vec"):
#     get_item2vec(whole_df)

In [None]:
with trace.timer("clicks item2vec"):
    get_item2vec(whole_df, 0)

In [None]:
with trace.timer("carts item2vec"):
    get_item2vec(whole_df, 1)

In [None]:
with trace.timer("orders item2vec"):
    get_item2vec(whole_df, 2)

In [None]:
with trace.timer("cart_co_matirx"):
    get_type_weighted_co_visitation_matrix()

In [None]:
with trace.timer("buy2buy_co_matirx"):
    get_buy2buy_co_visitation_matrix()

In [None]:
with trace.timer("clicks_co_matirx"):
    get_clicks_co_visitation_matrix()

### Candidates

In [5]:
type_weight_multipliers = {0: 1, 1: 6, 2: 3}


def parquet2dict(path: Path) -> dict:
    df = pd.read_parquet(path)
    return df.groupby(["aid_x"])["aid_y"].apply(list).to_dict()
    # return df.groupby(["aid_x"]).parallel_apply(lambda x: x["aid_y"].to_list()).to_dict()

    
def suggest_clicks(df):
    # USE USER HISTORY AIDS AND TYPES
    aids = df["aid"].to_list()
    types = df["type"].to_list()
    # UNIQUE AIDS
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= 20:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter()
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        
        return sorted_aids
    
    # USE "CLICKS" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks]))
    # RERANK CANDIDATES
    top_aids2 = [aid for aid, cnt in Counter(aids2).most_common(20) if aid not in unique_aids]    
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    
    # USE TOP20 TEST CLICKS
    aids4 = [aid for aid in list(top_clicks) if aid not in result]
    return result + aids4[:20 - len(result)]


def suggest_carts(df):
    # USE USER HISTORY AIDS AND TYPES
    aids = df["aid"].to_list()
    types = df["type"].to_list()
    # UNIQUE AIDS AND UNIQUE BUYS
    unique_aids = list(dict.fromkeys(aids[::-1]))
    df = df.loc[(df['type']==0) | (df['type']==1)]
    unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1]))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= 20:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_15_carts2orders[aid] for aid in unique_buys if aid in top_15_carts2orders]))
        for aid in aids3:
            aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        
        return sorted_aids
    
    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks]))
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_15_carts2orders[aid] for aid in unique_buys if aid in top_15_carts2orders]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2 + aids3).most_common(20) if aid2 not in unique_aids] 
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    
    # USE TOP20 TEST ORDERS
    aids4 = [aid for aid in list(top_carts) if aid not in result]
    return result + aids4[:20 - len(result)]


def suggest_buys(df):
    # USE USER HISTORY AIDS AND TYPES
    aids = df["aid"].to_list()
    types = df["type"].to_list()
    # UNIQUE AIDS AND UNIQUE BUYS
    unique_aids = list(dict.fromkeys(aids[::-1]))
    df = df.loc[(df['type']==1) | (df['type']==2)]
    unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1]))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= 20:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_15_buy2buy[aid] for aid in unique_buys if aid in top_15_buy2buy]))
        for aid in aids3:
            aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        
        return sorted_aids
    
    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_15_carts2orders[aid] for aid in unique_aids if aid in top_15_carts2orders]))
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_15_buy2buy[aid] for aid in unique_buys if aid in top_15_buy2buy]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2 + aids3).most_common(20) if aid2 not in unique_aids] 
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    
    # USE TOP20 TEST ORDERS
    aids4 = [aid for aid in list(top_orders) if aid not in result]
    return result + aids4[:20 - len(result)]

In [6]:
# type_weight_multipliers = {0: 1, 1: 6, 2: 3}
# N_CORES = psutil.cpu_count()     # Available CPU cores


# def df_parallelize_run(func, t_split):
    
#     num_cores = np.min([N_CORES, len(t_split)])
#     pool = Pool(num_cores)
#     df = pool.map(func, t_split)
#     pool.close()
#     pool.join()
    
#     return df


# def suggest_clicks(df):
#     # USE USER HISTORY AIDS AND TYPES
#     aids = df["aid"].to_list()
#     types = df["type"].to_list()
#     session = df["session"].unique()[0]
#     # UNIQUE AIDS
#     unique_aids = list(dict.fromkeys(aids[::-1]))
#     # df = df.loc[(df['type']==0)]
#     # unique_clicks = list(dict.fromkeys(df.aid.tolist()[::-1]))
#     # # USE USER HISTORY AIDS AND TYPES
#     # session = df[0]
#     # aids = df[1]
#     # types = df[2]
#     # # UNIQUE AIDS
#     # unique_aids = list(dict.fromkeys(aids[::-1]))
#     # unique_clicks = list(dict.fromkeys([f for i, f in enumerate(aids) if types[i] in [0]][::-1]))
    
#     # RERANK CANDIDATES USING WEIGHTS
#     if len(unique_aids) >= 20:
#         weights = np.logspace(0.1, 1, len(aids),base=2, endpoint=True) - 1
#         aids_temp = Counter()
#         # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
#         for aid, w, t in zip(aids, weights, types): 
#             aids_temp[aid] += w * type_weight_multipliers[t]
#         sorted_aids = [k for k, v in aids_temp.most_common(20)]
        
#         return sorted_aids
    
#     if session in click_session2index:
#         # USE item2vec
#         aids2 = index2key[click_indices[click_session2index[session]]]
#         # aids2 = list(aids2)
#         aids2 = [aid.split("_")[0] for aid in aids2 if int(aid.split("_")[1]) in [0, 1, 2]]
#         # RERANK CANDIDATES
#         top_aids2 = [aid2 for aid2 in aids2 if aid2 not in unique_aids]    
#         result = unique_aids + top_aids2[:20 - len(unique_aids)]
#     else:
#         result = []
    
#     # USE TOP20 TEST CLICKS
#     return result + list(top_clicks)[:20 - len(result)]


# def suggest_carts(df):
#     # USE USER HISTORY AIDS AND TYPES
#     aids = df["aid"].to_list()
#     types = df["type"].to_list()
#     session = df["session"].unique()[0]
#     # UNIQUE AIDS
#     unique_aids = list(dict.fromkeys(aids[::-1]))
#     df = df.loc[(df['type'] == 0) | (df['type'] == 1)]
#     unique_carts = list(dict.fromkeys(df.aid.tolist()[::-1]))
#     # # USE USER HISTORY AIDS AND TYPES
#     # session = df[0]
#     # aids = df[1]
#     # types = df[2]
#     # # UNIQUE AIDS
#     # unique_aids = list(dict.fromkeys(aids[::-1]))
#     # unique_carts = list(dict.fromkeys([f for i, f in enumerate(aids) if types[i] in [1]][::-1]))
#     # RERANK CANDIDATES USING WEIGHTS
#     if len(unique_aids) >= 20:
#         weights = np.logspace(0.1, 1, len(aids),base=2, endpoint=True) - 1
#         aids_temp = Counter()
#         # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
#         for aid, w, t in zip(aids, weights, types): 
#             aids_temp[aid] += w * type_weight_multipliers[t]
#         sorted_aids = [k for k, v in aids_temp.most_common(20)]
        
#         return sorted_aids
    
#     if session in cart_session2index:
#         # USE item2vec
#         aids2 = index2key[cart_indices[cart_session2index[session]]]
#         # aids2 = list(aids2)
#         aids2 = [aid.split("_")[0] for aid in aids2 if int(aid.split("_")[1]) in [0, 1]]
#         # RERANK CANDIDATES
#         top_aids2 = [aid2 for aid2 in aids2 if aid2 not in unique_carts]    
#         result = unique_carts + top_aids2[:20 - len(unique_carts)]
#     else:
#         result = []
    
#     # USE TOP20 TEST CLICKS
#     return result + list(top_carts)[:20 - len(result)]


# def suggest_orders(df):
#     # USE USER HISTORY AIDS AND TYPES
#     aids = df["aid"].to_list()
#     types = df["type"].to_list()
#     session = df["session"].unique()[0]
#     # UNIQUE AIDS
#     unique_aids = list(dict.fromkeys(aids[::-1]))
#     df = df.loc[(df['type'] == 1) | (df['type'] == 2)]
#     unique_orders = list(dict.fromkeys(df.aid.tolist()[::-1]))
#     # # USE USER HISTORY AIDS AND TYPES
#     # session = df[0]
#     # aids = df[1]
#     # types = df[2]
#     # # UNIQUE AIDS
#     # unique_aids = list(dict.fromkeys(aids[::-1]))
#     # unique_orders = list(dict.fromkeys([f for i, f in enumerate(aids) if types[i] in [2]][::-1]))
#     # RERANK CANDIDATES USING WEIGHTS
#     if len(unique_aids) >= 20:
#         weights = np.logspace(0.1, 1, len(aids),base=2, endpoint=True) - 1
#         aids_temp = Counter()
#         # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
#         for aid, w, t in zip(aids, weights, types): 
#             aids_temp[aid] += w * type_weight_multipliers[t]
#         sorted_aids = [k for k, v in aids_temp.most_common(20)]
        
#         return sorted_aids
    
#     if session in order_session2index:
#         # USE item2vec
#         aids2 = index2key[order_indices[order_session2index[session]]]
#         # aids2 = list(aids2)
#         aids2 = [aid.split("_")[0] for aid in aids2 if int(aid.split("_")[1]) in [1, 2]]
#         # RERANK CANDIDATES
#         top_aids2 = [aid2 for aid2 in aids2 if aid2 not in unique_orders]    
#         result = unique_orders + top_aids2[:20 - len(unique_orders)]
#     else:
#         result = []
    
#     # USE TOP20 TEST CLICKS
#     return result + list(top_orders)[:20 - len(result)]

In [7]:
# def get_similar_items_by_item2vec(item2vec_model, session_type):
#     input_df = get_input_data(Config.INPUT_DIR, "test")
#     # last 30 events per session
#     input_df = input_df.sort_values(by=["session", "ts"], ascending=[True, False])
#     input_df["n"] = input_df.groupby(["session"]).cumcount()
#     input_df = input_df.loc[input_df["n"] < 30].reset_index(drop=True)
#     # item2vecに含まれるaidにfintering
#     input_df["aid_type"] = input_df["aid"].astype(str) + "_" + input_df["type"].astype(str)
#     input_df = input_df.loc[input_df["aid_type"].isin(item2vec_model.index_to_key)].reset_index(drop=True)
#     # input_df = input_df.loc[input_df["aid"].isin(item2vec_model.index_to_key)].reset_index(drop=True)

#     # sessionにtype == clickがないイベントログ
#     input_df["is_type"] = input_df["type"].isin(session_type).astype("int8")
#     input_df["count_type"] = input_df.groupby("session")["is_type"].transform("sum")

#     output_df = input_df.loc[(input_df["type"].isin(session_type)) & (input_df["count_type"] > 0)]
#     output_df = cudf.concat([output_df, input_df.loc[input_df["count_type"] == 0]], ignore_index=True)
#     output_df = output_df.sort_values(by=["session", "ts"]).reset_index(drop=True).drop(columns=["is_type", "count_type"])
    
#     session_idx = np.cumsum(output_df.groupby(["session"], sort=True).size().to_numpy())
#     session2index = {k: v for v, k in enumerate(sorted(output_df["session"].unique().to_pandas().to_list()))}
    
#     # session embedding
#     session_embeddings = item2vec_model[output_df["aid_type"].to_numpy()]
#     # session_embeddings = item2vec_model[output_df["aid"].to_numpy()]
#     session_embeddings = np.split(session_embeddings, session_idx[:-1])
#     session_embeddings = np.array([np.mean(i, axis=0) for i in session_embeddings])
    
#     # most similars
#     model = NearestNeighbors(n_neighbors=20, metric="cosine", output_type="numpy")
#     model.fit(item2vec_model.vectors)
    
#     _, indices = model.kneighbors(session_embeddings)
    
#     del input_df, output_df, session_embeddings, model
#     gc.collect()
    
#     return session2index, indices

In [8]:
# def get_similar_items(word2vec):
#     X = cp.array(word2vec.vectors)
#     model = NearestNeighbors(n_neighbors=21, metric="cosine", output_type="numpy")
#     model.fit(X)
    
#     _, indices = model.kneighbors(X)
    
#     del X, model
#     gc.collect()
    
#     return indices

In [10]:
with trace.timer("load test set"):
    test_df = get_input_data(Config.INPUT_DIR, "test").to_pandas()

[cpu: 6.7GB(+0.1GB), gpu: 1.3GB(+0.0GB): 3.0sec] load test set 


In [11]:
test_df = test_df[test_df["session"].isin(test_df["session"].unique()[:10000])].reset_index(drop=True)
test_df

Unnamed: 0,session,aid,ts,type
0,11098528,11830,1661119200,0
1,11098529,1105029,1661119200,0
2,11098530,264500,1661119200,0
3,11098530,264500,1661119288,0
4,11098530,409236,1661119369,0
...,...,...,...,...
58065,11108530,680375,1661128123,0
58066,11108530,108125,1661128171,0
58067,11108531,952464,1661127918,0
58068,11108531,248689,1661280373,0


In [8]:
with trace.timer("load candidate feature"):
    # type_weighted
    top_15_carts2orders = parquet2dict(Config.CANDIDATE_FEATURE_DIR / "top_15_carts_orders_0.pqt")
    for i in range(1, Config.DISK_PIECES):
        top_15_carts2orders.update(parquet2dict(Config.CANDIDATE_FEATURE_DIR / f"top_15_carts_orders_{i}.pqt"))
    
    # buy2buy
    top_15_buy2buy = parquet2dict(Config.CANDIDATE_FEATURE_DIR / "top_15_buy2buy_0.pqt")
    
    # clicks
    top_20_clicks = parquet2dict(Config.CANDIDATE_FEATURE_DIR / "top_20_clicks_0.pqt")
    for i in range(1, Config.DISK_PIECES):
        top_20_clicks.update(parquet2dict(Config.CANDIDATE_FEATURE_DIR / f"top_20_clicks_{i}.pqt"))
        
    top_clicks = test_df.loc[test_df["type"] == 0, "aid"].value_counts().index.to_numpy()[:20]
    top_carts = test_df.loc[test_df["type"] == 1, "aid"].value_counts().index.to_numpy()[:20]
    top_orders = test_df.loc[test_df["type"] == 2, "aid"].value_counts().index.to_numpy()[:20]
    
#     # clicks
#     click_model = KeyedVectors.load(str(Config.CANDIDATE_FEATURE_DIR / "item2vec_0.wordvectors"), mmap="r")
#     click_index2key = np.array(click_model.index_to_key)
#     click_indices = get_similar_items(click_model)
    
#     # carts
#     cart_model = KeyedVectors.load(str(Config.CANDIDATE_FEATURE_DIR / "item2vec_1.wordvectors"), mmap="r")
#     cart_index2key = np.array(cart_model.index_to_key)
#     cart_indices = get_similar_items(cart_model)
    
#     # orders
#     order_model = KeyedVectors.load(str(Config.CANDIDATE_FEATURE_DIR / "item2vec_2.wordvectors"), mmap="r")
#     order_index2key = np.array(order_model.index_to_key)
#     order_indices = get_similar_items(order_model)
    
    # all
    # item2vec = KeyedVectors.load(str(Config.CANDIDATE_FEATURE_DIR / "item2vec.wordvectors"), mmap="r")
    # index2key = np.array(item2vec.index_to_key)

[cpu: 6.6GB(+3.6GB), gpu: 1.3GB(+0.0GB): 81.3sec] load candidate feature 


In [11]:
# with trace.timer("load test set"):
#     temp = [group for name, group in test_df.sort_values(["session", "ts"]).groupby(["session"])]
#     test_list = [[i["session"].iloc[0], i["aid"].to_list(), i["type"].to_list()] for i in temp]
    
#     del temp, test_df
#     gc.collect()

In [12]:
with trace.timer("click inference"):
    # click_session2index, click_indices = get_similar_items_by_item2vec(item2vec, [0, 1, 2])
    # temp = df_parallelize_run(suggest_clicks, test_list)
    # pred_clicks = pd.Series([f[1] for f in temp], index=[f[0] for f in temp])
    pred_clicks = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(suggest_clicks)
    # pred_clicks = test_df.sort_values(["session", "ts"]).groupby(["session"]).parallel_apply(lambda x: suggest_clicks(x))
    
    #del click_session2index, click_indices, click_model, click_index2key #temp
    gc.collect()

[cpu: 6.5GB(+0.0GB), gpu: 1.3GB(+0.0GB): 6.8sec] click inference 


In [13]:
with trace.timer("cart inference"):
    # cart_session2index, cart_indices = get_similar_items_by_item2vec(item2vec, [0, 1])
    # temp = df_parallelize_run(suggest_carts, test_list)
    # pred_carts = pd.Series([f[1] for f in temp], index=[f[0] for f in temp])
    pred_carts = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(suggest_carts)
    
    #del cart_session2index, cart_indices, #cart_model, cart_index2key #temp
    gc.collect()

[cpu: 6.5GB(+0.0GB), gpu: 1.3GB(+0.0GB): 10.2sec] cart inference 


In [14]:
with trace.timer("order inference"):
    # order_session2index, order_indices = get_similar_items_by_item2vec(item2vec, [1, 2])
    # temp = df_parallelize_run(suggest_orders, test_list)
    # pred_orders = pd.Series([f[1] for f in temp], index=[f[0] for f in temp])
    pred_orders = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(suggest_buys)
    
    #del order_session2index, order_indices, #order_model, order_index2key #temp
    gc.collect()

[cpu: 6.5GB(+0.0GB), gpu: 1.3GB(+0.0GB): 9.4sec] order inference 


In [15]:
test_df = cudf.from_pandas(test_df)

candidate_df = cudf.from_pandas(pred_carts).explode().astype("int32").reset_index(name="aid")
candidate_df

Unnamed: 0,session,aid
0,11098528,11830
1,11098528,588923
2,11098528,1732105
3,11098528,571762
4,11098528,884502
...,...,...
199995,11108532,1758681
199996,11108532,1674003
199997,11108532,356615
199998,11108532,507852


### item features

In [16]:
def get_co_carts_orders_aggregates():
    """
    item-based CFのカウント
    """
    filepath = list(Config.CANDIDATE_FEATURE_DIR.glob("top_15_carts_orders_*.pqt"))
    for i, f in enumerate(filepath):
        tmp = cudf.read_parquet(f)
        
        if i == 0:
            df = tmp
        else:
            df = df.add(tmp, fill_value=0)
    
    # aggreagte
    df = df.groupby(["aid_y"]).agg({"aid_y": ["count"]}).reset_index()
    df.columns = ["aid", "count"]
    
    return df


item_feature = get_co_carts_orders_aggregates()
item_feature

Unnamed: 0,aid,count
0,175797,30
1,326515,5
2,1760273,23
3,1369281,9
4,2179603,2
...,...,...
2305872,855699,3
2305873,456376,2
2305874,471801,9
2305875,1390306,5


In [24]:
def get_aid_describe(input_df):
    """
    各itemの過去実績
    """
    feature = input_df.groupby(["aid"]).agg({"aid": "count", "session": "nunique", "ts": "last"})
    feature.columns = ["count", "n_session", "last_ts"]
    # count events each type
    for i, name in enumerate(["click", "cart", "order"]):
        tmp = input_df.loc[input_df["type"] == i].groupby(["aid"]).agg({"aid": "count"})
        tmp.columns = [f"count_{name}"]
        
        feature = cudf.concat([feature, tmp], axis=1)
        
        del tmp
        gc.collect()
        
    return feature.fillna(0).reset_index()


item_feature = get_aid_describe(test_df)
item_feature

Unnamed: 0,aid,count,n_session,last_ts,count_click,count_cart,count_order
0,3,1,1,1661120947,1,0,0
1,219,1,1,1661170019,1,0,0
2,240,2,2,1661180457,2,0,0
3,247,4,2,1661142855,4,0,0
4,316,1,1,1661464855,1,0,0
...,...,...,...,...,...,...,...
30278,1855448,1,1,1661608631,1,0,0
30279,1855495,1,1,1661374858,1,0,0
30280,1855500,3,3,1661125503,3,0,0
30281,1855508,2,2,1661185344,1,1,0


### session features

In [19]:
def get_session_aggregates(input_df):
    session_feature = input_df.groupby(["session"]).agg({"session": "count", "aid": "nunique"})
    session_feature.columns = ["count_event", "count_aid"]
    # count events each type
    for i, name in enumerate(["click", "cart", "order"]):
        tmp = input_df.loc[input_df["type"] == i].groupby(["session"]).agg({"session": "count"})
        tmp.columns = [f"count_{name}"]
        
        session_feature = cudf.concat([session_feature, tmp], axis=1)
        
        del tmp
        gc.collect()

    return session_feature.fillna(0).reset_index()


session_feature = get_session_aggregates(test_df)
session_feature

Unnamed: 0,session,count_event,count_aid,count_click,count_cart,count_order
0,11098528,1,1,1,0,0
1,11098529,1,1,1,0,0
2,11098530,6,2,5,1,0
3,11098531,24,11,20,0,4
4,11098532,2,2,2,0,0
...,...,...,...,...,...,...
9995,11108528,40,32,34,6,0
9996,11108529,5,3,5,0,0
9997,11108530,8,5,8,0,0
9998,11108531,2,2,2,0,0


### create train set

In [18]:
new_candidate_df = candidate_df.merge(item_feature, on="aid", how="left").astype("int32").fillna(-1)
new_candidate_df = new_candidate_df.merge(session_feature, on="session", how="left").astype("int32").fillna(-1)
new_candidate_df

Unnamed: 0,session,aid,count,count_event,count_aid,count_click,count_cart,count_order
0,11099107,125597,637,1,1,1,-1,-1
1,11099107,440855,290,1,1,1,-1,-1
2,11099107,533638,110,1,1,1,-1,-1
3,11099107,362150,220,1,1,1,-1,-1
4,11099107,1645990,1858,1,1,1,-1,-1
...,...,...,...,...,...,...,...,...
199995,11108387,736002,40,17,12,17,-1,-1
199996,11108387,341579,225,17,12,17,-1,-1
199997,11108387,1231891,655,17,12,17,-1,-1
199998,11108387,795782,65,17,12,17,-1,-1


### add target label

In [19]:
target = cudf.read_parquet(Config.INPUT_DIR / "test_labels.parquet")
target = target.loc[target["type"] == "carts"]
target = target.explode(column="ground_truth").drop(columns=["type"]).astype("int32").reset_index(drop=True)
target.columns = ["session", "aid"]
target["cart"] = 1
target

Unnamed: 0,session,aid,cart
0,11098528,1199737,1
1,11098533,108676,1
2,11098533,1406660,1
3,11098533,988295,1
4,11098533,1118792,1
...,...,...,...
569692,12899732,1126169,1
569693,12899739,301163,1
569694,12899739,1379999,1
569695,12899757,1677695,1


In [20]:
new_candidate_df = new_candidate_df.merge(target, on=["session", "aid"], how="left").fillna(0)
new_candidate_df

Unnamed: 0,session,aid,count,count_event,count_aid,count_click,count_cart,count_order,cart
0,11098992,41302,78,8,6,7,1,-1,0
1,11098992,706006,77,8,6,7,1,-1,0
2,11098992,728827,149,8,6,7,1,-1,0
3,11098992,64104,242,8,6,7,1,-1,0
4,11098992,1010861,43,8,6,7,1,-1,0
...,...,...,...,...,...,...,...,...,...
199995,11108502,1845463,219,2,2,2,-1,-1,0
199996,11108502,1112740,387,2,2,2,-1,-1,0
199997,11108501,573604,56,1,1,1,-1,-1,0
199998,11108502,1650562,165,2,2,2,-1,-1,0


### training

In [32]:
import xgboost as xgb
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold

feature_cols = ['count', 'count_event', 'count_aid', 'count_click', 'count_cart', 'count_order']

with trace.timer("training"):
    cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, valid_idx) in enumerate(cv.split(new_candidate_df.to_numpy(), new_candidate_df["cart"].to_numpy(), groups=new_candidate_df["session"].to_numpy())):
        X_train = new_candidate_df.loc[train_idx, feature_cols]
        y_train = new_candidate_df.loc[train_idx, "cart"]
        X_valid = new_candidate_df.loc[valid_idx, feature_cols]
        y_valid = new_candidate_df.loc[valid_idx, "cart"]

        dtrain = xgb.DMatrix(X_train, y_train, group=[20] * (len(train_idx) // 20))
        dvalid = xgb.DMatrix(X_valid, y_valid, group=[20] * (len(valid_idx) // 20))

        xgb_parms = {'objective':'rank:pairwise', 'tree_method':'gpu_hist'}
        model = xgb.train(
            xgb_parms, 
            dtrain=dtrain,
            evals=[(dtrain,'train'), (dvalid,'valid')],
            num_boost_round=1000,
            verbose_eval=100
        )

[0]	train-map:0.94065	valid-map:0.93144
[100]	train-map:0.96386	valid-map:0.91589
[200]	train-map:0.97261	valid-map:0.91390
[300]	train-map:0.97771	valid-map:0.91248
[400]	train-map:0.97989	valid-map:0.91271
[500]	train-map:0.98203	valid-map:0.91241
[600]	train-map:0.98263	valid-map:0.91198
[700]	train-map:0.98290	valid-map:0.91170
[800]	train-map:0.98307	valid-map:0.91248
[900]	train-map:0.98343	valid-map:0.91208
[999]	train-map:0.98346	valid-map:0.91221
[0]	train-map:0.94466	valid-map:0.93761
[100]	train-map:0.96253	valid-map:0.91775
[200]	train-map:0.97291	valid-map:0.91640
[300]	train-map:0.97788	valid-map:0.91390
[400]	train-map:0.98038	valid-map:0.91400
[500]	train-map:0.98208	valid-map:0.91408
[600]	train-map:0.98321	valid-map:0.91330
[700]	train-map:0.98398	valid-map:0.91361
[800]	train-map:0.98434	valid-map:0.91381
[900]	train-map:0.98482	valid-map:0.91367
[999]	train-map:0.98523	valid-map:0.91368
[0]	train-map:0.93966	valid-map:0.92431
[100]	train-map:0.96565	valid-map:0.9119

[cpu: 7.0GB(+0.0GB), gpu: 1.3GB(+0.0GB): 21.7sec] training 


### validation score

In [None]:
with trace.timer("make pred_df"):
    pred_clicks_df = pd.DataFrame(pred_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
    pred_carts_df = pd.DataFrame(pred_carts.add_suffix("_carts"), columns=["labels"]).reset_index()
    pred_orders_df = pd.DataFrame(pred_orders.add_suffix("_orders"), columns=["labels"]).reset_index()
    pred_df = pd.concat([pred_clicks_df, pred_carts_df, pred_orders_df], axis=0).reset_index(drop=True)
    pred_df.columns = ["session_type", "labels"]
    pred_df["labels"] = pred_df["labels"].apply(lambda x: " ".join(map(str, x)))
    pred_df.to_csv(
        Config.OUTPUT_DIR / f"exp{Config.EXP_ID}_validation.csv" if Config.VALIDATION else Config.OUTPUT_DIR / f"exp{Config.EXP_ID}_sub.csv",
        index=False
    )


pred_df.head()

In [None]:
with trace.timer("delete temp file"):
    del test_df
    del top_15_carts2orders, top_15_buy2buy, top_20_clicks, top_clicks, top_carts, top_orders
    del pred_clicks, pred_buys
    del pred_clicks_df, pred_carts_df, pred_orders_df
    
    gc.collect()

In [None]:
with trace.timer("compute cv score"):
    # COMPUTE METRIC
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for t in ['clicks','carts','orders']:
        sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
        test_labels = pd.read_parquet(Config.INPUT_DIR / "test_labels.parquet")
        test_labels = test_labels.loc[test_labels['type']==t]
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)

    print('=============')
    print('Overall Recall =',score)
    print('=============')
    
    del sub, test_labels, recall, score
    gc.collect()