In [1]:
import numpy as np
import pandas as pd
import cudf
from collections import Counter
import itertools
import gc

from pathlib import Path
from contextlib import contextmanager
import math
import os
import subprocess
import sys
import time
import psutil
import torch

# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=False)


# Make cudf default to 32-bit integers and floats instead of 64-bit
# (presumably to reduce GPU memory use for the large pair tables built below).
cudf.set_option("default_integer_bitwidth", 32)
cudf.set_option("default_float_bitwidth", 32)

In [2]:
class Config:
    """Experiment-wide settings: experiment id, run mode and all derived paths."""

    # Identifier embedded in the output directory and result file names.
    EXP_ID = "009"
    # True -> local validation split; False -> full data / submission run.
    VALIDATION = True

    # Root directory that holds the ``*_parquet`` input folders.
    if VALIDATION:
        INPUT_DIR = Path("../../input/train_valid")
    else:
        INPUT_DIR = Path("../../input")

    # Everything this experiment writes goes below this directory.
    OUTPUT_DIR = Path(f"../../output/exp{EXP_ID}")
    # Co-visitation parquet pieces are kept apart for validation and full runs.
    CANDIDATE_FEATURE_DIR = OUTPUT_DIR / ("cv_feature" if VALIDATION else "feature")
    # Number of on-disk pieces the loader expects for the split matrices.
    DISK_PIECES = 4


# Create the output directories up front. parents=True also creates missing
# ancestors (e.g. "../../output" on a fresh checkout), which the plain
# mkdir call would reject with FileNotFoundError.
Config.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
Config.CANDIDATE_FEATURE_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# =================================
# Utils
# =================================
def get_input_data(input_dir: Path, phase: str):
    """Load every parquet file of one phase into a single cudf DataFrame.

    Parameters
    ----------
    input_dir : Path
        Directory containing the ``{phase}_parquet`` folder.
    phase : str
        Prefix of the parquet folder to read (the notebook passes "test").

    Returns
    -------
    cudf.DataFrame
        All files concatenated in sorted path order with a fresh index.
        ``session``/``aid``/``ts`` are int32 and ``type`` is an int8 code
        (clicks=0, carts=1, orders=2).
    """
    type_labels = {"clicks": 0, "carts": 1, "orders": 2}

    def _load(parquet_path):
        frame = cudf.read_parquet(parquet_path)
        frame["session"] = frame["session"].astype("int32")
        frame["aid"] = frame["aid"].astype("int32")
        # ts is divided by 1000 before the int32 cast
        # (presumably milliseconds -> seconds; confirm against the raw data).
        frame["ts"] = (frame["ts"] / 1000).astype("int32")
        frame["type"] = frame["type"].map(type_labels).astype("int8")
        return frame

    parquet_paths = sorted(input_dir.glob(f"{phase}_parquet/*.parquet"))
    frames = [_load(parquet_path) for parquet_path in parquet_paths]
    return cudf.concat(frames, axis=0, ignore_index=True)


def get_gpu_memory(cmd_path="nvidia-smi",
                   target_properties=("memory.total", "memory.used")):
    """Query ``nvidia-smi`` for memory figures of the first listed GPU.

    ref: https://www.12-technology.com/2022/01/pythongpu.html

    Parameters
    ----------
    cmd_path : str
        Command used to invoke nvidia-smi. It is interpolated into a shell
        command line, so only pass trusted values.
    target_properties : sequence of str
        Properties passed to ``--query-gpu``. The first two values of the
        reply are interpreted as total and used memory respectively.

    Returns
    -------
    gpu_total : float
        First queried value divided by 1024 and rounded to one decimal.
    gpu_used : float
        Second queried value divided by 1024 and rounded to one decimal.
        (nvidia-smi reports memory in MiB, so both figures are in GiB.)

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero status.
    """

    # format option: bare comma separated numbers, no header row, no units
    format_option = "--format=csv,noheader,nounits"

    cmd = '%s --query-gpu=%s %s' % (cmd_path, ','.join(target_properties), format_option)

    # Command execution in sub-processes
    cmd_res = subprocess.check_output(cmd, shell=True)

    # One output line per GPU; only the first line (first GPU) is parsed.
    gpu_lines = cmd_res.decode().split('\n')[0].split(', ')

    gpu_total = int(gpu_lines[0]) / 1024
    gpu_used = int(gpu_lines[1]) / 1024

    # Fix: the total used to be overwritten with the rounded *used* value.
    gpu_total = np.round(gpu_total, 1)
    gpu_used = np.round(gpu_used, 1)
    return gpu_total, gpu_used


class Trace():
    """Context-manager based timer that also reports CPU (RSS) and GPU memory."""

    # Evaluated once at class creation; GPU figures are only collected if True.
    cuda = torch.cuda.is_available()

    @contextmanager
    def timer(self, title):
        """Time the wrapped block and print memory usage to stderr.

        The message shows the process RSS after the block with its signed
        change, the used GPU memory with its signed change (only when CUDA is
        available), the elapsed seconds and ``title``. If the wrapped block
        raises, the exception propagates and nothing is printed.
        """
        t0 = time.time()
        p = psutil.Process(os.getpid())
        cpu_m0 = p.memory_info().rss / 2. ** 30
        # Index 1 is the *used* memory; index 0 is the constant total and
        # would always give a delta of zero.
        if self.cuda: gpu_m0 = get_gpu_memory()[1]
        yield
        cpu_m1 = p.memory_info().rss / 2. ** 30
        if self.cuda: gpu_m1 = get_gpu_memory()[1]

        cpu_delta = cpu_m1 - cpu_m0
        if self.cuda: gpu_delta = gpu_m1 - gpu_m0

        # The sign is printed separately, so the deltas are made non-negative.
        cpu_sign = '+' if cpu_delta >= 0 else '-'
        cpu_delta = math.fabs(cpu_delta)

        if self.cuda: gpu_sign = '+' if gpu_delta >= 0 else '-'
        if self.cuda: gpu_delta = math.fabs(gpu_delta)

        cpu_message = f'{cpu_m1:.1f}GB({cpu_sign}{cpu_delta:.1f}GB)'
        if self.cuda: gpu_message = f'{gpu_m1:.1f}GB({gpu_sign}{gpu_delta:.1f}GB)'

        if self.cuda:
            message = f"[cpu: {cpu_message}, gpu: {gpu_message}: {time.time() - t0:.1f}sec] {title} "
        else:
            message = f"[cpu: {cpu_message}: {time.time() - t0:.1f}sec] {title} "

        print(message, file=sys.stderr)

trace = Trace()

In [4]:
# =================================
# Candidate features
# =================================
def read_file(f):
    """Read one event parquet file into a cudf DataFrame with compact dtypes.

    ``session`` and ``aid`` become int32, ``ts`` is divided by 1000 and cast
    to int32 (presumably milliseconds -> seconds; confirm against the raw
    data), and the string ``type`` is mapped to an int8 code
    (clicks=0, carts=1, orders=2).
    """
    type_codes = {"clicks": 0, "carts": 1, "orders": 2}
    events = cudf.read_parquet(f)
    for id_column in ("session", "aid"):
        events[id_column] = events[id_column].astype("int32")
    seconds = events["ts"] / 1000
    events["ts"] = seconds.astype("int32")
    events["type"] = events["type"].map(type_codes).astype("int8")

    return events


def _build_co_visitation_matrix(output_stem, top_k, disk_pieces, max_gap_sec,
                                weight_fn, keep_types=None,
                                pair_columns=("session", "aid_x", "aid_y", "type_y")):
    """Compute a co-visitation matrix and save the top neighbours per item.

    Shared implementation of the three public ``get_*_co_visitation_matrix``
    functions (ref: https://www.kaggle.com/code/cdeotte/compute-validation-score-cv-565).

    For every session the 30 most recent events are self-joined into item
    pairs ``(aid_x, aid_y)``. Pairs are kept when the two events are less than
    ``max_gap_sec`` apart and the items differ, de-duplicated per
    ``(session, aid_x, aid_y, type_y)``, weighted, and summed per pair over
    all files. For each ``aid_x`` the ``top_k`` heaviest ``aid_y`` are written
    to ``Config.CANDIDATE_FEATURE_DIR / f"{output_stem}_{part}.pqt"``.

    Parameters
    ----------
    output_stem : str
        File name prefix of the saved parquet pieces.
    top_k : int
        Number of neighbours kept per ``aid_x``.
    disk_pieces : int
        The ``aid_x`` range is split into this many slices that are computed
        and saved one after another to limit GPU memory use. Use the smallest
        value that does not run out of memory.
    max_gap_sec : int
        Maximum absolute time difference (seconds) between paired events.
    weight_fn : callable
        Receives the de-duplicated pair frame and returns the value assigned
        to its ``wgt`` column (a column or a scalar).
    keep_types : list of int, optional
        If given, only events whose ``type`` code is listed are used.
    pair_columns : sequence of str
        Columns of the merged frame that ``weight_fn`` needs.

    Raises
    ------
    FileNotFoundError
        If no parquet file is found under ``Config.INPUT_DIR``.
    """
    # Both train and test files are read on purpose (test-set leak is exploited).
    files = sorted(Config.INPUT_DIR.glob("*_parquet/*.parquet"))
    if not files:
        raise FileNotFoundError(f"no parquet files found under {Config.INPUT_DIR}")

    n_outer_chunks = 6
    chunk = int(np.ceil(len(files) / n_outer_chunks))
    read_ct = 5
    # 1.86e6 is used as the upper bound of the aid range
    # (presumably just above the largest aid in the dataset -- TODO confirm).
    size = 1.86e6 / disk_pieces
    dedup_key = ["session", "aid_x", "aid_y", "type_y"]

    # COMPUTE IN PARTS FOR MEMORY MANAGEMENT
    for part in range(disk_pieces):
        print('### DISK PART', part+1)

        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # The matrix is accumulated with two levels of chunking.
        total = None
        # => OUTER CHUNKS
        for j in range(n_outer_chunks):
            a = j * chunk
            b = min((j+1) * chunk, len(files))
            if a >= b:
                # Fewer files than outer chunks: nothing left to read. The
                # original code hit a NameError on the undefined accumulator.
                break
            print(f'Processing files {a} thru {b-1} in groups of {read_ct}...')

            chunk_total = None
            # => INNER CHUNKS: read at most `read_ct` files at a time
            for k in range(a, b, read_ct):
                # READ FILES
                dfs = [read_file(path) for path in files[k:min(k + read_ct, b)]]
                df = cudf.concat(dfs, ignore_index=True, axis=0)
                if keep_types is not None:
                    df = df.loc[df["type"].isin(keep_types)]
                # ts is sorted descending inside each session so that cumcount
                # numbers the most recent events first.
                df = df.sort_values(by=["session", "ts"], ascending=[True, False])
                # USE TAIL OF SESSION (30 most recent events)
                df = df.reset_index(drop=True)
                df["n"] = df.groupby("session").cumcount()
                df = df.loc[df["n"] < 30].drop("n", axis=1)
                # CREATE PAIRS
                df = df.merge(df, on="session")
                df = df.loc[((df.ts_x - df.ts_y).abs() < max_gap_sec) & (df.aid_x != df.aid_y)]
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= part * size) & (df.aid_x < (part+1) * size)]
                # ASSIGN WEIGHTS
                df = df[list(pair_columns)].drop_duplicates(dedup_key)
                df["wgt"] = weight_fn(df)
                df = df[["aid_x", "aid_y", "wgt"]]
                df["wgt"] = df["wgt"].astype("float32")
                df = df.groupby(["aid_x", "aid_y"]).wgt.sum()
                # COMBINE INNER CHUNKS
                if chunk_total is None:
                    chunk_total = df
                else:
                    chunk_total = chunk_total.add(df, fill_value=0)

            # COMBINE OUTER CHUNKS
            if total is None:
                total = chunk_total
            else:
                total = total.add(chunk_total, fill_value=0)

            del chunk_total, df
            gc.collect()

        # CONVERT MATRIX TO A FLAT TABLE SORTED BY WEIGHT WITHIN aid_x
        top = total.reset_index()
        top = top.sort_values(["aid_x", "wgt"], ascending=[True, False])
        # KEEP TOP-K NEIGHBOURS PER aid_x
        top = top.reset_index(drop=True)
        top["n"] = top.groupby("aid_x").aid_y.cumcount()
        top = top.loc[top["n"] < top_k].drop("n", axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)
        top.to_pandas().to_parquet(Config.CANDIDATE_FEATURE_DIR / f"{output_stem}_{part}.pqt")

        del total, top
        gc.collect()


def get_type_weighted_co_visitation_matrix():
    """Build the type-weighted ("carts/orders") co-visitation matrix.

    Pairs within 24 hours are weighted by the type of the second event
    (clicks=1, carts=6, orders=3). The top 15 neighbours per item are saved as
    ``top_15_carts_orders_{part}.pqt`` in ``Config.DISK_PIECES`` pieces, which
    is the number of pieces the loading cell reads back.
    """
    type_weight = {0: 1, 1: 6, 2: 3}
    _build_co_visitation_matrix(
        output_stem="top_15_carts_orders",
        top_k=15,
        disk_pieces=Config.DISK_PIECES,
        max_gap_sec=24 * 60 * 60,
        weight_fn=lambda pairs: pairs.type_y.map(type_weight),
    )


def get_buy2buy_co_visitation_matrix():
    """Build the buy-to-buy co-visitation matrix from carts and orders only.

    Pairs within 14 days each count 1. The top 15 neighbours per item are
    saved as a single piece, ``top_15_buy2buy_0.pqt``.
    """
    _build_co_visitation_matrix(
        output_stem="top_15_buy2buy",
        top_k=15,
        disk_pieces=1,
        max_gap_sec=14 * 24 * 60 * 60,  # 14 days
        weight_fn=lambda pairs: 1,
        keep_types=[1, 2],  # ONLY WANT CARTS AND ORDERS
    )


def get_clicks_co_visitation_matrix():
    """Build the time-weighted "clicks" co-visitation matrix.

    Pairs within 24 hours get a weight that grows linearly with ``ts_x`` from
    1 at 1659304800 to 4 at 1662328791 (presumably the first and last
    timestamps of the dataset in seconds -- TODO confirm). The top 20
    neighbours per item are saved as ``top_20_clicks_{part}.pqt`` in
    ``Config.DISK_PIECES`` pieces.
    """
    _build_co_visitation_matrix(
        output_stem="top_20_clicks",
        top_k=20,
        disk_pieces=Config.DISK_PIECES,
        max_gap_sec=24 * 60 * 60,
        weight_fn=lambda pairs: 1 + 3 * (pairs["ts_x"] - 1659304800) / (1662328791 - 1659304800),
        pair_columns=("session", "ts_x", "aid_x", "aid_y", "type_y"),
    )

In [5]:
# from gensim.models import Word2Vec
# import polars as pl
# def get_input_data(input_dir: Path, phase: str):
#     type_labels = {"clicks": 0, "carts": 1, "orders": 2}
    
#     dfs = []
#     for path in sorted(list(input_dir.glob(f"{phase}_parquet/*.parquet"))):
#         chunk = pl.read_parquet(path)
#         chunk = chunk.with_columns([
#             pl.col("session").cast(pl.Int32),
#             pl.col("aid").cast(pl.Int32),
#             (pl.col("ts") / 1000).cast(pl.Int32),
#             pl.col("type").apply(lambda x: type_labels[x]).cast(pl.Int32)
#         ])
#         dfs.append(chunk)
    
#     return pl.concat(dfs)


# train_df = get_input_data(Config.INPUT_DIR, "train")
# train_df.with_column((pl.col("aid").cast(str) + "_" +  pl.col("type").cast(str)).alias("aid_type"))
# df = train_df.groupby(["session"]).agg(pl.col("aid"))["aid"].to_list()

# with trace.timer("W2V"):
#     w2vec = Word2Vec(sentences=df, vector_size=100, window=5, min_count=1, workers=-1, sg=1)

In [6]:
# Build and save the type-weighted (carts/orders) co-visitation matrix,
# reporting elapsed time and memory use on stderr.
with trace.timer("cart_co_matirx"):
    get_type_weighted_co_visitation_matrix()

### DISK PART 1
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 2
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 3
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 4
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processi

[cpu: 1.4GB(+0.9GB), gpu: 0.6GB(+0.4GB): 153.7sec] cart_co_matirx 


In [7]:
# Build and save the buy-to-buy co-visitation matrix,
# reporting elapsed time and memory use on stderr.
with trace.timer("buy2buy_co_matirx"):
    get_buy2buy_co_visitation_matrix()

### DISK PART 1
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...


[cpu: 1.4GB(+0.0GB), gpu: 0.6GB(+0.0GB): 31.4sec] buy2buy_co_matirx 


In [8]:
# Build and save the time-weighted clicks co-visitation matrix,
# reporting elapsed time and memory use on stderr.
with trace.timer("clicks_co_matirx"):
    get_clicks_co_visitation_matrix()

### DISK PART 1
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 2
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 3
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 4
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processi

[cpu: 1.5GB(+0.0GB), gpu: 0.6GB(+0.0GB): 150.1sec] clicks_co_matirx 


### Rerank

In [29]:
# Score multiplier per event type code (0=clicks, 1=carts, 2=orders), applied
# to the recency weight when long session histories are re-ranked.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}


def parquet2dict(path: Path) -> dict:
    """Load a saved co-visitation parquet piece as ``{aid_x: [aid_y, ...]}``.

    The ``aid_y`` values of each ``aid_x`` keep the row order of the file.
    """
    pairs = pd.read_parquet(path)
    neighbours_by_item = pairs.groupby(["aid_x"])["aid_y"]
    return neighbours_by_item.apply(list).to_dict()

    
def suggest_clicks(df):
    """Return up to 20 recommended aids for the "clicks" target of one session.

    ``df`` holds the events of a single session with ``aid`` and ``type``
    columns (the notebook sorts them by ``ts`` before grouping). Reads the
    module-level ``top_20_clicks``, ``top_clicks`` and
    ``type_weight_multipliers``.
    """
    history = df["aid"].to_list()
    event_types = df["type"].to_list()
    # Distinct aids in reversed row order (latest row first).
    recent_unique = list(dict.fromkeys(reversed(history)))

    if len(recent_unique) < 20:
        # Not enough history: extend it with "clicks" co-visitation neighbours.
        neighbours = list(itertools.chain.from_iterable(
            top_20_clicks[aid] for aid in recent_unique if aid in top_20_clicks
        ))
        # Most frequently suggested neighbours first, skipping known aids.
        ranked = [
            candidate
            for candidate, _ in Counter(neighbours).most_common(20)
            if candidate not in recent_unique
        ]
        result = recent_unique + ranked[:20 - len(recent_unique)]
        # Pad with the most clicked test aids that are not already present.
        fillers = [aid for aid in list(top_clicks) if aid not in result]
        return result + fillers[:20 - len(result)]

    # At least 20 distinct aids: rank the history itself. Later rows get a
    # larger weight, scaled by the multiplier of the event type.
    recency = np.logspace(0.1, 1, len(history), base=2, endpoint=True) - 1
    scores = Counter()
    for aid, weight, event_type in zip(history, recency, event_types):
        scores[aid] += weight * type_weight_multipliers[event_type]
    return [aid for aid, _ in scores.most_common(20)]


def suggest_carts(df):
    """Return up to 20 recommended aids for the "carts" target of one session.

    ``df`` holds the events of a single session with ``aid`` and ``type``
    columns (the notebook sorts them by ``ts`` before grouping). Reads the
    module-level ``top_20_clicks``, ``top_15_carts2orders``, ``top_carts`` and
    ``type_weight_multipliers``.
    """
    history = df["aid"].to_list()
    event_types = df["type"].to_list()
    # Distinct aids in reversed row order (latest row first).
    recent_unique = list(dict.fromkeys(reversed(history)))
    # Distinct aids of click (0) and cart (1) events only, latest row first.
    click_cart_events = df.loc[df["type"].isin([0, 1])]
    recent_click_cart = list(dict.fromkeys(reversed(click_cart_events.aid.tolist())))

    if len(recent_unique) < 20:
        # Neighbours from the "clicks" matrix for every aid in the history ...
        click_neighbours = list(itertools.chain.from_iterable(
            top_20_clicks[aid] for aid in recent_unique if aid in top_20_clicks
        ))
        # ... and from the type-weighted matrix for clicked/carted aids.
        weighted_neighbours = list(itertools.chain.from_iterable(
            top_15_carts2orders[aid] for aid in recent_click_cart if aid in top_15_carts2orders
        ))
        # Most frequently suggested neighbours first, skipping known aids.
        ranked = [
            candidate
            for candidate, _ in Counter(click_neighbours + weighted_neighbours).most_common(20)
            if candidate not in recent_unique
        ]
        result = recent_unique + ranked[:20 - len(recent_unique)]
        # Pad with the most carted test aids that are not already present.
        fillers = [aid for aid in list(top_carts) if aid not in result]
        return result + fillers[:20 - len(result)]

    # At least 20 distinct aids: rank the history itself. Later rows get a
    # larger weight, scaled by the multiplier of the event type.
    recency = np.logspace(0.5, 1, len(history), base=2, endpoint=True) - 1
    scores = Counter()
    for aid, weight, event_type in zip(history, recency, event_types):
        scores[aid] += weight * type_weight_multipliers[event_type]
    # Small bonus for every type-weighted neighbour of a clicked/carted aid.
    for neighbour in itertools.chain.from_iterable(
        top_15_carts2orders[aid] for aid in recent_click_cart if aid in top_15_carts2orders
    ):
        scores[neighbour] += 0.1
    return [aid for aid, _ in scores.most_common(20)]


def suggest_buys(df):
    """Return up to 20 recommended aids for the "orders" target of one session.

    ``df`` holds the events of a single session with ``aid`` and ``type``
    columns (the notebook sorts them by ``ts`` before grouping). Reads the
    module-level ``top_15_carts2orders``, ``top_15_buy2buy``,
    ``top_20_clicks``, ``top_orders`` and ``type_weight_multipliers``.
    """
    history = df["aid"].to_list()
    event_types = df["type"].to_list()
    # Distinct aids in reversed row order (latest row first).
    recent_unique = list(dict.fromkeys(reversed(history)))
    # Distinct aids of cart (1) and order (2) events only, latest row first.
    buy_events = df.loc[df["type"].isin([1, 2])]
    recent_buys = list(dict.fromkeys(reversed(buy_events.aid.tolist())))

    if len(recent_unique) < 20:
        # Type-weighted matrix neighbours of every aid in the history.
        weighted_neighbours = list(itertools.chain.from_iterable(
            top_15_carts2orders[aid] for aid in recent_unique if aid in top_15_carts2orders
        ))
        # Buy-to-buy neighbours of the carted/ordered aids.
        buy_neighbours = list(itertools.chain.from_iterable(
            top_15_buy2buy[aid] for aid in recent_buys if aid in top_15_buy2buy
        ))
        # "Clicks" matrix neighbours of every aid in the history.
        click_neighbours = list(itertools.chain.from_iterable(
            top_20_clicks[aid] for aid in recent_unique if aid in top_20_clicks
        ))
        pooled = weighted_neighbours + buy_neighbours + click_neighbours
        # Most frequently suggested neighbours first, skipping known aids.
        ranked = [
            candidate
            for candidate, _ in Counter(pooled).most_common(20)
            if candidate not in recent_unique
        ]
        result = recent_unique + ranked[:20 - len(recent_unique)]
        # Pad with the most ordered test aids that are not already present.
        fillers = [aid for aid in list(top_orders) if aid not in result]
        return result + fillers[:20 - len(result)]

    # At least 20 distinct aids: rank the history itself. Later rows get a
    # larger weight, scaled by the multiplier of the event type.
    recency = np.logspace(0.5, 1, len(history), base=2, endpoint=True) - 1
    scores = Counter()
    for aid, weight, event_type in zip(history, recency, event_types):
        scores[aid] += weight * type_weight_multipliers[event_type]
    # Small bonus for every buy-to-buy neighbour of a carted/ordered aid.
    for neighbour in itertools.chain.from_iterable(
        top_15_buy2buy[aid] for aid in recent_buys if aid in top_15_buy2buy
    ):
        scores[neighbour] += 0.1
    return [aid for aid, _ in scores.most_common(20)]

In [6]:
# Read all "test" parquet files with cudf and convert the result to a pandas
# DataFrame for the per-session re-ranking below.
with trace.timer("load test set"):
    test_df = get_input_data(Config.INPUT_DIR, "test").to_pandas()

[cpu: 1.6GB(+1.0GB), gpu: 0.6GB(+0.4GB): 4.1sec] load test set 


In [7]:
with trace.timer("load candidate feature"):
    feature_dir = Config.CANDIDATE_FEATURE_DIR

    # type_weighted: merge every saved piece into one {aid_x: [aid_y, ...]} dict
    top_15_carts2orders = parquet2dict(feature_dir / "top_15_carts_orders_0.pqt")
    for piece in range(1, Config.DISK_PIECES):
        piece_dict = parquet2dict(feature_dir / f"top_15_carts_orders_{piece}.pqt")
        top_15_carts2orders.update(piece_dict)

    # buy2buy: saved as a single piece
    top_15_buy2buy = parquet2dict(feature_dir / "top_15_buy2buy_0.pqt")

    # clicks: merge every saved piece
    top_20_clicks = parquet2dict(feature_dir / "top_20_clicks_0.pqt")
    for piece in range(1, Config.DISK_PIECES):
        piece_dict = parquet2dict(feature_dir / f"top_20_clicks_{piece}.pqt")
        top_20_clicks.update(piece_dict)

    # 20 most frequent test-set aids per event type (0=clicks, 1=carts, 2=orders),
    # used as fallback candidates by the suggest_* functions.
    top_clicks, top_carts, top_orders = (
        test_df.loc[test_df["type"] == type_code, "aid"].value_counts().index.to_numpy()[:20]
        for type_code in (0, 1, 2)
    )

[cpu: 5.2GB(+3.6GB), gpu: 0.6GB(+0.0GB): 71.8sec] load candidate feature 


In [24]:
# Sort events by session and time, then call suggest_clicks once per session.
# The result is a Series indexed by session holding a list of aids.
with trace.timer("parallel apply"):
    pred_clicks = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(suggest_clicks)

[cpu: 11.0GB(+0.6GB), gpu: 0.6GB(+0.0GB): 151.2sec] parallel apply 


In [25]:
# Sort events by session and time, then call suggest_carts once per session.
# The result is a Series indexed by session holding a list of aids.
with trace.timer("parallel apply"):
    pred_carts = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(suggest_carts)

[cpu: 11.0GB(+0.0GB), gpu: 0.6GB(+0.0GB): 762.5sec] parallel apply 


In [30]:
# Sort events by session and time, then call suggest_buys once per session.
# The result is a Series indexed by session holding a list of aids.
with trace.timer("parallel apply"):
    pred_buys = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(suggest_buys)

[cpu: 12.6GB(+0.6GB), gpu: 0.6GB(+0.0GB): 732.3sec] parallel apply 


In [31]:
with trace.timer("make pred_df"):
    # One frame per target; the index becomes "<session>_<target>".
    pred_clicks_df, pred_carts_df, pred_orders_df = (
        pd.DataFrame(preds.add_suffix(suffix), columns=["labels"]).reset_index()
        for preds, suffix in (
            (pred_clicks, "_clicks"),
            (pred_carts, "_carts"),
            (pred_buys, "_orders"),
        )
    )
    stacked = [pred_clicks_df, pred_carts_df, pred_orders_df]
    pred_df = pd.concat(stacked, axis=0).reset_index(drop=True)
    pred_df.columns = ["session_type", "labels"]
    # Each list of aids is written as one space separated string.
    pred_df["labels"] = pred_df["labels"].apply(lambda aids: " ".join(str(aid) for aid in aids))
    # Validation runs and submission runs are written to different files.
    if Config.VALIDATION:
        pred_path = Config.OUTPUT_DIR / f"exp{Config.EXP_ID}_validation.csv"
    else:
        pred_path = Config.OUTPUT_DIR / f"exp{Config.EXP_ID}_sub.csv"
    pred_df.to_csv(pred_path, index=False)


pred_df.head()

[cpu: 13.4GB(+0.8GB), gpu: 0.6GB(+0.0GB): 45.8sec] make pred_df 


Unnamed: 0,session_type,labels
0,11098528_clicks,11830 588923 1732105 571762 884502 876129 1157...
1,11098529_clicks,1105029 459126 1339838 295362 1544564 441348 5...
2,11098530_clicks,409236 264500 1603001 963957 254154 583026 364...
3,11098531_clicks,396199 1271998 452188 1728212 1365569 624163 1...
4,11098532_clicks,876469 7651 108125 1202618 1159379 77906 17040...


In [37]:
# Preview the click predictions of the first 100 sessions, one row per recommended aid.
pd.DataFrame(pred_clicks[:100], columns=["labels"]).explode("labels")

  pd.DataFrame(pred_clicks[:100], columns=["labels"]).explode("labels")


Unnamed: 0_level_0,labels
session,Unnamed: 1_level_1
11098528,11830
11098528,588923
11098528,1732105
11098528,571762
11098528,884502
...,...
11098627,987685
11098627,889077
11098627,1181342
11098627,1595379


### validation score

In [12]:
# Release the large intermediates once pred_df has been built; only pred_df is
# needed for scoring. Re-run the earlier cells before re-running this one,
# because `del` fails on names that no longer exist.
with trace.timer("delete temp file"):
    del test_df
    del top_15_carts2orders, top_15_buy2buy, top_20_clicks, top_clicks, top_carts, top_orders
    # pred_carts was previously left out and therefore never freed.
    del pred_clicks, pred_carts, pred_buys
    del pred_clicks_df, pred_carts_df, pred_orders_df

    gc.collect()

[cpu: 6.7GB(-1.8GB), gpu: 0.6GB(+0.0GB): 3.8sec] delete temp file 


In [32]:
# Local validation score: recall@20 per target, combined with fixed weights.
# Reads "test_labels.parquet" from Config.INPUT_DIR, so it is only meaningful
# for a validation run.
with trace.timer("compute cv score"):
    # COMPUTE METRIC
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    # The ground truth is identical for every target: read it once instead of
    # once per loop iteration.
    all_labels = pd.read_parquet(Config.INPUT_DIR / "test_labels.parquet")
    for t in ['clicks','carts','orders']:
        # Predictions of this target, parsed back into session id and aid list.
        sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
        test_labels = all_labels.loc[all_labels['type']==t]
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        # hits = number of ground-truth aids found among the predictions
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        # The denominator is capped at 20 because at most 20 aids are predicted.
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)

    print('=============')
    print('Overall Recall =',score)
    print('=============')

    del sub, test_labels, all_labels, recall, score
    gc.collect()

clicks recall = 0.5232806357769533
carts recall = 0.40939401701097855
orders recall = 0.6521574114279282
Overall Recall = 0.5664407155377458


[cpu: 14.1GB(+0.7GB), gpu: 0.6GB(+0.0GB): 71.8sec] compute cv score 


1. 公開notebook通り
- clicks recall = 0.5232863896228239
- carts recall = 0.4094081486650003
- orders recall = 0.6517810906868657
- Overall Recall = 0.5662197379739019

2. suggest_carts変更
- clicks recall = 0.5232806357769533
- carts recall = 0.4099009900990099
- orders recall = 0.6517810906868657
- Overall Recall = 0.5663670150195177

3. suggest_carts変更
- clicks recall = 0.5232806357769533
- carts recall = 0.41061817154061525
- orders recall = 0.6517810906868657
- Overall Recall = 0.5665821694519992

4. suggest_carts変更
- 最後にtop_clicks etcをaddする際に重複するaidを除去