In [1]:
import numpy as np
import pandas as pd
import cudf
from collections import Counter
import itertools
import gc
from multiprocessing import Pool

from pathlib import Path
from contextlib import contextmanager
import math
import os
import subprocess
import sys
import time
import psutil
import torch

import polars as pl
from gensim.models import Word2Vec, KeyedVectors
import hashlib
from cuml.neighbors import NearestNeighbors
import cupy as cp

cudf.set_option("default_integer_bitwidth", 32)
cudf.set_option("default_float_bitwidth", 32)
os.environ["PYTHONHASHSEED"] = str(42)

In [2]:
class Config:
    EXP_ID = "011"
    VALIDATION = False
    INPUT_DIR = Path("../../input/train_valid") if VALIDATION else Path("../../input")
    OUTPUT_DIR = Path(f"../../output/exp{EXP_ID}")
    CANDIDATE_FEATURE_DIR = OUTPUT_DIR / "cv_feature" if VALIDATION else OUTPUT_DIR / "feature"
    DISK_PIECES = 4


Path.mkdir(Config.OUTPUT_DIR, exist_ok=True)
Path.mkdir(Config.CANDIDATE_FEATURE_DIR, exist_ok=True)

In [3]:
# =================================
# Utils
# =================================
def get_input_data(input_dir: Path, phase: str):
    type_labels = {"clicks": 0, "carts": 1, "orders": 2}
    
    dfs = []
    for path in sorted(list(input_dir.glob(f"{phase}_parquet/*.parquet"))):
        chunk = cudf.read_parquet(path)
        chunk["session"] = chunk["session"].astype("int32")
        chunk["aid"] = chunk["aid"].astype("int32")
        chunk["ts"] = (chunk["ts"] / 1000).astype("int32")
        chunk["type"] = chunk["type"].map(type_labels).astype("int8")
        dfs.append(chunk)
        
    del chunk
    gc.collect()
    
    return cudf.concat(dfs, axis=0, ignore_index=True)


def get_pl_input_data(input_dir: Path, phase: str):
    type_labels = {"clicks": 0, "carts": 1, "orders": 2}
    
    dfs = []
    for path in sorted(list(input_dir.glob(f"{phase}_parquet/*.parquet"))):
        chunk = pl.read_parquet(path)
        chunk = chunk.with_columns([
            pl.col("session").cast(pl.Int32),
            pl.col("aid").cast(pl.Int32),
            (pl.col("ts") / 1000).cast(pl.Int32),
            pl.col("type").apply(lambda x: type_labels[x]).cast(pl.Int32)
        ])
        dfs.append(chunk)
    
    return pl.concat(dfs)


def get_gpu_memory(cmd_path="nvidia-smi",
                   target_properties=("memory.total", "memory.used")):
    """
    ref: https://www.12-technology.com/2022/01/pythongpu.html
    Returns
    -------
    gpu_total : ndarray,  "memory.total"
    gpu_used: ndarray, "memory.used"
    """

    # format option
    format_option = "--format=csv,noheader,nounits"

    cmd = '%s --query-gpu=%s %s' % (cmd_path, ','.join(target_properties), format_option)

    # Command execution in sub-processes
    cmd_res = subprocess.check_output(cmd, shell=True)

    gpu_lines = cmd_res.decode().split('\n')[0].split(', ')

    gpu_total = int(gpu_lines[0]) / 1024
    gpu_used = int(gpu_lines[1]) / 1024

    gpu_total = np.round(gpu_used, 1)
    gpu_used = np.round(gpu_used, 1)
    return gpu_total, gpu_used


class Trace():
    cuda = torch.cuda.is_available()

    @contextmanager
    def timer(self, title):
        t0 = time.time()
        p = psutil.Process(os.getpid())
        cpu_m0 = p.memory_info().rss / 2. ** 30
        if self.cuda: gpu_m0 = get_gpu_memory()[0]
        yield
        cpu_m1 = p.memory_info().rss / 2. ** 30
        if self.cuda: gpu_m1 = get_gpu_memory()[0]

        cpu_delta = cpu_m1 - cpu_m0
        if self.cuda: gpu_delta = gpu_m1 - gpu_m0

        cpu_sign = '+' if cpu_delta >= 0 else '-'
        cpu_delta = math.fabs(cpu_delta)

        if self.cuda: gpu_sign = '+' if gpu_delta >= 0 else '-'
        if self.cuda: gpu_delta = math.fabs(gpu_delta)

        cpu_message = f'{cpu_m1:.1f}GB({cpu_sign}{cpu_delta:.1f}GB)'
        if self.cuda: gpu_message = f'{gpu_m1:.1f}GB({gpu_sign}{gpu_delta:.1f}GB)'

        if self.cuda:
            message = f"[cpu: {cpu_message}, gpu: {gpu_message}: {time.time() - t0:.1f}sec] {title} "
        else:
            message = f"[cpu: {cpu_message}: {time.time() - t0:.1f}sec] {title} "

        print(message, file=sys.stderr)
        
trace = Trace()

In [4]:
# =================================
# Candidate features
# =================================
def read_file(f):
    type_labels = {"clicks": 0, "carts": 1, "orders": 2}
    df = cudf.read_parquet(f)
    df["session"] = df["session"].astype("int32")
    df["aid"] = df["aid"].astype("int32")
    df["ts"] = (df["ts"] / 1000).astype("int32")
    df["type"] = df["type"].map(type_labels).astype("int8")
    
    return df


def get_type_weighted_co_visitation_matrix():
    files = sorted(list(Config.INPUT_DIR.glob("*_parquet/*.parquet"))) # test set leakを使う
    CHUNK = int(np.ceil(len(files)/6))
    READ_CT = 5
    # ref: https://www.kaggle.com/code/cdeotte/compute-validation-score-cv-565
    type_weight = {0: 1, 1: 6, 2: 3}
    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    # OOM回避のため共起行列を分割して計算・保存
    DISK_PIECES = 4
    SIZE = 1.86e6 / DISK_PIECES
    # COMPUTE IN PARTS FOR MEMORY MANGEMENT
    for part in range(DISK_PIECES):
        print('### DISK PART', part+1)
        
        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # 共起行列を2層のチャンクで計算
        # => OUTER CHUNKS
        for j in range(6):
            a = j * CHUNK
            b = min((j+1) * CHUNK, len(files))
            print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
            
            # => INNER CHUNKS
            # CHUNKを更に分割して計算
            for k in range(a, b, READ_CT):
                # READ FILE
                dfs = [read_file(files[k])]
                for i in range(1, READ_CT):
                    if k+i < b:
                        dfs.append(read_file(files[k+i]))
                
                df = cudf.concat(dfs, ignore_index=True, axis=0)
                df = df.sort_values(by=["session", "ts"], ascending=[True, False]) # cumcountで最新30行を抽出するためにsessionを降順にしている
                # USE TAIL OF SESSION
                df = df.reset_index(drop=True)
                df["n"] = df.groupby("session").cumcount()
                df = df.loc[df["n"] < 30].drop("n",axis=1)
                # CREATE PAIRS
                df = df.merge(df, on="session")
                df = df.loc[((df.ts_x - df.ts_y).abs() < 24 * 60 * 60) & (df.aid_x != df.aid_y)]
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= part * SIZE) & (df.aid_x < (part+1) * SIZE)]
                # ASSIGN WEIGHTS
                df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', "type_y"])
                df['wgt'] = df.type_y.map(type_weight)
                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()
                # COMBINE INNER CHUNKS
                if k==a:
                    tmp2 = df
                else:
                    tmp2 = tmp2.add(df, fill_value=0)
                    
             # COMBINE OUTER CHUNKS
            if a==0:
                tmp = tmp2
            else:
                tmp = tmp.add(tmp2, fill_value=0)
            
            del tmp2, df
            gc.collect()

        # CONVERT MATRIX TO DICTIONARY
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(["aid_x", "wgt"], ascending=[True, False])
        # SAVE TOP 15
        tmp = tmp.reset_index(drop=True)
        tmp["n"] = tmp.groupby("aid_x").aid_y.cumcount()
        tmp = tmp.loc[tmp["n"] < 15].drop("n", axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)
        tmp.to_pandas().to_parquet(Config.CANDIDATE_FEATURE_DIR / f"top_15_carts_orders_{part}.pqt")
        
    del tmp
    gc.collect()
    
    
def get_buy2buy_co_visitation_matrix():
    files = sorted(list(Config.INPUT_DIR.glob("*_parquet/*.parquet"))) # test set leakを使う
    CHUNK = int(np.ceil(len(files)/6))
    READ_CT = 5
    # ref: https://www.kaggle.com/code/cdeotte/compute-validation-score-cv-565
    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    # OOM回避のため共起行列を分割して計算・保存
    DISK_PIECES = 1
    SIZE = 1.86e6 / DISK_PIECES
    # COMPUTE IN PARTS FOR MEMORY MANGEMENT
    for part in range(DISK_PIECES):
        print('### DISK PART', part+1)
        
        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # 共起行列を2層のチャンクで計算
        # => OUTER CHUNKS
        for j in range(6):
            a = j * CHUNK
            b = min((j+1) * CHUNK, len(files))
            print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
            
            # => INNER CHUNKS
            # CHUNKを更に分割して計算
            for k in range(a, b, READ_CT):
                # READ FILE
                dfs = [read_file(files[k])]
                for i in range(1, READ_CT):
                    if k+i < b:
                        dfs.append(read_file(files[k+i]))
                
                df = cudf.concat(dfs, ignore_index=True, axis=0)
                df = df.loc[df["type"].isin([1, 2])] # ONLY WANT CARTS AND ORDERS
                df = df.sort_values(by=["session", "ts"], ascending=[True, False]) # cumcountで最新30行を抽出するためにsessionを降順にしている
                # USE TAIL OF SESSION
                df = df.reset_index(drop=True)
                df['n'] = df.groupby('session').cumcount()
                df = df.loc[df.n<30].drop('n',axis=1)
                # CREATE PAIRS
                df = df.merge(df, on='session')
                df = df.loc[((df.ts_x - df.ts_y).abs() < 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y)] # 14days
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= part*SIZE) & (df.aid_x < (part+1)*SIZE)]
                # ASSIGN WEIGHTS
                df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', "type_y"])
                df['wgt'] = 1
                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()
                # COMBINE INNER CHUNKS
                if k==a:
                    tmp2 = df
                else:
                    tmp2 = tmp2.add(df, fill_value=0)
                    
             # COMBINE OUTER CHUNKS
            if a==0:
                tmp = tmp2
            else:
                tmp = tmp.add(tmp2, fill_value=0)
            
            del tmp2, df
            gc.collect()

        # CONVERT MATRIX TO DICTIONARY
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
        # SAVE TOP 15
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)
        tmp.to_pandas().to_parquet(Config.CANDIDATE_FEATURE_DIR / f"top_15_buy2buy_{part}.pqt")
        
    del tmp
    gc.collect()
    
    
def get_clicks_co_visitation_matrix():
    files = sorted(list(Config.INPUT_DIR.glob("*_parquet/*.parquet"))) # test set leakを使う
    CHUNK = int(np.ceil(len(files)/6))
    READ_CT = 5
    # ref: https://www.kaggle.com/code/cdeotte/compute-validation-score-cv-565
    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    # OOM回避のため共起行列を分割して計算・保存
    DISK_PIECES = 4
    SIZE = 1.86e6 / DISK_PIECES
    # COMPUTE IN PARTS FOR MEMORY MANGEMENT
    for part in range(DISK_PIECES):
        print('### DISK PART', part+1)
        
        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # 共起行列を2層のチャンクで計算
        # => OUTER CHUNKS
        for j in range(6):
            a = j * CHUNK
            b = min((j+1) * CHUNK, len(files))
            print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
            
            # => INNER CHUNKS
            # CHUNKを更に分割して計算
            for k in range(a, b, READ_CT):
                # READ FILE
                dfs = [read_file(files[k])]
                for i in range(1, READ_CT):
                    if k+i < b:
                        dfs.append(read_file(files[k+i]))
                
                df = cudf.concat(dfs, ignore_index=True, axis=0)
                df = df.sort_values(by=["session", "ts"], ascending=[True, False]) # cumcountで最新30行を抽出するためにsessionを降順にしている
                # USE TAIL OF SESSION
                df = df.reset_index(drop=True)
                df['n'] = df.groupby('session').cumcount()
                df = df.loc[df.n<30].drop('n',axis=1)
                # CREATE PAIRS
                df = df.merge(df, on='session')
                df = df.loc[((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y)]
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= part*SIZE) & (df.aid_x < (part+1)*SIZE)]
                # ASSIGN WEIGHTS
                df = df[['session', "ts_x", 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', "type_y"])
                df["wgt"] = 1 + 3 * (df["ts_x"] - 1659304800) / (1662328791 - 1659304800)
                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()
                # COMBINE INNER CHUNKS
                if k==a:
                    tmp2 = df
                else:
                    tmp2 = tmp2.add(df, fill_value=0)
                    
             # COMBINE OUTER CHUNKS
            if a==0:
                tmp = tmp2
            else:
                tmp = tmp.add(tmp2, fill_value=0)
            
            del tmp2, df
            gc.collect()

        # CONVERT MATRIX TO DICTIONARY
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
        # SAVE TOP 15
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        tmp = tmp.loc[tmp["n"] < 20].drop('n',axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)
        tmp.to_pandas().to_parquet(Config.CANDIDATE_FEATURE_DIR / f"top_20_clicks_{part}.pqt")
        
    del tmp
    gc.collect()

In [5]:
with trace.timer("load all data"):
    train_df = get_pl_input_data(Config.INPUT_DIR, "train")
    test_df = get_pl_input_data(Config.INPUT_DIR, "test")
    whole_df = pl.concat([train_df, test_df])
    # whole_df = whole_df.with_column((pl.col("aid").cast(str) + "_" +  pl.col("type").cast(str)).alias("aid_type"))
    
    del train_df, test_df
    gc.collect()

[cpu: 4.2GB(+3.5GB), gpu: 0.2GB(+0.0GB): 68.7sec] load all data 


In [6]:
def hashfxn(x):
    return int(hashlib.md5(str(x).encode()).hexdigest(), 16)


def get_item2vec(input_df: pl.DataFrame, type_label: int):
    corpus = input_df.filter(pl.col("type") == type_label).groupby(["session"]).agg(pl.col("aid"))["aid"].to_list()
    item2vec = Word2Vec(
        sentences=corpus,
        vector_size=100,
        window=20,
        min_count=1,
        sg=1,
        hs=0,
        hashfxn=hashfxn,
        epochs=300,
        seed=42,
        workers=-1
    )
    item2vec.wv.save(str(Config.CANDIDATE_FEATURE_DIR / f"item2vec_{type_label}.wordvectors"))
    
    del corpus, item2vec
    gc.collect()
    
    
# def get_item2vec(input_df: pl.DataFrame):
#     corpus = input_df.groupby(["session"]).agg(pl.col("aid_type"))["aid_type"].to_list()
#     item2vec = Word2Vec(
#         sentences=corpus,
#         vector_size=100,
#         window=20,
#         min_count=1,
#         sg=1,
#         hs=0,
#         hashfxn=hashfxn,
#         epochs=100,
#         seed=42,
#         workers=-1
#     )
#     item2vec.wv.save(str(Config.CANDIDATE_FEATURE_DIR / f"item2vec.wordvectors"))
    
#     del corpus, item2vec
#     gc.collect()
    
    
# with trace.timer("item2vec"):
#     get_item2vec(whole_df)

In [8]:
with trace.timer("clicks item2vec"):
    get_item2vec(whole_df, 0)

[cpu: 21.4GB(+17.2GB), gpu: 0.2GB(+0.0GB): 575.9sec] clicks item2vec 


In [7]:
with trace.timer("carts item2vec"):
    get_item2vec(whole_df, 1)

[cpu: 11.4GB(+7.3GB), gpu: 0.2GB(+0.0GB): 418.9sec] carts item2vec 


In [7]:
with trace.timer("orders item2vec"):
    get_item2vec(whole_df, 2)

[cpu: 6.4GB(+2.3GB), gpu: 0.2GB(+0.0GB): 210.5sec] orders item2vec 


In [5]:
with trace.timer("cart_co_matirx"):
    get_type_weighted_co_visitation_matrix()

### DISK PART 1
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 2
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 3
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 4
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processi

[cpu: 3.0GB(+2.3GB), gpu: 1.2GB(+1.0GB): 171.3sec] cart_co_matirx 


In [6]:
with trace.timer("buy2buy_co_matirx"):
    get_buy2buy_co_visitation_matrix()

### DISK PART 1
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...


[cpu: 3.0GB(+0.0GB), gpu: 1.2GB(+0.0GB): 33.3sec] buy2buy_co_matirx 


In [7]:
with trace.timer("clicks_co_matirx"):
    get_clicks_co_visitation_matrix()

### DISK PART 1
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 2
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 3
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processing files 50 thru 74 in groups of 5...
Processing files 75 thru 99 in groups of 5...
Processing files 100 thru 124 in groups of 5...
Processing files 125 thru 145 in groups of 5...
### DISK PART 4
Processing files 0 thru 24 in groups of 5...
Processing files 25 thru 49 in groups of 5...
Processi

[cpu: 3.0GB(-0.0GB), gpu: 1.2GB(+0.0GB): 166.1sec] clicks_co_matirx 


### Rerank

In [5]:
type_weight_multipliers = {0: 1, 1: 6, 2: 3}


def parquet2dict(path: Path) -> dict:
    df = pd.read_parquet(path)
    return df.groupby(["aid_x"])["aid_y"].apply(list).to_dict()
    # return df.groupby(["aid_x"]).parallel_apply(lambda x: x["aid_y"].to_list()).to_dict()

    
def suggest_clicks(df):
    # USE USER HISTORY AIDS AND TYPES
    aids = df["aid"].to_list()
    types = df["type"].to_list()
    # UNIQUE AIDS
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= 20:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter()
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        
        return sorted_aids
    
    # USE "CLICKS" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks]))
    # USE item2vec
    aids3 = list(itertools.chain(*[list(click_index2key[click_indices[click_model.key_to_index[aid]][1:]]) for aid in unique_aids if aid in click_model.key_to_index]))
    # RERANK CANDIDATES
    top_aids2 = [aid for aid, cnt in Counter(aids2 + aids3).most_common(20) if aid not in unique_aids]    
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    
    # USE TOP20 TEST CLICKS
    return result + list(top_clicks)[:20-len(result)]


def suggest_carts(df):
    # USE USER HISTORY AIDS AND TYPES
    aids = df["aid"].to_list()
    types = df["type"].to_list()
    # UNIQUE AIDS AND UNIQUE BUYS
    unique_aids = list(dict.fromkeys(aids[::-1]))
    df = df.loc[(df['type']==0) | (df['type']==1)]
    unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1]))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= 20:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_15_carts2orders[aid] for aid in unique_buys if aid in top_15_carts2orders]))
        for aid in aids3:
            aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        
        return sorted_aids
    
    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks]))
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_15_carts2orders[aid] for aid in unique_buys if aid in top_15_carts2orders]))
    # USE item2vec
    aids4 = list(itertools.chain(*[list(cart_index2key[cart_indices[cart_model.key_to_index[aid]][1:]]) for aid in unique_aids if aid in cart_model.key_to_index]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2 + aids3 + aids4).most_common(20) if aid2 not in unique_aids] 
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    
    # USE TOP20 TEST ORDERS
    return result + list(top_carts)[:20-len(result)]


def suggest_buys(df):
    # USE USER HISTORY AIDS AND TYPES
    aids = df["aid"].to_list()
    types = df["type"].to_list()
    # UNIQUE AIDS AND UNIQUE BUYS
    unique_aids = list(dict.fromkeys(aids[::-1]))
    df = df.loc[(df['type']==1) | (df['type']==2)]
    unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1]))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= 20:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_15_buy2buy[aid] for aid in unique_buys if aid in top_15_buy2buy]))
        for aid in aids3:
            aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        
        return sorted_aids
    
    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_15_carts2orders[aid] for aid in unique_aids if aid in top_15_carts2orders]))
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_15_buy2buy[aid] for aid in unique_buys if aid in top_15_buy2buy]))
    # USE item2vec
    aids4 = list(itertools.chain(*[list(order_index2key[order_indices[order_model.key_to_index[aid]][1:]]) for aid in unique_aids if aid in order_model.key_to_index]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2 + aids3 + aids4).most_common(20) if aid2 not in unique_aids] 
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    
    # USE TOP20 TEST ORDERS
    return result + list(top_orders)[:20-len(result)]

In [6]:
# type_weight_multipliers = {0: 1, 1: 6, 2: 3}
# N_CORES = psutil.cpu_count()     # Available CPU cores


# def df_parallelize_run(func, t_split):
    
#     num_cores = np.min([N_CORES, len(t_split)])
#     pool = Pool(num_cores)
#     df = pool.map(func, t_split)
#     pool.close()
#     pool.join()
    
#     return df


# def suggest_clicks(df):
#     # USE USER HISTORY AIDS AND TYPES
#     aids = df["aid"].to_list()
#     types = df["type"].to_list()
#     session = df["session"].unique()[0]
#     # UNIQUE AIDS
#     unique_aids = list(dict.fromkeys(aids[::-1]))
#     # df = df.loc[(df['type']==0)]
#     # unique_clicks = list(dict.fromkeys(df.aid.tolist()[::-1]))
#     # # USE USER HISTORY AIDS AND TYPES
#     # session = df[0]
#     # aids = df[1]
#     # types = df[2]
#     # # UNIQUE AIDS
#     # unique_aids = list(dict.fromkeys(aids[::-1]))
#     # unique_clicks = list(dict.fromkeys([f for i, f in enumerate(aids) if types[i] in [0]][::-1]))
    
#     # RERANK CANDIDATES USING WEIGHTS
#     if len(unique_aids) >= 20:
#         weights = np.logspace(0.1, 1, len(aids),base=2, endpoint=True) - 1
#         aids_temp = Counter()
#         # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
#         for aid, w, t in zip(aids, weights, types): 
#             aids_temp[aid] += w * type_weight_multipliers[t]
#         sorted_aids = [k for k, v in aids_temp.most_common(20)]
        
#         return sorted_aids
    
#     if session in click_session2index:
#         # USE item2vec
#         aids2 = index2key[click_indices[click_session2index[session]]]
#         # aids2 = list(aids2)
#         aids2 = [aid.split("_")[0] for aid in aids2 if int(aid.split("_")[1]) in [0, 1, 2]]
#         # RERANK CANDIDATES
#         top_aids2 = [aid2 for aid2 in aids2 if aid2 not in unique_aids]    
#         result = unique_aids + top_aids2[:20 - len(unique_aids)]
#     else:
#         result = []
    
#     # USE TOP20 TEST CLICKS
#     return result + list(top_clicks)[:20 - len(result)]


# def suggest_carts(df):
#     # USE USER HISTORY AIDS AND TYPES
#     aids = df["aid"].to_list()
#     types = df["type"].to_list()
#     session = df["session"].unique()[0]
#     # UNIQUE AIDS
#     unique_aids = list(dict.fromkeys(aids[::-1]))
#     df = df.loc[(df['type'] == 0) | (df['type'] == 1)]
#     unique_carts = list(dict.fromkeys(df.aid.tolist()[::-1]))
#     # # USE USER HISTORY AIDS AND TYPES
#     # session = df[0]
#     # aids = df[1]
#     # types = df[2]
#     # # UNIQUE AIDS
#     # unique_aids = list(dict.fromkeys(aids[::-1]))
#     # unique_carts = list(dict.fromkeys([f for i, f in enumerate(aids) if types[i] in [1]][::-1]))
#     # RERANK CANDIDATES USING WEIGHTS
#     if len(unique_aids) >= 20:
#         weights = np.logspace(0.1, 1, len(aids),base=2, endpoint=True) - 1
#         aids_temp = Counter()
#         # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
#         for aid, w, t in zip(aids, weights, types): 
#             aids_temp[aid] += w * type_weight_multipliers[t]
#         sorted_aids = [k for k, v in aids_temp.most_common(20)]
        
#         return sorted_aids
    
#     if session in cart_session2index:
#         # USE item2vec
#         aids2 = index2key[cart_indices[cart_session2index[session]]]
#         # aids2 = list(aids2)
#         aids2 = [aid.split("_")[0] for aid in aids2 if int(aid.split("_")[1]) in [0, 1]]
#         # RERANK CANDIDATES
#         top_aids2 = [aid2 for aid2 in aids2 if aid2 not in unique_carts]    
#         result = unique_carts + top_aids2[:20 - len(unique_carts)]
#     else:
#         result = []
    
#     # USE TOP20 TEST CLICKS
#     return result + list(top_carts)[:20 - len(result)]


# def suggest_orders(df):
#     # USE USER HISTORY AIDS AND TYPES
#     aids = df["aid"].to_list()
#     types = df["type"].to_list()
#     session = df["session"].unique()[0]
#     # UNIQUE AIDS
#     unique_aids = list(dict.fromkeys(aids[::-1]))
#     df = df.loc[(df['type'] == 1) | (df['type'] == 2)]
#     unique_orders = list(dict.fromkeys(df.aid.tolist()[::-1]))
#     # # USE USER HISTORY AIDS AND TYPES
#     # session = df[0]
#     # aids = df[1]
#     # types = df[2]
#     # # UNIQUE AIDS
#     # unique_aids = list(dict.fromkeys(aids[::-1]))
#     # unique_orders = list(dict.fromkeys([f for i, f in enumerate(aids) if types[i] in [2]][::-1]))
#     # RERANK CANDIDATES USING WEIGHTS
#     if len(unique_aids) >= 20:
#         weights = np.logspace(0.1, 1, len(aids),base=2, endpoint=True) - 1
#         aids_temp = Counter()
#         # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
#         for aid, w, t in zip(aids, weights, types): 
#             aids_temp[aid] += w * type_weight_multipliers[t]
#         sorted_aids = [k for k, v in aids_temp.most_common(20)]
        
#         return sorted_aids
    
#     if session in order_session2index:
#         # USE item2vec
#         aids2 = index2key[order_indices[order_session2index[session]]]
#         # aids2 = list(aids2)
#         aids2 = [aid.split("_")[0] for aid in aids2 if int(aid.split("_")[1]) in [1, 2]]
#         # RERANK CANDIDATES
#         top_aids2 = [aid2 for aid2 in aids2 if aid2 not in unique_orders]    
#         result = unique_orders + top_aids2[:20 - len(unique_orders)]
#     else:
#         result = []
    
#     # USE TOP20 TEST CLICKS
#     return result + list(top_orders)[:20 - len(result)]

In [7]:
# def get_similar_items_by_item2vec(item2vec_model, session_type):
#     input_df = get_input_data(Config.INPUT_DIR, "test")
#     # last 30 events per session
#     input_df = input_df.sort_values(by=["session", "ts"], ascending=[True, False])
#     input_df["n"] = input_df.groupby(["session"]).cumcount()
#     input_df = input_df.loc[input_df["n"] < 30].reset_index(drop=True)
#     # item2vecに含まれるaidにfintering
#     input_df["aid_type"] = input_df["aid"].astype(str) + "_" + input_df["type"].astype(str)
#     input_df = input_df.loc[input_df["aid_type"].isin(item2vec_model.index_to_key)].reset_index(drop=True)
#     # input_df = input_df.loc[input_df["aid"].isin(item2vec_model.index_to_key)].reset_index(drop=True)

#     # sessionにtype == clickがないイベントログ
#     input_df["is_type"] = input_df["type"].isin(session_type).astype("int8")
#     input_df["count_type"] = input_df.groupby("session")["is_type"].transform("sum")

#     output_df = input_df.loc[(input_df["type"].isin(session_type)) & (input_df["count_type"] > 0)]
#     output_df = cudf.concat([output_df, input_df.loc[input_df["count_type"] == 0]], ignore_index=True)
#     output_df = output_df.sort_values(by=["session", "ts"]).reset_index(drop=True).drop(columns=["is_type", "count_type"])
    
#     session_idx = np.cumsum(output_df.groupby(["session"], sort=True).size().to_numpy())
#     session2index = {k: v for v, k in enumerate(sorted(output_df["session"].unique().to_pandas().to_list()))}
    
#     # session embedding
#     session_embeddings = item2vec_model[output_df["aid_type"].to_numpy()]
#     # session_embeddings = item2vec_model[output_df["aid"].to_numpy()]
#     session_embeddings = np.split(session_embeddings, session_idx[:-1])
#     session_embeddings = np.array([np.mean(i, axis=0) for i in session_embeddings])
    
#     # most similars
#     model = NearestNeighbors(n_neighbors=20, metric="cosine", output_type="numpy")
#     model.fit(item2vec_model.vectors)
    
#     _, indices = model.kneighbors(session_embeddings)
    
#     del input_df, output_df, session_embeddings, model
#     gc.collect()
    
#     return session2index, indices

In [6]:
def get_similar_items(word2vec):
    X = cp.array(word2vec.vectors)
    model = NearestNeighbors(n_neighbors=21, metric="cosine", output_type="numpy")
    model.fit(X)
    
    _, indices = model.kneighbors(X)
    
    del X, model
    gc.collect()
    
    return indices

In [7]:
with trace.timer("load test set"):
    test_df = get_input_data(Config.INPUT_DIR, "test").to_pandas()

[cpu: 3.1GB(+2.5GB), gpu: 1.2GB(+1.0GB): 10.6sec] load test set 


In [8]:
with trace.timer("load candidate feature"):
    # type_weighted
    top_15_carts2orders = parquet2dict(Config.CANDIDATE_FEATURE_DIR / "top_15_carts_orders_0.pqt")
    for i in range(1, Config.DISK_PIECES):
        top_15_carts2orders.update(parquet2dict(Config.CANDIDATE_FEATURE_DIR / f"top_15_carts_orders_{i}.pqt"))
    
    # buy2buy
    top_15_buy2buy = parquet2dict(Config.CANDIDATE_FEATURE_DIR / "top_15_buy2buy_0.pqt")
    
    # clicks
    top_20_clicks = parquet2dict(Config.CANDIDATE_FEATURE_DIR / "top_20_clicks_0.pqt")
    for i in range(1, Config.DISK_PIECES):
        top_20_clicks.update(parquet2dict(Config.CANDIDATE_FEATURE_DIR / f"top_20_clicks_{i}.pqt"))
        
    top_clicks = test_df.loc[test_df["type"] == 0, "aid"].value_counts().index.to_numpy()[:20]
    top_carts = test_df.loc[test_df["type"] == 1, "aid"].value_counts().index.to_numpy()[:20]
    top_orders = test_df.loc[test_df["type"] == 2, "aid"].value_counts().index.to_numpy()[:20]
    
    # clicks
    click_model = KeyedVectors.load(str(Config.CANDIDATE_FEATURE_DIR / "item2vec_0.wordvectors"), mmap="r")
    click_index2key = np.array(click_model.index_to_key)
    click_indices = get_similar_items(click_model)
    
    # carts
    cart_model = KeyedVectors.load(str(Config.CANDIDATE_FEATURE_DIR / "item2vec_1.wordvectors"), mmap="r")
    cart_index2key = np.array(cart_model.index_to_key)
    cart_indices = get_similar_items(cart_model)
    
    # orders
    order_model = KeyedVectors.load(str(Config.CANDIDATE_FEATURE_DIR / "item2vec_2.wordvectors"), mmap="r")
    order_index2key = np.array(order_model.index_to_key)
    order_indices = get_similar_items(order_model)
    
    # all
    # item2vec = KeyedVectors.load(str(Config.CANDIDATE_FEATURE_DIR / "item2vec.wordvectors"), mmap="r")
    # index2key = np.array(item2vec.index_to_key)

[cpu: 12.0GB(+8.9GB), gpu: 1.5GB(+0.3GB): 277.3sec] load candidate feature 


In [11]:
# with trace.timer("load test set"):
#     temp = [group for name, group in test_df.sort_values(["session", "ts"]).groupby(["session"])]
#     test_list = [[i["session"].iloc[0], i["aid"].to_list(), i["type"].to_list()] for i in temp]
    
#     del temp, test_df
#     gc.collect()

In [9]:
with trace.timer("click inference"):
    # click_session2index, click_indices = get_similar_items_by_item2vec(item2vec, [0, 1, 2])
    # temp = df_parallelize_run(suggest_clicks, test_list)
    # pred_clicks = pd.Series([f[1] for f in temp], index=[f[0] for f in temp])
    pred_clicks = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(suggest_clicks)
    
    #del click_session2index, click_indices, click_model, click_index2key #temp
    gc.collect()

[cpu: 12.6GB(+0.6GB), gpu: 1.5GB(+0.0GB): 167.2sec] click inference 


In [10]:
with trace.timer("cart inference"):
    # cart_session2index, cart_indices = get_similar_items_by_item2vec(item2vec, [0, 1])
    # temp = df_parallelize_run(suggest_carts, test_list)
    # pred_carts = pd.Series([f[1] for f in temp], index=[f[0] for f in temp])
    pred_carts = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(suggest_carts)
    
    #del cart_session2index, cart_indices, #cart_model, cart_index2key #temp
    gc.collect()

[cpu: 13.2GB(+0.6GB), gpu: 1.5GB(+0.0GB): 751.6sec] cart inference 


In [11]:
with trace.timer("order inference"):
    # order_session2index, order_indices = get_similar_items_by_item2vec(item2vec, [1, 2])
    # temp = df_parallelize_run(suggest_orders, test_list)
    # pred_orders = pd.Series([f[1] for f in temp], index=[f[0] for f in temp])
    pred_orders = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(suggest_buys)
    
    #del order_session2index, order_indices, #order_model, order_index2key #temp
    gc.collect()

[cpu: 13.8GB(+0.6GB), gpu: 1.5GB(+0.0GB): 700.7sec] order inference 


In [12]:
with trace.timer("make pred_df"):
    pred_clicks_df = pd.DataFrame(pred_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
    pred_carts_df = pd.DataFrame(pred_carts.add_suffix("_carts"), columns=["labels"]).reset_index()
    pred_orders_df = pd.DataFrame(pred_orders.add_suffix("_orders"), columns=["labels"]).reset_index()
    pred_df = pd.concat([pred_clicks_df, pred_carts_df, pred_orders_df], axis=0).reset_index(drop=True)
    pred_df.columns = ["session_type", "labels"]
    pred_df["labels"] = pred_df["labels"].apply(lambda x: " ".join(map(str, x)))
    pred_df.to_csv(
        Config.OUTPUT_DIR / f"exp{Config.EXP_ID}_validation.csv" if Config.VALIDATION else Config.OUTPUT_DIR / f"exp{Config.EXP_ID}_sub.csv",
        index=False
    )


pred_df.head()

[cpu: 15.2GB(+1.4GB), gpu: 1.5GB(+0.0GB): 37.1sec] make pred_df 


Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 438191 731692 1790770 942...
1,12899780_clicks,1142000 736515 973453 582732 1502122 889686 17...
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 7594...
3,12899782_clicks,834354 595994 740494 889671 987399 779477 1344...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...


### validation score

In [None]:
with trace.timer("delete temp file"):
    del test_df
    del top_15_carts2orders, top_15_buy2buy, top_20_clicks, top_clicks, top_carts, top_orders
    del pred_clicks, pred_buys
    del pred_clicks_df, pred_carts_df, pred_orders_df
    
    gc.collect()

In [20]:
with trace.timer("compute cv score"):
    # COMPUTE METRIC
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for t in ['clicks','carts','orders']:
        sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
        test_labels = pd.read_parquet(Config.INPUT_DIR / "test_labels.parquet")
        test_labels = test_labels.loc[test_labels['type']==t]
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)

    print('=============')
    print('Overall Recall =',score)
    print('=============')
    
    del sub, test_labels, recall, score
    gc.collect()

clicks recall = 0.523267977316038
carts recall = 0.4093993163812367
orders recall = 0.6516749489393866
Overall Recall = 0.5661515620096067


[cpu: 16.6GB(+0.8GB), gpu: 1.3GB(+0.0GB): 137.6sec] compute cv score 
