In [1]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing
import gc
import glob

import warnings
warnings.filterwarnings('ignore')

DEBUG = False  
SAMPLING = 100  # Reduce it to improve performance
INPUT_DIR = Path("../../input")

In [6]:
def get_input_data(input_dir: Path, phase: str):
    dfs = []
    for path in sorted(list(input_dir.glob(f"{phase}_parquet/*.parquet"))):
        dfs.append(pd.read_parquet(path))
    
    return pd.concat(dfs, axis=0).astype({"ts": "datetime64[ms]"})

# train_df = get_input_data(INPUT_DIR, "train")
test_df = get_input_data(INPUT_DIR, "test")

### generate aid pairs

In [2]:
def gen_pairs(df):
    df = df.query('session % @SAMPLING == 0').groupby('session', as_index=False, sort=False).apply(lambda g: g.tail(30)).reset_index(drop=True)
    df = pd.merge(df, df, on='session')
    pairs = df.query('abs(ts_x - ts_y) < 24 * 60 * 60 * 1000 and aid_x != aid_y')[['session', 'aid_x', 'aid_y']].drop_duplicates()
    
    return pairs[['aid_x', 'aid_y']].to_numpy()


def gen_aid_pairs():
    all_pairs = defaultdict(lambda: Counter())
    all_pair_chunks = []

    with tqdm(glob.glob('../../input/*_parquet/*'), desc='Chunks') as prog:
        for idx, chunk_file in enumerate(prog):
        # for idx, chunk_file in enumerate(tqdm(sorted(list(INPUT_DIR.glob(f"*_parquet/*.parquet"))))):
            with multiprocessing.get_context("fork").Pool(4) as p:
                chunk = pd.read_parquet(chunk_file).drop(columns=['type'])
                pair_chunks = p.map(gen_pairs, np.array_split(chunk, 120))            
                pair_chunks = np.concatenate(pair_chunks, axis=0)
                all_pair_chunks.append(pair_chunks)

                if DEBUG and idx >= 3:
                    break
                del chunk, pair_chunks
                gc.collect()

    df = pd.DataFrame(data=np.concatenate(all_pair_chunks), columns=['aid1', 'aid2'])
    top_aids = df.groupby('aid1').apply(lambda df: Counter(df.aid2).most_common(40)).to_dict()

    return top_aids

In [4]:
top_40 = gen_aid_pairs()
len(top_40)

332929

In [None]:
# import ray
# ray.init(num_cpus=4)

In [None]:
# @ray.remote
# def gen_pairs_ray(df):
#     SAMPLING = 1000
#     df = df.query('session % @SAMPLING == 0').groupby('session', as_index=False, sort=False).apply(lambda g: g.tail(30)).reset_index(drop=True)
#     df = pd.merge(df, df, on='session')
#     pairs = df.query('abs(ts_x - ts_y) < 24 * 60 * 60 * 1000 and aid_x != aid_y')[['session', 'aid_x', 'aid_y']].drop_duplicates()
    
#     return pairs[['aid_x', 'aid_y']].to_numpy()

# def gen_aid_pairs_ray():
#     all_pairs = defaultdict(lambda: Counter())
#     all_pair_chunks = []

#     with tqdm(glob.glob('../../input/*_parquet/*'), desc='Chunks') as prog:
#         for idx, chunk_file in enumerate(prog):
#             chunk = pd.read_parquet(chunk_file).drop(columns=['type'])
#             pair_chunks = ray.get([gen_pairs_ray.remote(i) for i in np.array_split(chunk, 120)])
#             # pair_chunks = p.map(gen_pairs, np.array_split(chunk, 120)) 
#             pair_chunks = np.concatenate(pair_chunks, axis=0)
#             all_pair_chunks.append(pair_chunks)

#             if DEBUG and idx >= 3:
#                 break
#             del chunk, pair_chunks
#             gc.collect()

#     df = pd.DataFrame(data=np.concatenate(all_pair_chunks), columns=['aid1', 'aid2'])
#     top_aids = df.groupby('aid1').apply(lambda df: Counter(df.aid2).most_common(40)).to_dict()

#     return top_aids

### test set inference

In [8]:
top_40_cnt = {aid: Counter(dict(top)) for aid, top in top_40.items()}

In [19]:
def suggest_aids(df):
    aids = df.tail(20).aid.tolist()
    
    if len(aids) >= 20:
        # We have enough events in the test session
        return aids
    
    # Append it with AIDs from the co-visitation matrix. 
    aids = set(aids)
    new_aids = Counter()
    for aid in aids:
        new_aids.update(top_40_cnt.get(aid))
    
    top_aids2 = [aid2 for aid2, cnt in new_aids.most_common(20) if aid2 not in aids]        
    return list(aids) + top_aids2[:20 - len(aids)]

In [20]:
pred_df = test_df.sort_values(["session", "type", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_aids(x)
)
pred_df.head()

session
12899779                                              [59625]
12899780    [1142000, 736515, 582732, 973453, 1502122, 158...
12899781    [199008, 57315, 141736, 918667, 194067, 146057...
12899782    [406001, 889671, 834354, 889671, 1099390, 9873...
12899783    [255297, 1114789, 1817895, 198385, 1729553, 17...
dtype: object

In [31]:
data_list = []

for session_id, sorted_aids in tqdm(pred_df.reset_index().to_numpy()):
    sorted_aids_20_str = " ".join(map(str, sorted_aids))
    data_list.append([f"{session_id}_clicks", sorted_aids_20_str])
    data_list.append([f"{session_id}_carts", sorted_aids_20_str])
    data_list.append([f"{session_id}_orders", sorted_aids_20_str])

submit_df = pd.DataFrame(data_list, columns=["session_type", "labels"])
submit_df

  0%|          | 0/1671803 [00:00<?, ?it/s]

Unnamed: 0,session_type,labels
0,12899779_clicks,59625
1,12899779_carts,59625
2,12899779_orders,59625
3,12899780_clicks,1142000 736515 582732 973453 1502122 1586171 4...
4,12899780_carts,1142000 736515 582732 973453 1502122 1586171 4...
...,...,...
5015404,14571580_carts,202353
5015405,14571580_orders,202353
5015406,14571581_clicks,1100210 219925 1547466 462056 1571699 618373 1...
5015407,14571581_carts,1100210 219925 1547466 462056 1571699 618373 1...


In [33]:
Path("../../output/exp003").mkdir(exist_ok=True)
submit_df.to_csv("../../output/exp003/sub_exp003_sampling_100.csv", index=False)