# exp006
- [Item type vs multiple clicks vs latest items](https://www.kaggle.com/code/ingvarasgalinskas/item-type-vs-multiple-clicks-vs-latest-items)

In [3]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from itertools import chain

from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing
import gc
import glob

import warnings
warnings.filterwarnings('ignore')


# Root input directory; get_input_data() reads `{phase}_parquet/*.parquet` beneath it.
INPUT_DIR = Path("../../input")

In [2]:
# NOTE(review): duplicates the INPUT_DIR assignment made in the import cell above;
# one of the two definitions can be removed.
INPUT_DIR = Path("../../input")
def get_input_data(input_dir: Path, phase: str):
    """Load every parquet shard of one phase into a single DataFrame.

    Parameters
    ----------
    input_dir : Path
        Directory that contains the ``{phase}_parquet`` sub-directory.
    phase : str
        Prefix of the shard directory (the notebook uses "train" and "test").

    Returns
    -------
    pandas.DataFrame
        All shards concatenated in sorted path order, re-indexed 0..n-1.

    Raises
    ------
    ValueError
        If no file matches ``{phase}_parquet/*.parquet`` under ``input_dir``.
    """
    pattern = f"{phase}_parquet/*.parquet"
    # Sort so the row order does not depend on filesystem listing order.
    paths = sorted(input_dir.glob(pattern))
    if not paths:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but say which location was empty.
        raise ValueError(f"No files match {pattern!r} under {input_dir}")

    return pd.concat(
        [pd.read_parquet(path) for path in paths],
        axis=0,
        ignore_index=True,
    )

# Load both splits; each call concatenates all parquet shards of that phase.
train_df = get_input_data(input_dir=INPUT_DIR, phase="train")
test_df = get_input_data(input_dir=INPUT_DIR, phase="test")

### co-visitation matrix

In [3]:
# Global co-visitation store: next_aids[aid_x][aid_y] is a pair count.
# Missing outer keys are created on first access as empty Counters.
next_aids = defaultdict(Counter)

def update_covisitation_counter(df, chunk_size=30000, max_recent=30,
                                max_gap_ms=24 * 60 * 60 * 1000, counter=None):
    """Accumulate co-visitation pair counts from session events.

    For every session, the last ``max_recent`` rows are paired with each other.
    A pair (aid_x, aid_y) is counted once per session when the two aids differ
    and at least one pair of their events is closer than ``max_gap_ms``.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with columns "session", "aid" and "ts". The frame must be
        indexed by session so that ``df.loc[first:last]`` selects a contiguous
        session range (the notebook passes a one-level MultiIndex built from
        the "session" column). Rows within a session are presumably already
        in chronological order -- TODO confirm against the input data.
    chunk_size : int
        Number of sessions processed per iteration; bounds the size of the
        per-chunk self-merge.
    max_recent : int
        How many trailing rows of each session take part in pairing.
    max_gap_ms : int
        Exclusive upper bound on ``|ts_y - ts_x|``, expressed in the unit of
        ``ts``. The default equals 24 hours if ``ts`` is in milliseconds.
    counter : mapping of aid -> Counter, optional
        Destination updated in place via ``counter[aid_x][aid_y] += 1``; it must
        create missing inner counters on access (e.g. ``defaultdict(Counter)``).
        Defaults to the notebook-level ``next_aids``.

    Returns
    -------
    None
        Counts are added to ``counter``; calling this twice with the same data
        therefore doubles them.
    """
    if counter is None:
        counter = next_aids

    sessions = df["session"].unique()
    last_pos = len(sessions) - 1

    for start in tqdm(range(0, len(sessions), chunk_size)):
        stop = min(last_pos, start + chunk_size - 1)
        # Label slice on the session index is inclusive on both ends. Only the
        # columns needed below are kept, so the quadratic self-merge stays as
        # narrow as possible.
        chunk = (
            df.loc[sessions[start]: sessions[stop]][["session", "aid", "ts"]]
            .reset_index(drop=True)
        )
        # Last `max_recent` rows of each session, in their original order.
        recent = chunk.groupby("session").tail(max_recent)
        # Self-join per session: every ordered pair of events in a session.
        pairs = recent.merge(recent, on="session")

        gap = (pairs["ts_y"] - pairs["ts_x"]).abs()
        keep = (pairs["aid_x"] != pairs["aid_y"]) & (gap < max_gap_ms)
        # Count each (aid_x, aid_y) at most once per session.
        pairs = pairs.loc[keep, ["session", "aid_x", "aid_y"]].drop_duplicates()

        # Row order is preserved so that insertion order into the Counters
        # (and with it most_common() tie-breaking) stays deterministic.
        for aid_x, aid_y in zip(pairs["aid_x"].to_numpy(), pairs["aid_y"].to_numpy()):
            counter[aid_x][aid_y] += 1

        # Free the large per-chunk intermediates before the next iteration.
        del chunk, recent, pairs, gap, keep
        gc.collect()

In [4]:
%%time
# Work on a copy of train_df whose index is a one-level MultiIndex of the
# "session" column; update_covisitation_counter slices session ranges via .loc.
multiindex_df = train_df.copy()
multiindex_df.index = pd.MultiIndex.from_frame(train_df[["session"]])
# Adds the train pair counts to the global next_aids (re-running this cell adds them again).
update_covisitation_counter(multiindex_df)

# Drop the indexed copy as soon as the counts are accumulated.
del multiindex_df
gc.collect()

  3%|▎         | 14/430 [01:35<43:36,  6.29s/it] 

In [None]:
%%time
# Same procedure for the test events: index a copy by session, then add its
# pair counts on top of whatever next_aids already holds.
multiindex_df = test_df.copy()
multiindex_df.index = pd.MultiIndex.from_frame(test_df[["session"]])
update_covisitation_counter(multiindex_df)

# Drop the indexed copy as soon as the counts are accumulated.
del multiindex_df
gc.collect()

  0%|          | 0/56 [00:00<?, ?it/s]

19

In [None]:
# For every aid keep the (up to) 20 aids it was most often paired with,
# ordered from most to least frequent.
top_records = [
    {"aid1": aid, "aid2": [aid2 for aid2, _ in cnt.most_common(20)]}
    for aid, cnt in tqdm(next_aids.items())
]

aid_top_20_df = pd.DataFrame(top_records).set_index("aid1")
del top_records
# Plain dict (aid -> list of related aids) for fast lookups during inference.
aid_top_20 = aid_top_20_df["aid2"].to_dict()
len(aid_top_20)

  0%|          | 0/1837169 [00:00<?, ?it/s]

### inference

In [29]:
def suggest_aids(df, top_k=20, covisit=None):
    """Recommend up to ``top_k`` aids for one session.

    Parameters
    ----------
    df : pandas.DataFrame
        Events of a single session with an "aid" column; rows are expected in
        the order they should be read (the caller sorts by "ts" beforehand).
    top_k : int
        Maximum number of aids to return.
    covisit : mapping of aid -> list of aids, optional
        Related aids per aid. Defaults to the notebook-level ``aid_top_20``.

    Returns
    -------
    list
        The session's own aids, last-seen first and de-duplicated, then filled
        up with the most frequent related aids not already in the session.
        May hold fewer than ``top_k`` items when too few candidates exist.
    """
    if covisit is None:
        covisit = aid_top_20

    # Reverse so the last row comes first; dict.fromkeys drops repeats while
    # keeping the first (i.e. latest) occurrence.
    aids = list(dict.fromkeys(df["aid"].to_list()[::-1]))

    if len(aids) >= top_k:
        # The session alone already provides enough aids.
        return aids[:top_k]

    seen = set(aids)
    # Pool the related aids of every session aid and rank them by how many
    # session aids point at them.
    candidates = Counter(
        chain.from_iterable(covisit[aid] for aid in aids if aid in covisit)
    )
    # The top_k ranked candidates contain at most len(aids) already-seen aids,
    # so filtering after truncation still leaves enough to fill the list.
    fill = [aid2 for aid2, _ in candidates.most_common(top_k) if aid2 not in seen]

    return aids + fill[:top_k - len(aids)]

In [30]:
# One recommendation list per test session, computed from its events in
# chronological order.
pred_df = (
    test_df
    .sort_values(["session", "ts"])
    .groupby(["session"])
    .apply(suggest_aids)
)
pred_df.head()

session
12899779    [59625, 438191, 731692, 1790770, 737445, 12535...
12899780    [1142000, 736515, 973453, 582732, 1502122, 889...
12899781    [918667, 199008, 194067, 57315, 141736, 146057...
12899782    [1007613, 595994, 1033148, 834354, 479970, 169...
12899783    [1817895, 607638, 1754419, 1216820, 1729553, 3...
dtype: object

In [51]:
# NumPy-array variant: collect each session's aids as a list first, then loop
# over the resulting object array instead of applying suggest_aids per group.
pred = test_df.sort_values(["session", "ts"]).groupby(["session"])["aid"].apply(list)

rec_list = []
for session_aids in pred.to_numpy():
    # Latest aid first, repeats removed.
    aids = list(dict.fromkeys(session_aids[::-1]))
    if len(aids) >= 20:
        # Cap at 20 so this variant returns the same lists as suggest_aids.
        rec_list.append(aids[:20])
    else:
        # Fill up with the most frequent related aids not already in the session.
        aids2 = list(chain(*[aid_top_20[aid] for aid in aids if aid in aid_top_20]))
        top_aids20 = [aid2 for aid2, cnt in Counter(aids2).most_common(20) if aid2 not in aids]
        rec_list.append(aids + top_aids20[:20 - len(aids)])


len(rec_list)

1671803

In [49]:
# Variant without groupby.apply: iterate over the session groups directly.
rec_list = []
for _, group in test_df.sort_values(["session", "ts"]).groupby(["session"]):
    # Latest aid first, repeats removed.
    aids = list(dict.fromkeys(group["aid"].to_list()[::-1]))
    if len(aids) >= 20:
        # Cap at 20 so this variant returns the same lists as suggest_aids.
        rec_list.append(aids[:20])
    else:
        # Fill up with the most frequent related aids not already in the session.
        aids2 = list(chain(*[aid_top_20[aid] for aid in aids if aid in aid_top_20]))
        top_aids20 = [aid2 for aid2, cnt in Counter(aids2).most_common(20) if aid2 not in aids]
        rec_list.append(aids + top_aids20[:20 - len(aids)])

len(rec_list)

1671803

In [31]:
# Build the submission rows: the same label string is emitted for the
# clicks, carts and orders row of every session, in that order.
data_list = []

for session_id, sorted_aids in tqdm(pred_df.reset_index().to_numpy()):
    labels = " ".join(str(aid) for aid in sorted_aids)
    for event_type in ("clicks", "carts", "orders"):
        data_list.append([f"{session_id}_{event_type}", labels])

submit_df = pd.DataFrame(data_list, columns=["session_type", "labels"])
submit_df

  0%|          | 0/1671803 [00:00<?, ?it/s]

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 438191 731692 1790770 737445 1253524 162...
1,12899779_carts,59625 438191 731692 1790770 737445 1253524 162...
2,12899779_orders,59625 438191 731692 1790770 737445 1253524 162...
3,12899780_clicks,1142000 736515 973453 582732 1502122 889686 17...
4,12899780_carts,1142000 736515 973453 582732 1502122 889686 17...
...,...,...
5015404,14571580_carts,202353 1314576 433425 1231403 871658 925638 35...
5015405,14571580_orders,202353 1314576 433425 1231403 871658 925638 35...
5015406,14571581_clicks,1100210 1684953 462056 1158237 1401429 622489 ...
5015407,14571581_carts,1100210 1684953 462056 1158237 1401429 622489 ...


In [32]:
# NOTE(review): the notebook title says exp006 but the submission is written
# under exp005 -- confirm this does not overwrite another experiment's output.
output_dir = Path("../../output/exp005")
# parents=True: also create ../../output when it does not exist yet instead of
# failing with FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)
submit_df.to_csv(output_dir / "sub_exp005.csv", index=False)