# exp004
- 共起行列の高速実装パターンを試す
- [Fast Co-Visitation Matrix](https://www.kaggle.com/code/dpalbrecht/fast-co-visitation-matrix/notebook)
- [Test Data Leak - LB Boost](https://www.kaggle.com/code/cdeotte/test-data-leak-lb-boost/notebook?scriptVersionId=110154433)

In [1]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing
import gc
import glob

import warnings
warnings.filterwarnings('ignore')

DEBUG = False
SAMPLING = 1  # Reduce it to improve performance
INPUT_DIR = Path("../../input")

In [2]:
INPUT_DIR = Path("../../input")
def get_input_data(input_dir: Path, phase: str):
    dfs = []
    for path in sorted(list(input_dir.glob(f"{phase}_parquet/*.parquet"))):
        chunk = pd.read_parquet(path)
        # chunk["session"] = chunk["session"].astype("int32")
        # chunk["ts"] = (chunk["ts"] / 1000).astype("int32")
        dfs.append(chunk)
    
    return pd.concat(dfs, axis=0).reset_index(drop=True) #.astype({"ts": "datetime64[s]"})

# train_df = get_input_data(INPUT_DIR, "train")
test_df = get_input_data(INPUT_DIR, "test")

### generate aid pairs - original

In [35]:
def gen_pairs(df):
    df = df.query('session % @SAMPLING == 0').groupby('session', as_index=False, sort=False).apply(lambda g: g.tail(30)).reset_index(drop=True)
    df = pd.merge(df, df, on='session')
    pairs = df.query('abs(ts_x - ts_y) < 24 * 60 * 60 * 1000 and aid_x != aid_y')[['session', 'aid_x', 'aid_y']].drop_duplicates()
    
    return pairs[['aid_x', 'aid_y']].to_numpy()


def gen_aid_pairs():
    all_pairs = defaultdict(lambda: Counter())
    all_pair_chunks = []

    with tqdm(glob.glob('../../input/test_parquet/*'), desc='Chunks') as prog:
        for idx, chunk_file in enumerate(prog):
            with multiprocessing.get_context("fork").Pool(4) as p:
                chunk = pd.read_parquet(chunk_file).drop(columns=['type'])
                pair_chunks = p.map(gen_pairs, np.array_split(chunk, 120))            
                pair_chunks = np.concatenate(pair_chunks, axis=0)
                all_pair_chunks.append(pair_chunks)

                if DEBUG and idx >= 3:
                    break
                del chunk, pair_chunks
                gc.collect()

    df = pd.DataFrame(data=np.concatenate(all_pair_chunks), columns=['aid1', 'aid2'])
    top_aids = df.groupby('aid1').apply(lambda df: Counter(df.aid2).most_common(40)).to_dict()

    return top_aids

In [36]:
top_40 = gen_aid_pairs()
len(top_40)

Chunks:   0%|          | 0/17 [00:00<?, ?it/s]

692061

In [9]:
def gen_pairs(df):
    df = df.query('session % @SAMPLING == 0').groupby('session', as_index=False, sort=False).apply(lambda g: g.tail(30)).reset_index(drop=True)
    df = pd.merge(df, df, on='session')
    pairs = df.query('abs(ts_x - ts_y) < 24 * 60 * 60 * 1000 and aid_x != aid_y')[['session', 'aid_x', 'aid_y']].drop_duplicates()
    
    return pairs[['aid_x', 'aid_y']].to_numpy()


def gen_aid_pairs():
    all_pairs = defaultdict(lambda: Counter())
    all_pair_chunks = []

    with tqdm(glob.glob('../../input/*_parquet/*'), desc='Chunks') as prog:
        with multiprocessing.get_context("fork").Pool(4) as p:
            for idx, chunk_file in enumerate(prog):
                chunk = pd.read_parquet(chunk_file).drop(columns=['type'])
                pair_chunks = p.map(gen_pairs, np.array_split(chunk, 120))            
                pair_chunks = np.concatenate(pair_chunks, axis=0)
                all_pair_chunks.append(pair_chunks)

                if DEBUG and idx >= 3:
                    break
                del chunk, pair_chunks
                gc.collect()

    df = pd.DataFrame(data=np.concatenate(all_pair_chunks), columns=['aid1', 'aid2'])
    top_aids = df.groupby('aid1').apply(lambda df: Counter(df.aid2).most_common(40)).to_dict()

    return top_aids

In [10]:
top_40 = gen_aid_pairs()
len(top_40)

Chunks:   0%|          | 0/146 [00:00<?, ?it/s]

332929

### generate aid pairs - Chris

In [69]:
import sys

def gen_pairs(df):
    df = df.query('session % @SAMPLING == 0').groupby('session', as_index=False, sort=False).apply(lambda g: g.tail(30)).reset_index(drop=True)
    if df["session"].isin([12975575]).sum() != 0:
        print("include: 12975575")
    df = pd.merge(df, df, on='session')
    pairs = df.query('abs(ts_x - ts_y) < 24 * 60 * 60 * 1000 and aid_x != aid_y')[['session', 'aid_x', 'aid_y', 'ts_x', 'type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    return pairs[['aid_x', 'aid_y', 'ts_x', 'type_y', 'session']].to_numpy()


def gen_aid_pairs():
    sessions = []
    all_pairs = defaultdict(lambda: Counter())
    with tqdm(glob.glob('../../input/test_parquet/*'), desc='Chunks') as prog:
        with multiprocessing.get_context("fork").Pool(4) as p:
            for idx, chunk_file in enumerate(prog):
                chunk = pd.read_parquet(chunk_file)#.drop(columns=['type'])
                pair_chunks = p.map(gen_pairs, np.array_split(chunk.head(100000000 if not DEBUG else 10000), 120))
                for pairs in pair_chunks:
                    for aid1, aid2, ts, typ, s in pairs:
                        w = 1 + 3 * (ts - 1659304800025)/(1662328791563-1659304800025)
                        # HERE WE CAN BOOST WEIGHT, i.e. IF TYP=="ORDERS": W *= 10.0
                        # THEN SAVE THIS MATRIX AS THE "ORDERS" MATRIX
                        # WE CAN MAKE 3 DIFFERENT CO-VISITATION MATRICES
                        all_pairs[aid1][aid2] +=w
                        sessions.append(s)
                prog.set_description(f'Mem: {sys.getsizeof(object) // (2 ** 20)}MB')

                if DEBUG and idx >= 2:
                    break
                del chunk, pair_chunks
                gc.collect()
                
    return all_pairs, sessions

In [70]:
top_40, top_40_sessions = gen_aid_pairs()
len(top_40)

Chunks:   0%|          | 0/17 [00:00<?, ?it/s]

include: 12975575
include: 12975575


692061

In [54]:
len(top_40)

692061

In [42]:
len(np.unique(top_40_sessions))

799564

### fast co-visitation matrix

In [90]:
next_aids = defaultdict(lambda: Counter())

def update_covisitation_counter(df, chunk_size=30000):
    session_cnt = []
    sessions = df["session"].unique()
    # sessions = df.session.unique()

    for i in tqdm(range(0, len(sessions), chunk_size)):
        # get current chunk of sessions
        # print(f"Now session : {i} to {min(sessions.shape[0]-1, i+chunk_size-1)}")
        consecutive_aids = df.loc[sessions[i]: sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index(drop=True)
        # get the 30 most recent products per session
        consecutive_aids = consecutive_aids.groupby("session", as_index=False).nth(list(range(-30, 0))).reset_index(drop=True)
        # Merge sessions onto themselves so we can find pairs of products that are actioned on together
        consecutive_aids = consecutive_aids.merge(consecutive_aids, on=["session"])
        # Remove entries that are themselves
        consecutive_aids = consecutive_aids[consecutive_aids["aid_x"] != consecutive_aids["aid_y"]]
        # Calculate how many days between products
        consecutive_aids["days_elapsed"] = abs(consecutive_aids["ts_y"] - consecutive_aids["ts_x"])
        # Only keep products that are within a day of each other
        # consecutive_aids = consecutive_aids[(consecutive_aids["days_elapsed"] > 0) & (consecutive_aids["days_elapsed"] <= 1)]
        consecutive_aids = consecutive_aids[consecutive_aids["days_elapsed"] < 24 * 60 * 60 * 1000]


        consecutive_aids = consecutive_aids.drop_duplicates(subset=["session", "aid_x", "aid_y"]).reset_index(drop=True)
        for aid_x, aid_y, s in zip(consecutive_aids["aid_x"].to_numpy(), consecutive_aids["aid_y"].to_numpy(), consecutive_aids["session"].to_numpy()):
            next_aids[aid_x][aid_y] += 1
            session_cnt.append(s)

    return session_cnt


# multiindex_df = train_df.copy()
# multiindex_df.index = pd.MultiIndex.from_frame(train_df[["session"]])
# update_covisitation_counter(multiindex_df)

multiindex_df = test_df.copy()
multiindex_df.index = pd.MultiIndex.from_frame(test_df[["session"]])
sessions = update_covisitation_counter(multiindex_df)

del multiindex_df

  0%|          | 0/56 [00:00<?, ?it/s]

In [91]:
len(next_aids)

691800

In [50]:
len(np.unique(sessions))

799651

In [75]:
len(set(top_40.keys()) - set(next_aids.keys()))

292

In [22]:
len(set(next_aids.keys()) - set(top_40.keys()))

31

In [97]:
multiindex_df = test_df[test_df["session"] == 14536702].copy()
multiindex_df.index = pd.MultiIndex.from_frame(multiindex_df[["session"]])

consecutive_aids = multiindex_df.reset_index(drop=True).groupby("session", as_index=False).nth(list(range(-30, 0))).reset_index(drop=True)
consecutive_aids = consecutive_aids.merge(consecutive_aids, on=["session"])
# Remove entries that are themselves
consecutive_aids = consecutive_aids[consecutive_aids["aid_x"] != consecutive_aids["aid_y"]]
# Calculate how many days between products
consecutive_aids["days_elapsed"] = abs(consecutive_aids["ts_y"] - consecutive_aids["ts_x"])
# Only keep products that are within a day of each other
# consecutive_aids = consecutive_aids[(consecutive_aids["days_elapsed"] > 0) & (consecutive_aids["days_elapsed"] <= 1)]
consecutive_aids = consecutive_aids[consecutive_aids["days_elapsed"] < 24 * 60 * 60 * 1000]
consecutive_aids

Unnamed: 0,session,aid_x,ts_x,type_x,aid_y,ts_y,type_y,days_elapsed
1,14536702,1716469,1662326129802,clicks,835617,1662326152204,clicks,22402
2,14536702,1716469,1662326129802,clicks,365958,1662326166408,clicks,36606
3,14536702,1716469,1662326129802,clicks,835617,1662326178866,clicks,49064
4,14536702,1716469,1662326129802,clicks,857579,1662326196638,clicks,66836
5,14536702,1716469,1662326129802,clicks,835617,1662326213112,clicks,83310
...,...,...,...,...,...,...,...,...
894,14536702,1716469,1662327384958,clicks,1273367,1662327019988,clicks,364970
895,14536702,1716469,1662327384958,clicks,1273367,1662327121784,clicks,263174
896,14536702,1716469,1662327384958,clicks,1273367,1662327166195,clicks,218763
897,14536702,1716469,1662327384958,clicks,1394329,1662327188640,clicks,196318


- ラスト30に含まれないaidがある？

In [98]:
consecutive_aids["aid_x"].unique()

array([1716469,  835617,  365958,  857579,  819992,  870366, 1394329,
        229849, 1541658,  721434,  683869,  838400, 1273367])

In [76]:
df = test_df[test_df["aid"] == 10357].copy()
df["ts_"] = df["ts"].astype("datetime64[ms]")
df

Unnamed: 0,session,aid,ts,type,ts_
6814041,14536702,10357,1662324443085,clicks,2022-09-04 20:47:23.085


In [89]:
a = test_df[test_df["session"] == 14536702]
a = a.merge(a, on="session")
a = a[(a["aid_x"] != a["aid_y"])]
a["days_elapsed"] = abs(a["ts_x"] - a["ts_y"]) < 24 * 60 * 60 * 1000
a = a[a["days_elapsed"] == True][['session', 'aid_x', 'aid_y', 'ts_x', 'type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
a[a["aid_x"] == 10357]

Unnamed: 0,session,aid_x,aid_y,ts_x,type_y
2688,14536702,10357,1196569,1662324443085,clicks
2689,14536702,10357,1716469,1662324443085,clicks
2690,14536702,10357,1019678,1662324443085,clicks
2692,14536702,10357,229849,1662324443085,clicks
2693,14536702,10357,482569,1662324443085,clicks
2694,14536702,10357,532827,1662324443085,clicks
2695,14536702,10357,940014,1662324443085,clicks
2696,14536702,10357,770650,1662324443085,clicks
2698,14536702,10357,214671,1662324443085,clicks
2699,14536702,10357,1628069,1662324443085,clicks


In [52]:
set(sessions) - set(top_40_sessions)

{12911752,
 12955034,
 12975575,
 13001414,
 13002929,
 13007872,
 13086982,
 13091290,
 13132452,
 13152390,
 13154863,
 13164756,
 13191650,
 13197193,
 13213805,
 13241338,
 13252199,
 13284745,
 13296333,
 13318699,
 13336536,
 13382884,
 13383637,
 13385137,
 13387720,
 13472551,
 13473366,
 13498183,
 13506257,
 13513662,
 13522732,
 13568916,
 13585284,
 13592044,
 13598867,
 13612625,
 13642124,
 13656324,
 13678857,
 13701527,
 13712408,
 13731459,
 13760834,
 13821368,
 13830682,
 13844795,
 13846449,
 13862179,
 13864579,
 13900530,
 13963249,
 13998906,
 14035628,
 14052032,
 14062040,
 14063766,
 14070143,
 14085033,
 14102531,
 14106124,
 14115152,
 14117046,
 14118929,
 14125897,
 14132299,
 14141659,
 14164674,
 14206424,
 14208763,
 14212927,
 14295428,
 14302392,
 14311387,
 14352634,
 14355096,
 14379025,
 14397375,
 14415770,
 14440013,
 14464939,
 14496226,
 14527438,
 14540026,
 14540582,
 14542997,
 14543628,
 14549062}

In [33]:
multiindex_df = test_df.copy()
multiindex_df.index = pd.MultiIndex.from_frame(test_df[["session"]])
multiindex_df["session"].unique().shape[0]

1671803

### test set inference

In [8]:
top_40_cnt = {aid: Counter(dict(top)) for aid, top in top_40.items()}

In [19]:
def suggest_aids(df):
    aids = df.tail(20).aid.tolist()
    
    if len(aids) >= 20:
        # We have enough events in the test session
        return aids
    
    # Append it with AIDs from the co-visitation matrix. 
    aids = set(aids)
    new_aids = Counter()
    for aid in aids:
        new_aids.update(top_40_cnt.get(aid))
    
    top_aids2 = [aid2 for aid2, cnt in new_aids.most_common(20) if aid2 not in aids]        
    return list(aids) + top_aids2[:20 - len(aids)]

In [20]:
pred_df = test_df.sort_values(["session", "type", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_aids(x)
)
pred_df.head()

session
12899779                                              [59625]
12899780    [1142000, 736515, 582732, 973453, 1502122, 158...
12899781    [199008, 57315, 141736, 918667, 194067, 146057...
12899782    [406001, 889671, 834354, 889671, 1099390, 9873...
12899783    [255297, 1114789, 1817895, 198385, 1729553, 17...
dtype: object

In [31]:
data_list = []

for session_id, sorted_aids in tqdm(pred_df.reset_index().to_numpy()):
    sorted_aids_20_str = " ".join(map(str, sorted_aids))
    data_list.append([f"{session_id}_clicks", sorted_aids_20_str])
    data_list.append([f"{session_id}_carts", sorted_aids_20_str])
    data_list.append([f"{session_id}_orders", sorted_aids_20_str])

submit_df = pd.DataFrame(data_list, columns=["session_type", "labels"])
submit_df

  0%|          | 0/1671803 [00:00<?, ?it/s]

Unnamed: 0,session_type,labels
0,12899779_clicks,59625
1,12899779_carts,59625
2,12899779_orders,59625
3,12899780_clicks,1142000 736515 582732 973453 1502122 1586171 4...
4,12899780_carts,1142000 736515 582732 973453 1502122 1586171 4...
...,...,...
5015404,14571580_carts,202353
5015405,14571580_orders,202353
5015406,14571581_clicks,1100210 219925 1547466 462056 1571699 618373 1...
5015407,14571581_carts,1100210 219925 1547466 462056 1571699 618373 1...


In [33]:
Path("../../output/exp003").mkdir(exist_ok=True)
submit_df.to_csv("../../output/exp003/sub_exp003_sampling_100.csv", index=False)