# exp002
- ref: [Co-occurrence baseline](https://www.kaggle.com/code/whitelily/co-occurrence-baseline)

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from collections import Counter, defaultdict
from tqdm.auto import tqdm
import pickle

In [2]:
# Root of the input data; get_input_data() globs "<phase>_parquet/*.parquet" beneath it.
input_dir = Path("../../input")

def get_input_data(input_dir: Path, phase: str):
    """Load every parquet shard of one phase into a single DataFrame.

    Args:
        input_dir: Directory that contains the ``{phase}_parquet`` folder.
        phase: Prefix of the shard folder (called with "train" and "test").

    Returns:
        The shards concatenated row-wise in sorted path order. ``ignore_index``
        is not set, so each shard keeps its own index values.

    Raises:
        ValueError: Raised by ``pd.concat`` when no shard matches the pattern.
    """
    shard_paths = sorted(input_dir.glob(f"{phase}_parquet/*.parquet"))
    shards = [pd.read_parquet(shard_path) for shard_path in shard_paths]
    return pd.concat(shards, axis=0)

# Load both phases with the same loader (train first, then test).
train_df, test_df = (get_input_data(input_dir, phase) for phase in ("train", "test"))

In [3]:
# Keep one row per distinct (session, aid) pair; only these two columns are
# needed for co-occurrence counting.
removed_dup_train_df = (
    train_df[["session", "aid"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
removed_dup_test_df = (
    test_df[["session", "aid"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Distinct aids that occur anywhere in the test data.
test_aids = removed_dup_test_df["aid"].unique()
aid_set_test = set(test_aids)

In [30]:
# Toy example on the first 2000 (session, aid) rows: an aid x session table.
# The pairs were de-duplicated above, so every cell is 0 or 1.
df = removed_dup_train_df.head(2000)
cross = pd.crosstab(index=df["aid"], columns=df["session"])
cross

session,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1303,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2512,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3999,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4203,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1853641,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1853968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1854655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# aid x aid co-occurrence matrix: entry (a, b) counts the sessions of the sample
# that contain both a and b (the diagonal counts the sessions containing a).
cross @ cross.T

aid,1303,2137,2512,3999,4203,6026,6851,7563,8268,9268,...,1847114,1847290,1847710,1848052,1852696,1853288,1853641,1853968,1854655,1855215
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1303,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2137,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2512,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3999,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4203,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1853288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1853641,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1853968,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
1854655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [17]:
# make co-occur dict
# train_co_occur_count[a] maps every test aid `a` to its co-occurring aids,
# counted over train sessions that contain both. After the final loop each
# value is a list of at most `top_k` (aid, count) tuples, highest count first.
size = 1000             # sessions per chunk; keeps each self-merge small
session_limit = 200000  # only sessions with id in [0, session_limit) are scanned
top_k = 20              # neighbours kept per aid
train_co_occur_count = defaultdict(lambda: defaultdict(int))

train_sessions = removed_dup_train_df["session"]  # loop-invariant, hoisted

for i in tqdm(range(0, session_limit, size)):
    # Rows belonging to sessions i <= session < i + size.
    in_chunk = (i <= train_sessions) & (train_sessions < i + size)
    partial_train = removed_dup_train_df[in_chunk]

    # Self-join on session: one row per ordered (aid_x, aid_y) pair of a
    # session, i.e. both orderings of every pair plus the aid_x == aid_y rows.
    temp = pd.merge(partial_train, partial_train, on=["session"])

    # For memory problem, only set value for aid in testdata
    x_in_test = temp["aid_x"].isin(aid_set_test)
    y_in_test = temp["aid_y"].isin(aid_set_test)
    temp = temp[x_in_test | y_in_test]

    # Name the count column explicitly: the default name produced by
    # value_counts().reset_index() is 0 on pandas < 2.0 but "count" on
    # pandas >= 2.0, so indexing with temp[0] breaks on newer versions.
    temp = temp[["aid_x", "aid_y"]].value_counts().reset_index(name="co_count")

    # NOTE(review): the merge already yields both (a, b) and (b, a), and each
    # row is credited in both directions below, so every stored count is twice
    # the number of shared sessions. Self-pairs are counted as well, so an aid
    # is normally its own top neighbour. Ranking among the other aids is not
    # affected; left unchanged to keep the saved values identical - confirm
    # whether consumers expect this.
    for x, y, co_count in zip(
        temp["aid_x"].to_numpy(),
        temp["aid_y"].to_numpy(),
        temp["co_count"].to_numpy(),
    ):
        if x in aid_set_test:
            train_co_occur_count[x][y] += co_count
        if y in aid_set_test:
            train_co_occur_count[y][x] += co_count

# Replace each inner dict by its top_k (aid, count) pairs, highest count first.
# Test aids that never appeared in the scanned sessions get an empty list.
for aid in aid_set_test:
    ranked = sorted(train_co_occur_count[aid].items(), key=lambda pair: -pair[1])
    train_co_occur_count[aid] = ranked[:top_k]

  0%|          | 0/200 [00:00<?, ?it/s]

In [21]:
# Keep only the 20 strongest neighbours per test aid, highest count first.
# The value may still be a dict {aid: count} or - if this truncation has
# already been applied - a list of (aid, count) tuples. Handling both makes the
# cell safe to run after the co-occurrence cell and safe to re-run, instead of
# failing with "'list' object has no attribute 'items'".
for aid in aid_set_test:
    neighbours = train_co_occur_count[aid]
    pairs = neighbours.items() if isinstance(neighbours, dict) else neighbours
    train_co_occur_count[aid] = sorted(pairs, key=lambda pair: -pair[1])[:20]

In [None]:
# Persist the co-occurrence table for later experiments.
output_path = Path('../../output/exp002/train_co_occur_count.pickle')
# Create the experiment folder on first use so the dump cannot fail with
# FileNotFoundError after the long counting step.
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('wb') as fi:
    # dict(...) drops the outer defaultdict, whose lambda default factory
    # cannot be pickled.
    pickle.dump(dict(train_co_occur_count), fi)