In [1]:
import pandas as pd
import numpy as np
from flask import session

train_df = pd.read_parquet("../data/train.parquet")
test_df = pd.read_parquet("../data/test.parquet")
print(train_df.shape)
print(test_df.shape)

(216716096, 4)
(6928123, 4)


In [2]:
train_df

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0
...,...,...,...,...
216716091,12899776,1737908,1661723987,0
216716092,12899777,384045,1661723976,0
216716093,12899777,384045,1661723986,0
216716094,12899778,561560,1661723983,0


In [3]:
fraction_of_data = 1
if fraction_of_data < 1 :
    selected_session_train = train_df.drop_duplicates(["session"]).sample(frac=fraction_of_data,random_state=42)['session']
    sampled_train_df = train_df[train_df['session'].isin(selected_session_train)]
    selected_session_test = test_df.drop_duplicates(["session"]).sample(frac=fraction_of_data,random_state=42)['session']
    sampled_test_df = test_df[test_df['session'].isin(selected_session_test)]
else :
    sampled_train_df = train_df
    sampled_test_df = test_df


In [4]:
from collections import defaultdict,Counter

next_aids = defaultdict(Counter)

In [5]:
from tqdm import tqdm
all_df = pd.concat([sampled_train_df,sampled_test_df])
chunk_size = 300000
total = len(all_df)
for i in tqdm(range(0,total,chunk_size)):
    chunk = all_df[i:min(i+chunk_size,total)]
    chunk = chunk.groupby("session").nth(list(range(-30,0)))
    chunk = chunk.merge(chunk,on="session")
    chunk = chunk[chunk['aid_x'] != chunk['aid_y']]
    chunk['time_span'] = (chunk['ts_y'] - chunk['ts_x']) / (24*60*60)
    chunk = chunk[(chunk['time_span'] >= 0) & (chunk['time_span'] <= 1)]
    for aid_x, aid_y in zip(chunk['aid_x'],chunk['aid_y']):
        next_aids[aid_x][aid_y] += 1


100%|██████████| 746/746 [10:20<00:00,  1.20it/s]


In [13]:
test_session_aids = test_df.groupby("session")['aid'].apply(list)
test_session_types = test_df.groupby("session")['type'].apply(list)

In [7]:
test_session_types

session
12899779                                                  [0]
12899780                                      [0, 0, 0, 0, 0]
12899781                    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
12899782    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
12899783                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                                  ...                        
14571577                                                  [0]
14571578                                                  [0]
14571579                                                  [0]
14571580                                                  [0]
14571581                                                  [0]
Name: type, Length: 1671803, dtype: object

In [8]:
labels = []
for aids , types in tqdm(zip(test_session_aids,test_session_types),total=len(test_session_aids)) :
    if len(aids) < 20 :
        candidates = []
        for aid in aids:
            if aid in next_aids :
                candidates += [key for key,val in next_aids[aid].most_common(20)]
        aids += [key for key , val in Counter(candidates).most_common(40) if key not in aids]
        labels.append(aids[:20])
    else :
        weights = np.logspace(0.1,1,base=2,endpoint=True)
        type_weights = {0 : 1, 1 : 6 , 2 : 3}
        scores = defaultdict(int)
        for aid, w, t in zip(aids,weights,types):
            scores[aid] += w * type_weights[t]
        candidates = [k for k , val in sorted(scores.items(),key=lambda x : - x[1])]
        labels.append(candidates[:20])



100%|██████████| 1671803/1671803 [37:25<00:00, 744.54it/s]  


In [9]:
predictions = pd.DataFrame(data={"session_type" : test_session_aids.index, "labels" : labels})
session_types = ['clicks','carts','orders']
submission = []
for session_type in tqdm(session_types) :
    cur_sub = predictions.copy()
    cur_sub['session_type'] = predictions['session_type'].astype(str) + f'_{session_type}'
    submission.append(cur_sub)
submission = pd.concat(submission).set_index('session_type')
submission

100%|██████████| 3/3 [00:01<00:00,  2.27it/s]


Unnamed: 0_level_0,labels
session_type,Unnamed: 1_level_1
12899779_clicks,"[59625, 894169, 737445, 1246235, 499621, 46928..."
12899780_clicks,"[1142000, 582732, 973453, 736515, 1142000, 889..."
12899781_clicks,"[141736, 199008, 57315, 194067, 199008, 199008..."
12899782_clicks,"[779477, 834354, 889671, 476063, 975116, 59599..."
12899783_clicks,"[255297, 1114789, 255297, 300127, 198385, 3001..."
...,...
14571577_orders,"[1141710, 1276792, 1666114, 367734, 1004292, 8..."
14571578_orders,"[519105, 977826, 1811714, 815460, 735459, 8226..."
14571579_orders,"[739876, 1209992, 1550479, 1750859, 785544, 14..."
14571580_orders,"[202353, 925638, 1231403, 871658, 721096, 8882..."


In [10]:
submission = submission['labels'].apply(lambda x : ' '.join(map(str,x)))

In [11]:
submission

session_type
12899779_clicks    59625 894169 737445 1246235 499621 469285 9415...
12899780_clicks    1142000 582732 973453 736515 1142000 889686 48...
12899781_clicks    141736 199008 57315 194067 199008 199008 19900...
12899782_clicks    779477 834354 889671 476063 975116 595994 1344...
12899783_clicks    255297 1114789 255297 300127 198385 300127 172...
                                         ...                        
14571577_orders    1141710 1276792 1666114 367734 1004292 86916 8...
14571578_orders    519105 977826 1811714 815460 735459 822641 167...
14571579_orders    739876 1209992 1550479 1750859 785544 1434015 ...
14571580_orders    202353 925638 1231403 871658 721096 888228 433...
14571581_orders    1100210 1684953 622489 1158237 462056 1571699 ...
Name: labels, Length: 5015409, dtype: object

In [12]:
submission.to_csv("./submission.csv")