In [1]:
import pandas as pd
# NOTE(review): `subset` and `random_state` are not referenced by any cell visible
# here; the two imports below look like accidental IDE auto-imports (one needs
# fontTools installed, the other reaches into a private pandas module).
# Confirm they are unused elsewhere and drop them.
from fontTools.subset import subset
from pandas.core.common import random_state

# Load the train and test splits, then stack them into one combined frame.
train_df, test_df = map(pd.read_parquet, ('../data/train.parquet', '../data/test.parquet'))
all_df = pd.concat((train_df, test_df))

In [2]:
import numpy as np

# Fraction of distinct training sessions to keep; 1 means "use everything".
fraction_of_sessions_to_use = 1

if fraction_of_sessions_to_use == 1:
    # No sub-sampling: both names are plain aliases of the full frames (no copy),
    # so later in-place changes to the subsets also affect train_df / test_df.
    subset_of_train = train_df
    subset_of_test = test_df
else:
    # Draw a reproducible sample of distinct session ids and keep all their rows.
    distinct_train_sessions = train_df.drop_duplicates(['session'])
    lucky_sessions_train = distinct_train_sessions.sample(
        frac=fraction_of_sessions_to_use,
        random_state=42,
    )['session']
    subset_of_train = train_df[train_df.session.isin(lucky_sessions_train)]

    # Test sessions are not sampled: every distinct test session id is kept.
    lucky_sessions_test = test_df.drop_duplicates(['session'])['session']
    subset_of_test = test_df[test_df.session.isin(lucky_sessions_test)]

subset_of_train.head()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0


In [3]:
# Index the training rows by session id (a one-level MultiIndex built from the
# 'session' column) so that chunks can later be cut out with label slices.
# The 'session' column itself is kept.
train_session_index = pd.MultiIndex.from_frame(subset_of_train[['session']])
subset_of_train.index = train_session_index

In [4]:
# Same indexing as for the training rows: a one-level MultiIndex built from the
# 'session' column, with the column itself left in place.
test_session_index = pd.MultiIndex.from_frame(subset_of_test[['session']])
subset_of_test.index = test_session_index

In [5]:
# Number of sessions handled per iteration when the pair counts are built.
chunk_size = 30000
# Smallest and largest timestamp found in the training data.
min_ts = train_df['ts'].min()
max_ts = train_df['ts'].max()

from collections import defaultdict,Counter

# Maps an aid to a Counter of the aids that were seen together with it.
next_AIDs = defaultdict(Counter)
# Train and test rows stacked together; both contribute to the pair counts.
subsets = pd.concat((subset_of_train, subset_of_test))
# Distinct session ids in order of first appearance.
sessions = subsets['session'].unique()
sessions

array([       0,        1,        2, ..., 14571579, 14571580, 14571581],
      dtype=int32)

In [6]:
# Eyeball the combined train+test frame (the notebook shows a truncated preview).
subsets

Unnamed: 0_level_0,session,aid,ts,type
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,1517085,1659304800,0
0,0,1563459,1659304904,0
0,0,1309446,1659367439,0
0,0,16246,1659367719,0
0,0,1781822,1659367871,0
...,...,...,...,...
14571577,14571577,1141710,1662328774,0
14571578,14571578,519105,1662328775,0
14571579,14571579,739876,1662328775,0
14571580,14571580,202353,1662328781,0


In [10]:
from tqdm import tqdm

# Build the co-visitation counts chunk by chunk so the per-session self-join
# never has to hold more than `chunk_size` sessions at once.
SECONDS_PER_DAY = 24 * 60 * 60
n_sessions = sessions.shape[0]

for i in tqdm(range(0, n_sessions, chunk_size)):
    first_session = sessions[i]
    last_session = sessions[min(n_sessions - 1, i + chunk_size - 1)]
    # Label-based slice on the session index; both end points are inclusive.
    current_chunk = subsets.loc[first_session:last_session].reset_index(drop=True)
    # Keep only the last 30 rows of every session to bound the size of the self-join.
    current_chunk = current_chunk.groupby('session', as_index=False).nth(list(range(-30, 0))).reset_index(drop=True)

    # Self-join on session: every ordered pair of rows that share a session.
    pairs = current_chunk.merge(current_chunk, on='session')
    pairs = pairs[pairs['aid_x'] != pairs['aid_y']]
    # `.assign` returns a new frame, so the column is never written onto a
    # filtered slice (the original assignment did exactly that, which is the
    # pattern pandas flags with SettingWithCopyWarning).
    pairs = pairs.assign(days_elapsed=(pairs.ts_y - pairs.ts_x) / SECONDS_PER_DAY)
    # Keep pairs where the second row is not earlier than the first and is at
    # most one day later.
    consecutive_AIDs = pairs[(pairs.days_elapsed >= 0) & (pairs.days_elapsed <= 1)]

    for aid_x, aid_y in zip(consecutive_AIDs['aid_x'], consecutive_AIDs['aid_y']):
        next_AIDs[aid_x][aid_y] += 1

# Drop the names bound to the large training/combined frames; they are not used
# after this point in the visible cells.
# NOTE: this makes the cell fail on re-run without re-creating those names first.
del train_df
del subset_of_train
del subsets

session_type = ['clicks','carts','orders']

# Reset to a plain positional index before grouping: test_df may carry the
# 'session' index level assigned earlier (when subset_of_test aliases it), which
# would clash with the 'session' column in groupby.
test_events_by_session = test_df.reset_index(drop=True).groupby('session')
# One list per test session, in row order: the aids and the matching event types.
test_session_AIDs = test_events_by_session['aid'].apply(list)
test_session_types = test_events_by_session['type'].apply(list)

test_session_AIDs

100%|██████████| 486/486 [13:07<00:00,  1.62s/it]


session
12899779                                              [59625]
12899780           [1142000, 582732, 973453, 736515, 1142000]
12899781    [141736, 199008, 57315, 194067, 199008, 199008...
12899782    [1669402, 1494780, 1494780, 1494780, 1494780, ...
12899783    [255297, 1114789, 255297, 300127, 198385, 3001...
                                  ...                        
14571577                                            [1141710]
14571578                                             [519105]
14571579                                             [739876]
14571580                                             [202353]
14571581                                            [1100210]
Name: aid, Length: 1671803, dtype: object

In [11]:
from tqdm import tqdm
# One list of up to 20 predicted aids per test session, in test_session_AIDs order.
labels = []
# Sessions for which the co-visitation counts produced no candidate at all.
no_data = 0
# Sessions whose aid list could not be extended with any new aid.
no_data_all_aids = 0
# Per-event-type multiplier applied on top of the recency weight.
type_weight_multipliers = {0:1,1:6,2:3}

for AIDs, types in tqdm(zip(test_session_AIDs, test_session_types), total=len(test_session_types)):
    if len(AIDs) < 20:
        # Short session: start from its own aids, most recent first and
        # de-duplicated, then top up from the co-visitation counts.
        AIDs = list(dict.fromkeys(reversed(AIDs)))
        AIDs_len_start = len(AIDs)

        candidates = []
        for AID in AIDs:
            if AID in next_AIDs:
                candidates.extend([aid for aid, count in next_AIDs[AID].most_common(20)])

        # Rank candidates by how many of the session's aids proposed them and
        # append those that are not already part of the session.
        new_aids = [cand for cand, cnt in Counter(candidates).most_common(40) if cand not in AIDs]
        AIDs.extend(new_aids)

        labels.append(AIDs[:20])
        if not candidates:
            no_data += 1
        if len(AIDs) == AIDs_len_start:
            no_data_all_aids += 1
    else:
        # Long session (20+ events): rank its own aids by a recency weight that
        # grows along the session, scaled by the event-type multiplier.
        weights = np.logspace(0.1, 1, len(AIDs), base=2, endpoint=True)
        aids_temp = defaultdict(int)
        for aid, w, t in zip(AIDs, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        # Stable sort, highest score first; ties keep first-seen order.
        sorted_aids = sorted(aids_temp, key=lambda scored_aid: -aids_temp[scored_aid])
        labels.append(sorted_aids[:20])


100%|██████████| 1671803/1671803 [33:20<00:00, 835.65it/s] 


In [15]:
# >>> outputting results to CSV

# Each session's predicted aids become one space-separated string.
labels_as_strings = [' '.join(str(aid) for aid in session_labels) for session_labels in labels]

predictions = pd.DataFrame(data={'session_type': test_session_AIDs.index, 'labels': labels_as_strings})

prediction_dfs = []

# The same predictions are emitted once per entry in session_type, with the
# key column rewritten to "<session>_<type>".
for st in tqdm(session_type):
    modified_predictions = predictions.copy()
    modified_predictions.session_type = modified_predictions.session_type.astype('str') + f'_{st}'
    prediction_dfs.append(modified_predictions)

submission = pd.concat(prediction_dfs).reset_index(drop=True)
# The 'labels' column already holds space-separated strings at this point.
# Joining it a second time would iterate over the characters of each string
# ("59625" -> "5 9 6 2 5") and corrupt every row, so no further conversion is done.
submission.to_csv('./submission.csv', index=False)

print(f'Test sessions that we did not manage to extend based on the co-visitation matrix: {no_data_all_aids}')

KeyboardInterrupt: 

In [13]:
# Sanity check: the submission should hold one row per test session for each
# entry in session_type.
len(submission)

5015409