In [1]:
import os

# Restrict this process to GPU 0. Set here, ahead of the `import cudf`
# in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import cudf
import glob
import gc

import pandas as pd
import numpy as np

from tqdm import tqdm

In [3]:
# Run switches. NOTE(review): `lb`, `num_items` and `path` do not appear to be
# read by the cells visible in this notebook -- confirm before relying on them.
lb = False
# The original cell assigned 10000 and immediately overwrote it with None;
# None is the value that was actually in effect.
num_items = None
path = '../../data/'

# Lookup table: event-type name -> integer code (`type_`). It is merged onto
# the test events so that the string `type` column can be replaced by the code.
df_type = cudf.DataFrame({
    'type': ['clicks', 'carts', 'orders'],
    'type_': [1, 2, 3]
})

In [4]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools

# Multiplier applied to an event's recency weight when re-ranking a session's
# own history. Keys are the integer event-type codes from `df_type`
# (1 = clicks, 2 = carts, 3 = orders).
type_weight_multipliers = {1: 1, 2: 6, 3: 3}

def suggest_clicks(df):
    """Return up to 20 click candidates for a single session.

    Parameters
    ----------
    df : DataFrame with ``aid`` and ``type`` columns holding one session's
        events. Later rows are treated as more recent, so the caller is
        expected to pass rows in chronological order.

    Returns
    -------
    list of aids, best candidate first. When the session has fewer than 20
    distinct aids the list may be shorter than 20, because padding with the
    globally popular ``top_clicks`` is currently disabled.
    """
    history = df.aid.tolist()
    event_types = df.type.tolist()
    # Distinct aids of the session, most recent first.
    recent_unique = list(dict.fromkeys(reversed(history)))

    if len(recent_unique) >= 20:
        # Enough history: rank the session's own aids. Each event contributes
        # a recency weight (growing towards the end of the session) scaled by
        # the multiplier of its event type; repeated aids accumulate.
        recency = np.logspace(0.1, 1, len(history), base=2, endpoint=True) - 1
        scores = Counter()
        for position, aid in enumerate(history):
            scores[aid] += recency[position] * type_weight_multipliers[event_types[position]]
        return [aid for aid, _ in scores.most_common(20)]

    # Short history: collect the "clicks" co-visitation neighbours of every
    # aid in the session ...
    covisited = []
    for aid in recent_unique:
        if aid in top_20_clicks:
            covisited.extend(top_20_clicks[aid])
    # ... keep the most frequently suggested ones that are not already in the
    # history, and append them after the history itself.
    extras = [
        candidate
        for candidate, _ in Counter(covisited).most_common(20)
        if candidate not in recent_unique
    ]
    return recent_unique + extras[:20 - len(recent_unique)]

def suggest_buys(df):
    """Return up to 20 cart/order candidates for a single session.

    Parameters
    ----------
    df : DataFrame with ``aid`` and ``type`` columns holding one session's
        events. Later rows are treated as more recent, so the caller is
        expected to pass rows in chronological order.

    Returns
    -------
    list of aids, best candidate first. When the session has fewer than 20
    distinct aids the list may be shorter than 20, because padding with the
    globally popular ``top_orders`` is currently disabled.

    Notes
    -----
    Compared with the previous version this no longer builds the unused
    ``unique_clicks`` list (a DataFrame filter per session whose result was
    discarded), derives the bought aids from the already materialised lists
    instead of filtering the frame, and computes the buy2buy neighbours once
    for both branches. Returned values are unchanged.
    """
    aids = df.aid.tolist()
    types = df.type.tolist()
    # Distinct aids of the session, most recent first.
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # Distinct aids that were carted (2) or ordered (3), most recent first.
    unique_buys = list(dict.fromkeys(
        aid for aid, t in zip(reversed(aids), reversed(types)) if t in (2, 3)
    ))
    # "buy2buy" co-visitation neighbours of the bought aids (used by both branches).
    aids3 = list(itertools.chain.from_iterable(
        top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy
    ))

    if len(unique_aids) >= 20:
        # Enough history: rank the session's own aids by recency weight times
        # the event-type multiplier; repeated aids accumulate.
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        # Small bonus for every buy2buy suggestion (may also introduce new aids).
        for aid in aids3:
            aids_temp[aid] += 0.1
        return [aid for aid, _ in aids_temp.most_common(20)]

    # Short history: "cart/order" co-visitation neighbours of every session aid,
    # pooled with the buy2buy neighbours.
    aids2 = list(itertools.chain.from_iterable(
        top_20_buys[aid] for aid in unique_aids if aid in top_20_buys
    ))
    # Most frequently suggested candidates not already in the history.
    top_aids2 = [
        aid2
        for aid2, _ in Counter(aids2 + aids3).most_common(20)
        if aid2 not in unique_aids
    ]
    return unique_aids + top_aids2[:20 - len(unique_aids)]

In [5]:
# Load the test events and attach the integer event-type code from `df_type`.
df = cudf.read_parquet('../../data/test.parquet').merge(df_type, on='type', how='left')

# Replace the string `type` column with its integer code.
df = df.drop(columns=['type']).rename(columns={'type_': 'type'})

# Shrink the numeric columns to int32. `ts` is divided by 1000 before the cast
# (presumably milliseconds -> seconds; confirm against the data source).
for int_col in ('session', 'aid'):
    df[int_col] = df[int_col].astype('int32')
df['ts'] = (df['ts'] / 1000).astype('int32')

In [6]:
# Order the events by session and, within a session, newest first.
# Note: the prediction cells further down sort by ['session', 'ts'] ascending
# again before grouping, so this descending order is not what the suggest_*
# functions receive.
df = df.sort_values(['session','ts'],ascending=[True,False])
# USE TAIL OF SESSION
# (disabled) would keep only the first 11 rows per session in the order above
# df = df.reset_index(drop=True)
# df['n'] = df.groupby('session').cumcount()
# df = df.loc[df.n<11].drop('n',axis=1)

In [7]:
# The 20 most frequent aids among click events (type 1) and order events
# (type 3) of the test data.
# The original called df.to_pandas() four times, copying the whole frame from
# GPU to host each time; copy only the two needed columns, once.
events_pd = df[['aid', 'type']].to_pandas()
top_clicks = events_pd.loc[events_pd['type'] == 1, 'aid'].value_counts().index.values[:20]
top_orders = events_pd.loc[events_pd['type'] == 3, 'aid'].value_counts().index.values[:20]
del events_pd  # host copy is no longer needed

In [8]:
def pqt_to_dict(df):
    """Turn a co-visitation pair table into ``{aid_x: [aid_y, ...]}``.

    ``df`` needs the columns ``aid_x``, ``aid_y`` and ``wgt``. For every
    ``aid_x`` the returned list holds its ``aid_y`` values ordered by
    descending ``wgt``.
    """
    ranked = df.sort_values(by=['aid_x', 'wgt'], ascending=[True, False])
    neighbours_by_aid = ranked.groupby('aid_x')['aid_y'].apply(list)
    return neighbours_by_aid.to_dict()

# Load the three co-visitation tables as {aid: [neighbour aids]} dictionaries:
# clicks, carts/orders and buy2buy. Note that the variables are all prefixed
# `top_20_` although two of the source files are named `top_15_*`.
top_20_clicks, top_20_buys, top_20_buy2buy = (
    pqt_to_dict(pd.read_parquet(pqt_file))
    for pqt_file in (
        './data/sub/top_20_clicks_v3.parquet',
        './data/sub/top_15_carts_orders_v3.parquet',
        './data/sub/top_15_buy2buy_v3.parquet',
    )
)
#top_20_c2o = pqt_to_dict( pd.read_parquet('./data/top_15_c2o_v3.parquet'))

In [10]:
from pandarallel import pandarallel

# Start pandarallel with its default settings; this is what makes the
# `parallel_apply` method used by the prediction cells below available.
pandarallel.initialize()

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [12]:
%%time
pred_df_clicks = df.sort_values(["session", "ts"]).to_pandas().groupby(["session"]).parallel_apply(
    lambda x: suggest_clicks(x)
)

CPU times: user 2min 44s, sys: 18.9 s, total: 3min 3s
Wall time: 3min 20s


In [13]:
%%time

pred_df_buys = df.sort_values(["session", "ts"]).to_pandas().groupby(["session"]).parallel_apply(
    lambda x: suggest_buys(x)
)

CPU times: user 3min 5s, sys: 38.2 s, total: 3min 43s
Wall time: 5min 26s


In [14]:
# One row per session: the index labels become "<session>_clicks" and the
# candidate lists go into a `labels` column.
# to_frame(name=...) names the column explicitly. pd.DataFrame(series,
# columns=["labels"]) instead selects by the Series' own name and would
# silently produce an all-NaN column if the Series ever carried a name.
clicks_pred_df = pred_df_clicks.add_suffix("_clicks").to_frame(name="labels").reset_index()

In [15]:
# The same buy candidates are submitted for both orders and carts; only the
# suffix appended to the session id differs.
# to_frame(name=...) names the column explicitly. pd.DataFrame(series,
# columns=["labels"]) instead selects by the Series' own name and would
# silently produce an all-NaN column if the Series ever carried a name.
orders_pred_df = pred_df_buys.add_suffix("_orders").to_frame(name="labels").reset_index()
carts_pred_df = pred_df_buys.add_suffix("_carts").to_frame(name="labels").reset_index()

In [16]:
# Stack the clicks, orders and carts frames into one submission frame.
# No ignore_index is passed, so each part keeps its own 0..n-1 row labels and
# the combined index contains repeated labels.
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])

In [17]:
# Final submission layout: "<session>_<type>" plus a space-separated aid string.
pred_df.columns = ["session_type", "labels"]
# Leave values that are already strings untouched so that re-running this cell
# is harmless; joining an already-joined string would put a space between
# every single character.
pred_df["labels"] = pred_df["labels"].apply(
    lambda aids: aids if isinstance(aids, str) else " ".join(map(str, aids))
)

In [18]:
# Size check: rows = sessions x 3 suffixes (clicks, orders, carts); 2 columns.
pred_df.shape

(5015409, 2)

In [19]:
# Write the submission file; the row index is not written.
pred_df.to_csv('./submission.csv', index=False)

In [20]:
# Preview the submission frame (Jupyter shows a truncated head/tail view).
pred_df

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 438191 731692 1790770 942...
1,12899780_clicks,1142000 736515 973453 582732 1502122 889686 48...
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 7594...
3,12899782_clicks,834354 595994 740494 889671 987399 779477 1344...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...
...,...,...
1671798,14571577_carts,1141710 1276792 1004292 1666114 367734 935830 ...
1671799,14571578_carts,519105 977826 1811714 822641 1671592 815460 15...
1671800,14571579_carts,739876 1209992 1750859 1550479 785544 51363 83...
1671801,14571580_carts,202353 1314576 433425 1231403 871658 891417 92...


In [None]:
# Local validation: recall@20 per event type, combined into one weighted score.
score = 0
weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}

# The ground truth does not depend on the event type being scored, so read and
# aggregate it once instead of on every loop iteration.
# Result: one row per (session, type) with the list of true aids.
gt_all = pd.read_parquet('./data/xgb_test_y.parquet')
gt_all = gt_all[['session', 'aid', 'type']].groupby(['session', 'type']).agg(list).reset_index()

for t in ['clicks', 'carts', 'orders']:
    # Predictions for this event type: match on the "_<type>" suffix rather
    # than a regex substring search.
    sub = pred_df.loc[pred_df.session_type.str.endswith('_' + t)].copy()
    sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
    # split() without an argument yields [] for an empty label string, where
    # split(' ') yields [''] and int('') would raise.
    sub['labels'] = sub.labels.apply(lambda x: [int(i) for i in x.split()[:20]])

    test_labels = gt_all.loc[gt_all['type'] == t]
    test_labels = test_labels.merge(sub, how='left', on=['session'])
    # After the left merge a session without a prediction has NaN labels;
    # count it as zero hits instead of failing inside set().
    test_labels['hits'] = test_labels.apply(
        lambda row: len(set(row.aid).intersection(row.labels)) if isinstance(row.labels, list) else 0,
        axis=1,
    )
    # At most 20 aids can be recalled per session, so cap the denominator.
    test_labels['gt_count'] = test_labels.aid.str.len().clip(0, 20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    score += weights[t] * recall
    print(f'{t} recall =', recall)

# The weighted total was accumulated but never reported before.
print('overall score =', score)