In [1]:
VER = 1
import pandas as pd, numpy as np
import pickle, glob, gc

from collections import Counter
import itertools

# multiprocessing 
import psutil
N_CORES = psutil.cpu_count()     # Available CPU cores
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool

N Cores : 8


In [2]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_test(files):    
    dfs = []
    for e, chunk_file in enumerate(glob.glob(files)):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

valid = load_test('/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-validation/test_parquet/*')
print('Valid data has shape',valid.shape)
valid.head()


Valid data has shape (7683577, 4)


Unnamed: 0,session,aid,ts,type
0,11098528,11830,1661119200,0
1,11098529,1105029,1661119200,0
2,11098530,264500,1661119200,0
3,11098530,264500,1661119288,0
4,11098530,409236,1661119369,0


In [3]:
%%time

DISK_PIECES = 4
# LOAD THREE CO-VISITATION MATRICES
def pqt_to_dict(df):
    return df.groupby('aid_x').aid_y.apply(list).to_dict()

top_20_clicks = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_20_valid_clicks_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_clicks.update( pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_20_valid_clicks_v{VER}_{k}.pqt') ) )


top_20_buys = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_15_valid_carts_orders_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_buys.update( pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_15_valid_carts_orders_v{VER}_{k}.pqt') ) )
    
top_20_buy2buy = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_15_valid_buy2buy_v{VER}_0.pqt') )

# TOP CLICKS AND ORDERS IN TEST
top_clicks = valid.loc[valid['type']==0, 'aid'].value_counts().index.values[:20]
top_orders = valid.loc[valid['type']==2, 'aid'].value_counts().index.values[:20]

print('Here are size of our 3 co-visitation matrices:')
print( len( top_20_clicks ), len( top_20_buy2buy ), len( top_20_buys ) )

Here are size of our 3 co-visitation matrices:
1812132 1055146 1812132
CPU times: user 1min 17s, sys: 3.8 s, total: 1min 21s
Wall time: 1min 39s


In [4]:
def df_parallelize_run(func, t_split):
    
    num_cores = np.min([N_CORES, len(t_split)])
    pool = Pool(num_cores)
    df = pool.map(func, t_split)
    pool.close()
    pool.join()
    
    return df

In [6]:
%%time
PIECES = 5
valid_bysession_list = []
for PART in range(PIECES):
    with open(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-valid-test-list/valid_group_tolist_{PART}_{VER}.pkl', 'rb') as f:
        valid_bysession_list.extend(pickle.load(f))
print(len(valid_bysession_list))

1801251
CPU times: user 7.52 s, sys: 473 ms, total: 7.99 s
Wall time: 13.1 s


In [7]:
#type_weight_multipliers = {'clicks': 1, 'carts': 6, 'orders': 3}
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

def suggest_clicks(df):
    
    session = df[0]
    aids = df[1]
    types = df[2]
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids)>=20:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        return session, sorted_aids
    
    # USE "CLICKS" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2).most_common(20) if aid2 not in unique_aids]    
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    
    # USE TOP20 TEST CLICKS
    return session, result + list(top_clicks)[:20-len(result)]

In [8]:
%%time

# Predict on all sessions in parallel
temp = df_parallelize_run(suggest_clicks, valid_bysession_list)
val_clicks = pd.Series([f[1]  for f in temp], index=[f[0] for f in temp])

CPU times: user 20.4 s, sys: 3.04 s, total: 23.4 s
Wall time: 27.2 s


In [9]:
def suggest_buys(df):
    # USE USER HISTORY AIDS AND TYPES
    session = df[0]
    aids = df[1]
    types = df[2]

    unique_aids = list(dict.fromkeys(aids[::-1] ))
    unique_buys = list(dict.fromkeys( [f for i, f in enumerate(aids) if types[i] in [1, 2]][::-1] ))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids)>=20:
        
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
        for aid in aids3: aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(20)]
        return session, sorted_aids
            
    # USE "CART ORDER" CO-VISITATION MATRIX
    aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in unique_aids if aid in top_20_buys]))
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2 + aids3).most_common(20) if aid2 not in unique_aids] 
    result = unique_aids + top_aids2[:20 - len(unique_aids)]
    # USE TOP20 TEST ORDERS
    return session, result + list(top_orders)[:20-len(result)]

In [10]:
%%time

# Predict on all sessions in parallel
temp = df_parallelize_run(suggest_buys, valid_bysession_list)
val_buys = pd.Series([f[1]  for f in temp], index=[f[0] for f in temp])

CPU times: user 24.2 s, sys: 3.95 s, total: 28.1 s
Wall time: 38.4 s


In [13]:
%%time
valid_labels = pd.read_parquet('/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-validation/test_labels.parquet')

CPU times: user 624 ms, sys: 334 ms, total: 957 ms
Wall time: 3.65 s


In [15]:
benchmark = {"clicks":0.5255597442145808, "carts":0.4093328152483512, "orders":0.6487936598117477, "all":.5646320148830121}
weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}

valid_labels = pd.read_parquet('/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-validation/test_labels.parquet')


def hits(b):
    # b[0] : session id
    # b[1] : ground truth
    # b[2] : aids prediction 
    return b[0], len(set(b[1]).intersection(set(b[2]))), np.clip(len(b[1]), 0, 20)

def otto_metric_piece(values, typ, verbose=True):
    
    c1 = pd.DataFrame(values, columns=["labels"]).reset_index().rename({"index":"session"}, axis=1)
    a = valid_labels.loc[valid_labels['type']==typ].merge(c1, how='left', on=['session'])

    b=[[a0, a1, a2] for a0, a1, a2 in zip(a["session"], a["ground_truth"], a["labels"])]
    c = df_parallelize_run(hits, b)
    c = np.array(c)
    
    recall = c[:,1].sum() / c[:,2].sum()
    
    print('{} recall = {:.5f} (vs {:.5f} in benchmark)'.format(typ ,recall, benchmark[typ]))
    
    return recall

def otto_metric(clicks, carts, orders, verbose = True):
    
    score = 0
    score += weights["clicks"] * otto_metric_piece(clicks, "clicks", verbose = verbose)
    score += weights["carts"] * otto_metric_piece(carts, "carts", verbose = verbose)
    score += weights["orders"] * otto_metric_piece(orders, "orders", verbose = verbose)
    
    if verbose:
        print('=============')
        print('Overall Recall = {:.5f} (vs {:.5f} in benchmark)'.format(score, benchmark["all"]))
        print('=============')
    
    return score

In [16]:
%%time
_ = otto_metric_piece(val_buys, "orders")

orders recall = 0.64896 (vs 0.64879 in benchmark)
CPU times: user 3.15 s, sys: 2.22 s, total: 5.38 s
Wall time: 5.21 s


In [17]:
%%time
_ = otto_metric_piece(val_buys, "carts")

carts recall = 0.40953 (vs 0.40933 in benchmark)
CPU times: user 5.46 s, sys: 3.16 s, total: 8.63 s
Wall time: 8.27 s


In [18]:
%%time
_ = otto_metric(val_clicks, val_buys, val_buys)

clicks recall = 0.52557 (vs 0.52556 in benchmark)
carts recall = 0.40953 (vs 0.40933 in benchmark)
orders recall = 0.64896 (vs 0.64879 in benchmark)
Overall Recall = 0.56479 (vs 0.56463 in benchmark)
CPU times: user 31.3 s, sys: 9.12 s, total: 40.4 s
Wall time: 40.2 s


In [19]:
%%time

# Two columns difference
# in each column, aids
# if aids are the same, columns difference gives 0
# and then, count zeroes
def comp(a, c):
    n = a - c
    return n[n==0].shape[0]

for typ in ["carts", "orders"]:

    # Session number of all sessions candidates
    session_list = np.array([ f[0] for f in temp ], dtype = np.int32).flatten()
    
    # candidates for each session, filled with some -1 to complete lines to have a rectangular matrix
    preds = np.array([ f[1] + [-1]*(20-len(f[1] )) for f in temp ], dtype = np.int32)
    # keep only candidates for sessions which are in ground truth sessions
    preds = preds[np.isin(session_list, valid_labels.loc[(valid_labels['type'] == typ), ["session"]].values.flatten())] 
    
    # Ground Truth
    gtv = valid_labels.loc[valid_labels['type'] == typ].ground_truth.values

    # How many actions on each session ground truth ?
    gtv_lens = np.array([len(e) for e in list(gtv)])
    gtv_lens.max()
    
    # Main loop
    cpt = 0
    # for each column in ground truth
    for i in range(gtv_lens.max()):
        y_val = np.array([e[i] for e in list(gtv[gtv_lens > i])])
        # for each column in candidates
        for j in range(20):
            # number of matching candidates with ground_truth
            cpt += comp(preds[gtv_lens > i, j], y_val)
    
    print("{} recall {:.5f} (versus benchmark {:.5f})".format(typ, cpt / gtv_lens.sum(), benchmark[typ]))


carts recall 0.40764 (versus benchmark 0.40933)
orders recall 0.64833 (versus benchmark 0.64879)
CPU times: user 19.2 s, sys: 846 ms, total: 20 s
Wall time: 19.8 s


In [20]:
typ = "carts"
# Session number of all sessions candidates
session_list = np.array([ f[0] for f in temp ], dtype = np.int32).flatten()
    
# candidates for each session, filled with some -1 to complete lines to have a rectangular matrix
preds = np.array([ f[1] + [-1]*(20-len(f[1] )) for f in temp ], dtype = np.int32)

In [21]:
%%time
# keep only candidates for sessions which are in ground truth sessions
preds = preds[np.isin(session_list, valid_labels.loc[(valid_labels['type'] == typ), ["session"]].values.flatten())] 
    
# Ground Truth
gtv = valid_labels.loc[valid_labels['type'] == typ].ground_truth.values

# How many actions on each session ground truth ?
gtv_lens = np.array([len(e) for e in list(gtv)])
gtv_lens.max()
    
# Main loop
cpt = 0
# for each column in ground truth
for i in range(gtv_lens.max()):
    y_val = np.array([e[i] for e in list(gtv[gtv_lens > i])])
    # for each column in candidates
    for j in range(20):
        # number of matching candidates with ground_truth
        cpt += comp(preds[gtv_lens > i, j], y_val)
    
print("{} recall {:.5f} (versus benchmark {:.5f})".format(typ, cpt / gtv_lens.sum(), benchmark[typ]))

carts recall 0.40764 (versus benchmark 0.40933)
CPU times: user 2.51 s, sys: 18.5 ms, total: 2.53 s
Wall time: 2.52 s


In [22]:
typ = "orders"
# Session number of all sessions candidates
session_list = np.array([ f[0] for f in temp ], dtype = np.int32).flatten()
    
# candidates for each session, filled with some -1 to complete lines to have a rectangular matrix
preds = np.array([ f[1] + [-1]*(20-len(f[1] )) for f in temp ], dtype = np.int32)

In [23]:
%%time
# keep only candidates for sessions which are in ground truth sessions
preds = preds[np.isin(session_list, valid_labels.loc[(valid_labels['type'] == typ), ["session"]].values.flatten())] 
    
# Ground Truth
gtv = valid_labels.loc[valid_labels['type'] == typ].ground_truth.values

# How many actions on each session ground truth ?
gtv_lens = np.array([len(e) for e in list(gtv)])
gtv_lens.max()
    
# Main loop
cpt = 0
# for each column in ground truth
for i in range(gtv_lens.max()):
    y_val = np.array([e[i] for e in list(gtv[gtv_lens > i])])
    # for each column in candidates
    for j in range(20):
        # number of matching candidates with ground_truth
        cpt += comp(preds[gtv_lens > i, j], y_val)
    
print("{} recall {:.5f} (versus benchmark {:.5f})".format(typ, cpt / gtv_lens.sum(), benchmark[typ]))

orders recall 0.64833 (versus benchmark 0.64879)
CPU times: user 1.02 s, sys: 6.99 ms, total: 1.03 s
Wall time: 1.02 s


In [24]:
temp = df_parallelize_run(suggest_clicks, valid_bysession_list)
val_clicks = pd.Series([f[1]  for f in temp], index=[f[0] for f in temp])


In [25]:
%%time
for typ in ["clicks"]:

    # Session number of all sessions candidates
    session_list = np.array([ f[0] for f in temp ], dtype = np.int32).flatten()
    
    # candidates for each session, filled with some -1 to complete lines to have a rectangular matrix
    preds = np.array([ f[1] + [-1]*(20-len(f[1] )) for f in temp ], dtype = np.int32)
    # keep only candidates for sessions which are in ground truth sessions
    preds = preds[np.isin(session_list, valid_labels.loc[(valid_labels['type'] == typ), ["session"]].values.flatten())] 
    
    # Ground Truth
    gtv = valid_labels.loc[valid_labels['type'] == typ].ground_truth.values

    # How many actions on each session ground truth ?
    gtv_lens = np.array([len(e) for e in list(gtv)])
    gtv_lens.max()
    
    # Main loop
    cpt = 0
    # for each column in ground truth
    for i in range(gtv_lens.max()):
        y_val = np.array([e[i] for e in list(gtv[gtv_lens > i])])
        # for each column in candidates
        for j in range(20):
            # number of matching candidates with ground_truth
            cpt += comp(preds[gtv_lens > i, j], y_val)
    
    print("{} recall {:.5f} (versus benchmark {:.5f})".format(typ, cpt / gtv_lens.sum(), benchmark[typ]))

clicks recall 0.52557 (versus benchmark 0.52556)
CPU times: user 6.89 s, sys: 194 ms, total: 7.09 s
Wall time: 7.03 s


In [26]:
del temp
_ = gc.collect()

In [27]:
# FREE MEMORY
del valid_bysession_list, val_clicks, val_buys
del top_20_clicks, top_20_buy2buy, top_20_buys, top_clicks, top_orders, valid
_ = gc.collect()

In [28]:
test = load_test('/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-chunk-data-inparquet-format/test_parquet/*')
print('Test data has shape',test.shape)
test.head()

Test data has shape (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0


In [29]:
%%time

top_20_clicks = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_20_test_clicks_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_clicks.update( pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_20_test_clicks_v{VER}_{k}.pqt') ) )


top_20_buys = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_15_test_carts_orders_v{VER}_0.pqt') )
for k in range(1, DISK_PIECES): 
    top_20_buys.update( pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_15_test_carts_orders_v{VER}_{k}.pqt') ) )
    
top_20_buy2buy = pqt_to_dict( pd.read_parquet(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-co-visitation-matrices/top_15_test_buy2buy_v{VER}_0.pqt') )

# TOP CLICKS AND ORDERS IN TEST
top_clicks = test.loc[test['type']==0, 'aid'].value_counts().index.values[:20]
top_orders = test.loc[test['type']==2, 'aid'].value_counts().index.values[:20]

print('Here are size of our 3 co-visitation matrices:')
print( len( top_20_clicks ), len( top_20_buy2buy ), len( top_20_buys ) )

Here are size of our 3 co-visitation matrices:
1837166 1168768 1837166
CPU times: user 1min 16s, sys: 4.97 s, total: 1min 21s
Wall time: 1min 42s


In [30]:
%%time
PIECES = 5
test_bysession_list = []
for PART in range(PIECES):
    with open(f'/content/drive/MyDrive/OTTO/OTTO_Fast_Handcrafted_model/input/otto-valid-test-list/test_group_tolist_{PART}_{VER}.pkl', 'rb') as f:
        test_bysession_list.extend(pickle.load(f))
print(len(test_bysession_list))

1671803
CPU times: user 7.34 s, sys: 570 ms, total: 7.91 s
Wall time: 11.2 s


In [31]:
%%time

# Predict on all sessions in parallel
temp = df_parallelize_run(suggest_clicks, test_bysession_list)
clicks_pred_df = pd.Series([f[1] for f in temp], index=[f[0] for f in temp])
clicks_pred_df = clicks_pred_df.add_suffix("_clicks")
clicks_pred_df.head()

CPU times: user 20.1 s, sys: 3.96 s, total: 24.1 s
Wall time: 31.1 s


12899779_clicks    [59625, 1253524, 737445, 438191, 731692, 17907...
12899780_clicks    [1142000, 736515, 973453, 582732, 1502122, 889...
12899781_clicks    [918667, 199008, 194067, 57315, 141736, 146057...
12899782_clicks    [834354, 595994, 740494, 889671, 987399, 77947...
12899783_clicks    [1817895, 607638, 1754419, 1216820, 1729553, 3...
dtype: object

In [32]:
%%time

# Predict on all sessions in parallel
temp = df_parallelize_run(suggest_buys, test_bysession_list)
buys_pred_df = pd.Series([f[1] for f in temp], index=[f[0] for f in temp])
orders_pred_df = buys_pred_df.add_suffix("_orders")
carts_pred_df = buys_pred_df.add_suffix("_carts")

CPU times: user 26 s, sys: 5.17 s, total: 31.2 s
Wall time: 38 s


In [34]:
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df]).reset_index()
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv("/content/drive/MyDrive/OTTO/submission/submission_575.csv", index=False)
pred_df.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 438191 731692 1790770 942...
1,12899780_clicks,1142000 736515 973453 582732 1502122 889686 48...
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 7594...
3,12899782_clicks,834354 595994 740494 889671 987399 779477 1344...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...
