# OTTO: Co-visitation Matrix

There exist products that are frequently viewed and bought together. Here we leverage this idea by computing a co-visitation matrix of products. It's done in the following way:

1. First we look at all pairs of events within the same session that are close to each other in time (< 1 day). We compute the co-visitation matrix $M_{aid1,aid2}$ by counting, across all sessions, how many times each pair $(aid1, aid2)$ occurs.
2. For each $aid1$ we find the top 20 most frequent $aid2$: `aid2=argsort(M[aid1])[-20:]`
3. We produce test results by concatenating `tail(20)` of the test session events (see https://www.kaggle.com/code/simamumu/old-test-data-last-20-aid-get-lb0-947) with the most likely recommendations from the co-visitation matrix. These recommendations are generated from the session AIDs and the `aid2` lists from step 2.


**Please, smash that thumbs up button and subscribe if you like this notebook!**

## Utils, imports

In [1]:
# Run identifiers; both end up in the name of the CSV written at the end
# of the notebook (submission_v{VER}_v{W_VER}.csv).
VER = 564
W_VER = 33

# Maximum number of aids returned per session by the suggest_* functions.
ITEM_CT = 50

In [2]:
### import numpy as np
from collections import defaultdict
import pandas as pd
from tqdm.notebook import tqdm
import glob
import numpy as np, gc
import multiprocessing
import os
import pickle

import glob
from collections import Counter

## Test set inference

In [3]:
def load_test(pattern='/raid/Kaggle/otto/valid/test_parquet/*'):
    """Read every parquet chunk matching ``pattern`` into a single DataFrame.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern selecting the chunk files.  The default is the location
        this notebook was written against.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in sorted file-name order, with a fresh
        0..n-1 index.  Column values (including ``ts``) are returned exactly
        as stored; no unit conversion is applied.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern``.
    """
    # glob.glob returns files in arbitrary (filesystem) order; sort so that
    # the row order of the result is reproducible between runs and machines.
    chunk_files = sorted(glob.glob(pattern))
    if not chunk_files:
        raise FileNotFoundError(f'no parquet chunks match {pattern!r}')

    frames = [pd.read_parquet(chunk_file) for chunk_file in tqdm(chunk_files)]
    return pd.concat(frames).reset_index(drop=True)

In [4]:
# Alternative source: rebuild the frame from the raw parquet chunks.
#test_df = load_test()
# Load the cached test events. Judging by the displayed head, this file
# already carries the extra `d` column next to session/aid/ts/type.
test_df = pd.read_parquet('test_with_d.parquet')

In [6]:
#test_df.ts = (test_df.ts/1000).astype('int32')

In [7]:
# Peek at the first rows to confirm the expected columns are present.
test_df.head()

Unnamed: 0,session,aid,ts,type,d
0,11098528,11830,1661119200,0,0
1,11098529,1105029,1661119200,0,0
2,11098530,264500,1661119200,0,0
3,11098530,264500,1661119288,0,0
4,11098530,409236,1661119369,0,0


In [8]:
# COMPUTED FROM TEST DATA
# Hard-coded popularity lists (100 aids each), one per event type.
# `top_clicks` and `top_orders` are used by suggest_aids / suggest_orders to
# pad a session's candidate list when the co-visitation tables do not supply
# enough items; `top_carts` is not referenced by the code visible in this
# notebook.  Presumably ordered most popular first -- TODO confirm.
top_orders = [ 876493, 1406660, 1236775,  166037, 1460571, 1531805,  836852,
        634452,  923948, 1043508,  832192,  801774,  258353,  332654,
       1596897,   29735,  544144, 1581568, 1006198, 1462420,  331708,
        231487,  642804,  986164, 1116095,  329725, 1022566, 1604220,
       1733943,  289406,  631899,  162064,  122983,  247240, 1436280,
        954951, 1445562,  373490, 1568011,  172856,  756588,  184976,
       1534690,  508883, 1310373,  530377, 1457846, 1196256,  321547,
        982423,  892871, 1441266, 1294924, 1685214, 1609228,  576949,
        714524,   56279, 1102089,  314210, 1125638, 1768724, 1734305,
        898836, 1188425,  145332,  688602,  527209, 1647157,   42628,
       1628069, 1722991,  409620,  102416, 1476166, 1734475, 1182614,
        926412,  988767,  137514,  500609,  479970,  165160,  443425,
        302096, 1257293,  823143,  856779, 1030009,  304807, 1164832,
         78262, 1025795, 1133209,  881286, 1620020,  742601,  108125,
        558573, 1107897]
top_carts = [ 485256,   33343,  613493,  152547,  876493, 1406660, 1736857,
        166037, 1460571, 1236775,  554660,  660655, 1531805, 1022566,
        231487,  122983, 1562705,  923948,  332654,  544144,  832192,
        634452,  322370,  836852,   29735, 1006198, 1043508,  801774,
       1462420,  258353,  530377,  558573, 1116095,  756588,  986164,
        329725, 1568011, 1581568, 1025795, 1445562,   42628, 1733943,
        642804, 1604220,  184976,  892871,  162064,  501077, 1182614,
       1647157,  331708, 1310373, 1534690, 1111967, 1476166, 1257293,
         19665, 1768724,  108125, 1502122,  172856,  982423, 1734475,
        409620, 1586171, 1685214, 1662401,  881286, 1164832,  373490,
        637827, 1125638, 1702657, 1281615,  714524,  500609,  508883,
         78262, 1359971, 1102089, 1734305, 1628069, 1436280, 1596897,
        631899, 1030009, 1497089,  988767,  670066,  289406,  450505,
       1196256,  247240,   16151, 1497245, 1689044,  479970, 1609228,
       1853288, 1636724]
top_clicks = [ 485256, 1460571,  108125, 1551213,   33343,  613493,  876493,
        152547,  184976, 1236775, 1406660,  836852,  331708, 1531805,
         29735,  554660,  634452,  986164,  959208,  832192,  332654,
        620545, 1645990,  166037,  102345, 1116095,  811371,  322370,
       1733943, 1043508, 1019736,  756588,  801774,  231487,  861401,
       1502122, 1030009,  329725, 1624436, 1022566, 1006198,  496180,
       1685214, 1853288,  530377,  659399, 1604220, 1102089, 1586171,
       1497089,  754412, 1765072, 1109824, 1264313,  881286, 1294924,
        199409, 1581568,  385065,  195381,  558573, 1196256, 1782099,
        612920,  435253,  923948,  508883, 1126038, 1647563, 1508062,
       1182614,  544144,  247240,  982423,   57315, 1628069,  674590,
       1798916,  892871,  909449,  337471,  721034, 1734305,   95488,
       1722991, 1754984,  819288, 1462420,  836707, 1796103,   42628,
        642804,  122983,  321547, 1278671,  714524, 1603001,  409620,
        636101, 1702657]

In [9]:
# Directory holding the pickled co-visitation tables.
COVISIT_DIR = '../../data/covisit_matrices'


def _load_covisit(*file_names):
    """Load one or more pickled table shards and merge them into one object.

    The first shard is loaded as-is; every further shard is merged into it
    with ``.update()``, so later shards win on duplicate keys.  Each file is
    opened in a ``with`` block so the handle is closed as soon as it is read.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
    file.  Only use this on caches you produced yourself.
    """
    merged = None
    for file_name in file_names:
        with open(f'{COVISIT_DIR}/{file_name}', 'rb') as handle:
            shard = pickle.load(handle)
        if merged is None:
            merged = shard
        else:
            merged.update(shard)
    return merged


print('Reading top20 AIDs from cache')
top_20 = _load_covisit('top_80_aids_v93.pkl')  # time weight. All2all; added time decay

top_20b = _load_covisit('top_40_aids_v217_0.pkl')  # both ways 1; td did not help
top_20c = _load_covisit('top_40_aids_v220_0.pkl')  # both ways 2; td did not help
top_20d = _load_covisit('top_40_aids_v226_0.pkl')  # forward 3 buy; td did not help

top_20e = _load_covisit('top_40_aids_v232_0.pkl')  # both ways 3; added time decay

# both ways 6; time decay (stored as two shards)
top_20f = _load_covisit('top_40_aids_v235_0.pkl', 'top_40_aids_v235_1.pkl')

top_20_orders = _load_covisit('top_40_orders_carts_v95.pkl')  # added time decay
top_20_carts = top_20_orders  # alias: carts share the orders table (same object)

top_20_buy2buy = _load_covisit('top_40_buy2buy_v90.pkl')  # td did not help
top_20_buy2buy2 = _load_covisit('top_40_buy2buy_v99.pkl')  # time decay

# recent 3 week time delta decay. all to buy (three shards, time decay)
top_20_test = _load_covisit(*[f'top_40_aids_v116_{k}.pkl' for k in range(3)])

# recent 2 week time delta decay weight (three shards, time decay)
top_20_test2 = _load_covisit(*[f'top_40_aids_v115_{k}.pkl' for k in range(3)])

top_20_buy = _load_covisit('top_40_aids_v239_0.pkl')

Reading top20 AIDs from cache


In [13]:
# Sanity check: number of entries in three of the loaded tables.
len( top_20), len( top_20_orders ), len( top_20_buy2buy )

(1812132, 1819763, 1072368)

In [14]:
# Every co-visitation source key that the suggest_* functions test for.
INCLUDE = [
    'top20',
    'top20b',
    'top20c',
    'top20d',
    'top20e',
    'top20f',
    'top20orders',
    'top20buy2buy',
    'top20buy2buy2',
    'top20test',
    'top20test2',
    'top20buy',
]

# Keys that are switched on for this run; the suggest_* functions only
# score neighbours from sources listed here.
INFER = ['top20']

In [15]:
import itertools

# Score multiplier per event, keyed by the integer value of the `type` column
# (presumably 0 = click, 1 = cart, 2 = order -- TODO confirm the encoding).
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

def _dedupe_reversed(values):
    """Return the distinct items of ``values`` in reverse order of appearance.

    For a chronological list this is "most recent first", with each item
    placed at the position of its latest occurrence.
    """
    return list(dict.fromkeys(values[::-1]))


def _ranked_neighbours(table, seeds, k):
    """Yield ``(seed_pos, rank, aid)`` for the first ``k`` neighbours of each seed.

    ``seed_pos`` counts only the seeds that are present in ``table``;
    ``rank`` is the position inside that seed's own neighbour list.  Both are
    tracked per seed, so a seed with fewer than ``k`` neighbours cannot shift
    the weights of the seeds that follow it.
    """
    seed_pos = 0
    for seed in seeds:
        if seed not in table:
            continue
        for rank, aid in enumerate(table[seed][:k]):
            yield seed_pos, rank, aid
        seed_pos += 1


def _boost_first_doubled(scores, table, seeds, k, weight_of_seed):
    """Add ``weight_of_seed(seed_pos)`` per neighbour; a seed's top neighbour gets it twice."""
    for seed_pos, rank, aid in _ranked_neighbours(table, seeds, k):
        weight = weight_of_seed(seed_pos)
        scores[aid] += weight
        if rank == 0:
            # Two separate additions keep the float rounding of "+= w; += w".
            scores[aid] += weight


def _boost_by_rank(scores, table, seeds, rank_weights, combine):
    """Add ``combine(rank_weights[rank], seed_pos)`` for each neighbour.

    At most ``len(rank_weights)`` neighbours are taken per seed.
    """
    for seed_pos, rank, aid in _ranked_neighbours(table, seeds, len(rank_weights)):
        scores[aid] += combine(rank_weights[rank], seed_pos)


def _fill_candidates(session_aids, scores, popular, limit):
    """Assemble the final list: session aids, best-scored new aids, popular filler.

    The result holds at most ``limit`` distinct aids.  Scored candidates are
    taken in descending score order (ties keep insertion order) after
    skipping aids already in the session, so a session item among the top
    scores does not cost a slot.  Remaining room is filled from ``popular``,
    again skipping anything already chosen.
    """
    picked = list(session_aids[:limit])
    used = set(session_aids)
    ranked = (aid for aid, _ in scores.most_common())
    for source in (ranked, popular):
        for aid in source:
            if len(picked) >= limit:
                return picked
            if aid not in used:
                used.add(aid)
                picked.append(aid)
    return picked


def _visit_closing_aids(df):
    """aid of the highest-``ts`` row for every distinct ``d`` value, newest first (not de-duplicated)."""
    return df.sort_values('ts', ascending=False).drop_duplicates('d').aid.tolist()


def _last_visit_aids(df):
    """Distinct aids of the rows whose ``d`` equals the maximum ``d``, most recent first."""
    return _dedupe_reversed(df.loc[df.d == df.d.max()].aid.tolist())


def _bought_aids(df):
    """Distinct aids of the rows with ``type`` 1 or 2, most recent first."""
    return _dedupe_reversed(df.loc[df['type'].isin([1, 2])].aid.tolist())


def suggest_aids(df):
    """Build the click-candidate list for one session.

    Parameters
    ----------
    df : pandas.DataFrame
        Events of a single session with columns ``aid``, ``ts``, ``type`` and
        ``d``.  Rows are assumed to be in chronological order, because the
        aid list is simply reversed to get "most recent first"
        (NOTE(review): confirm the cached parquet is sorted by session, ts).

    Returns
    -------
    list
        At most ``ITEM_CT`` distinct aids: the session's own aids (most
        recent first), then co-visitation neighbours by descending score,
        then ``top_clicks`` as filler.

    Only the sources whose key is listed in the global ``INFER`` contribute.
    The extra seed lists some sources need are computed on demand, so any key
    can be enabled without touching this function.
    """
    session_aids = _dedupe_reversed(df.aid.tolist())
    n_seen = len(session_aids)
    scores = Counter()

    def flat(seed_pos):
        return 1

    if 'top20' in INFER:
        # Neighbours of older session items count less (1.0 down towards 0.1).
        _boost_first_doubled(scores, top_20, session_aids, 20,
                             lambda pos: 0.1 + 0.9 * (n_seen - pos) / n_seen)
    if 'top20b' in INFER:
        _boost_first_doubled(scores, top_20b, session_aids[:2], 20, flat)
    if 'top20test2' in INFER:
        _boost_first_doubled(scores, top_20_test2, session_aids[:2], 20, flat)
    if 'top20f' in INFER:
        _boost_first_doubled(scores, top_20f, _last_visit_aids(df), 10,
                             lambda pos: 1 - pos * 0.1)
    if 'top20e' in INFER:
        # Seeds run from the oldest ``d`` group to the newest here; the weight
        # is flat, so the order only decides ties.
        closing = _dedupe_reversed(_visit_closing_aids(df))
        _boost_first_doubled(scores, top_20e, closing, 20, flat)
    if 'top20c' in INFER:
        # Second hop: expand the current best-scored aid, unless it is already
        # part of the session.  With nothing scored yet there is no seed.
        in_session = set(session_aids)
        leaders = [aid for aid, _ in scores.most_common(1) if aid not in in_session]
        _boost_first_doubled(scores, top_20c, leaders, 10, flat)

    return _fill_candidates(session_aids, scores, top_clicks, ITEM_CT)


def suggest_orders(df):
    """Build the order-candidate list for one session.

    Parameters
    ----------
    df : pandas.DataFrame
        Events of a single session with columns ``aid``, ``ts``, ``type`` and
        ``d``, assumed to be in chronological order
        (NOTE(review): confirm the cached parquet is sorted by session, ts).

    Returns
    -------
    list
        At most ``ITEM_CT`` distinct aids: the session's own aids (most
        recent first), then co-visitation neighbours by descending score,
        then ``top_orders`` as filler.

    Only the sources whose key is listed in the global ``INFER`` contribute.
    Neighbour weights depend on the neighbour's rank within its own seed's
    list (the first two count double) and, for some sources, on how recent
    the seed is.
    """
    session_aids = _dedupe_reversed(df.aid.tolist())
    n_seen = len(session_aids)

    # Per-rank weights; the list length is also the number of neighbours
    # taken per seed.
    head10 = [2, 2] + [1] * 8
    head20 = [2, 2] + [1] * 18
    head55 = [2, 2] + [1] * 53

    def recency(weight, seed_pos):
        # 1.0 for the most recent seed, falling towards 0.25 for the oldest.
        return weight * (0.25 + 0.75 * (n_seen - seed_pos) / n_seen)

    def halved(weight, seed_pos):
        return weight / 2

    def plain(weight, seed_pos):
        return weight

    bought = None
    if any(key in INFER for key in ('top20buy2buy', 'top20buy2buy2', 'top20d')):
        bought = _bought_aids(df)

    scores = Counter()
    if 'top20orders' in INFER:
        _boost_by_rank(scores, top_20_orders, session_aids, head10, recency)
    if 'top20buy2buy' in INFER:
        _boost_by_rank(scores, top_20_buy2buy, bought, head10, halved)
    if 'top20test' in INFER:
        _boost_by_rank(scores, top_20_test, session_aids, head10, recency)
    if 'top20buy2buy2' in INFER:
        _boost_by_rank(scores, top_20_buy2buy2, bought, head10, halved)
    if 'top20c' in INFER:
        _boost_by_rank(scores, top_20c, session_aids[:1], head55, plain)
    if 'top20d' in INFER:
        _boost_by_rank(scores, top_20d, bought[:1], head20, plain)
    if 'top20buy' in INFER:
        # Seeds: aids seen within 60*60/2 = 1800 ts units of the newest event
        # (30 minutes if ``ts`` is in seconds -- TODO confirm the unit).
        recent = _dedupe_reversed(
            df.loc[df.ts >= df.ts.max() - 60 * 60 / 2].aid.tolist()
        )
        _boost_by_rank(scores, top_20_buy, recent, head20,
                       lambda weight, seed_pos: 2 * weight / len(recent))
    if 'top20f' in INFER:
        _boost_first_doubled(scores, top_20f, _last_visit_aids(df), 5,
                             lambda pos: 1 / 2 - pos * 0.05)
    if 'top20e' in INFER:
        # Last event of every ``d`` group, newest group first.
        closing = list(dict.fromkeys(_visit_closing_aids(df)))
        _boost_by_rank(scores, top_20e, closing, head55,
                       lambda weight, seed_pos: weight - seed_pos * 0.1)
        # First row of every ``d`` group, newest group first.
        opening = _dedupe_reversed(df.drop_duplicates('d').aid.tolist())
        _boost_by_rank(scores, top_20e, opening, head10,
                       lambda weight, seed_pos: weight / 2. - seed_pos * 0.05)

    return _fill_candidates(session_aids, scores, top_orders, ITEM_CT)

In [16]:
# pandarallel provides the `parallel_apply` used on the session groupby below.
# Configured for 4 worker processes with a progress bar; use_memory_fs=True
# asks it to move data between the main process and the workers through a
# memory file system.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=4, use_memory_fs=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [17]:
#%%time
#test_df = test_df.sort_values(["session", "ts"])
#test_df['d'] = test_df.groupby('session').ts.diff()
#test_df.d = (test_df.d > 60*60*2).astype('int16').fillna(0)
#test_df.d = test_df.groupby('session').d.cumsum()

In [18]:
%%time
# Run suggest_aids once per session, in parallel. The result holds one list
# of candidate aids per session id.
pred_df = test_df.groupby(["session"]).parallel_apply(
    lambda x: suggest_aids(x)
)
#pred_df_orders = pred_df

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=450313), Label(value='0 / 450313')…

CPU times: user 3min 17s, sys: 26.3 s, total: 3min 43s
Wall time: 7min 23s


In [19]:
#%%time
#pred_df_orders = test_df.groupby(["session"]).parallel_apply(
#    lambda x: suggest_orders(x)
#)
#pred_df = pred_df_orders

In [20]:
#%%time
#pred_df_carts = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
#    lambda x: suggest_carts(x)
#)

In [21]:
# Append "_clicks" to every label via add_suffix, wrap the per-session aid
# lists in a frame with a "labels" column, and move the index into a regular
# column.
# NOTE: this rebinds `pred_df`, so the cell is not re-runnable on its own;
# re-run the prediction cell first.
pred_df = pd.DataFrame(pred_df.add_suffix("_clicks"), columns=["labels"]).reset_index()
#pred_df = pd.DataFrame(pred_df_orders.add_suffix("_orders"), columns=["labels"]).reset_index()
#carts_pred_df = pd.DataFrame(pred_df_orders.add_suffix("_carts"), columns=["labels"]).reset_index()

In [22]:
#pred_df_orders

In [23]:
# Disabled: stacking of separate click / order prediction frames.
#pred_df = pd.concat(
#    [clicks_pred_df, orders_pred_df]
#)
# Name the two columns, flatten each list of aids into one space-separated
# string, and write the result to a CSV whose name carries VER and W_VER.
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv(f"../../data/candidate_scores/submission_v{VER}_v{W_VER}.csv", index=False)