# OTTO: Co-visitation Matrix

There exist products that are frequently viewed and bought together. Here we leverage this idea by computing a co-visitation matrix of products. It's done in the following way:

1. First we look at all pairs of events within the same session that are close to each other in time (< 1 day). We compute the co-visitation matrix $M_{aid1,aid2}$ by counting, across all sessions, the number of such event pairs for each pair of products $(aid1, aid2)$.
2. For each $aid1$ we find the top 20 most frequent $aid2$: `aid2 = argsort(M[aid1])[-20:]`
3. We produce test results by concatenating the `tail(20)` of the test session events (see https://www.kaggle.com/code/simamumu/old-test-data-last-20-aid-get-lb0-947) with the most likely recommendations from the co-visitation matrix. These recommendations are generated from the session AIDs and the `aid2` lists from step 2.


**Please, smash that thumbs up button and subscribe if you like this notebook!**

## Utils, imports

In [1]:
# Version tag and suffix; both are interpolated into the file name of the
# candidate pickle written by the last cell (clicks_v{VER}{POSTFIX}.pkl).
VER = '620'
POSTFIX = '_LB'

# NOTE(review): OFFSET is not referenced in any visible cell — confirm it is still needed.
OFFSET = ''

# ITEM_CT2 is the number of candidates suggest_aids returns per session
# (it caps most_common() and the top_clicks padding).
ITEM_CT2 = 50
# NOTE(review): ITEM_CT is not referenced in any visible cell — confirm it is still needed.
ITEM_CT = 50

In [2]:
### import numpy as np
from collections import defaultdict
import pandas as pd
from tqdm.notebook import tqdm
import glob
import numpy as np, gc
import multiprocessing
import os
import pickle

import glob
from collections import Counter

## Test set inference

In [8]:
# COMPUTED FROM TEST DATA
# Three hard-coded lists of 100 aids each. Judging by the names these are the
# most popular items per event type (orders / carts / clicks) — TODO confirm
# how they were computed and in which order they are sorted.
# NOTE(review): top_orders is not referenced in the visible cells.
top_orders = [ 986164, 1460571,  329725, 1043508,  332654,  688602,   29735,
       1495817,  579690, 1022566, 1006198,  471073,  832192,  544144,
       1825743,  836852,  147526, 1236775,  166037, 1030009, 1609228,
        508883,  923948, 1462420,  892871,  554660, 1457846,  258353,
       1734475,  480314,  145332,  108125, 1286213, 1336175, 1359971,
        137514,  714524,  558573,  172856,  585186,  352192, 1176975,
       1146575,  954951, 1496287,  823143, 1699089,   25964, 1257293,
        399315, 1441266, 1196256, 1294924, 1603001, 1274545,  414968,
       1581568,  247240, 1116095,  383437,  530377,  272744, 1445562,
        269257,  791627, 1140985, 1708326,  631899,  670066,  122983,
        223273,  165160,  881286, 1768724,  868327, 1604220,  406358,
       1722991, 1568011, 1025795, 1647563,  835431, 1531805,  714968,
        500609, 1217083, 1668343, 1159757, 1610239, 1647157, 1264313,
       1798916,  423558,  752652,  184976, 1255910, 1413049,  801774,
        615566, 1034578]
# NOTE(review): top_carts is not referenced in the visible cells.
top_carts = [ 485256,   33343, 1460571,  986164,  554660,  660655, 1116095,
        152547, 1022566,  544144,  832192,  579690,  329725, 1043508,
       1006198,  558573,  471073,  332654,  688602,   29735,  508883,
        258353, 1736857, 1462420,  166037, 1609228, 1778843,  108125,
       1495817, 1604220, 1825743, 1562705,  147526,  836852, 1286213,
         25964, 1236775,  923948, 1281615, 1257293,  917587,  835431,
       1439409,  892871,  125957,  122983, 1097061, 1449873, 1568011,
       1030727, 1146575, 1731920,  326904, 1196256,  714524, 1768724,
        480314, 1800674, 1662401, 1359971,  455191,  496180,  145332,
        616283, 1708326, 1294924, 1270528,  944778, 1223508,  881286,
        165160,  272744,  670066,  868327, 1734475,  137514,  172856,
       1122221,  442293, 1685214,  823143, 1413049, 1722991, 1647157,
        406358, 1733943,  700995, 1025795,  754412,  530377,  102416,
        184976, 1445562, 1565495, 1019736, 1274545, 1083665,  667563,
       1264313,  563117]
# top_clicks is the fallback used by suggest_aids: its leading entries pad the
# candidate list whenever fewer than ITEM_CT2 candidates were found.
top_clicks = [1460571,  485256,  108125,  986164, 1551213,  754412,  554660,
        832192,  579690,   33343, 1006198,  688602,   29735,  329725,
        184976, 1019736,  496180,  861401,  944778,  659399, 1043508,
       1022566,  811371, 1604220,  836852,  471073,  819288, 1264313,
        508883, 1751274,  620545,  959208,  717965,  332654, 1731920,
        544144,  147526, 1116095, 1294924,  102345, 1645990, 1497089,
        558573,   95488, 1196256,  199409, 1110150, 1146575, 1236775,
        137514, 1030009,  435253, 1800674,  881286, 1609228, 1286213,
        337471,  670066,  831165, 1685214, 1673641,  909449, 1260564,
       1099100,  995962,  612920, 1647563, 1462420, 1741695, 1281615,
       1603001, 1722991,  442293,  206735, 1219503,  166037,  799923,
       1469891,  557072, 1156699,  111891, 1624436, 1782099, 1639229,
        530377, 1197632, 1140985,  152547,  247240, 1449873, 1825743,
        901817, 1420240, 1733943,  542343,  680375,  406358,  147278,
       1627951,  836707]

In [10]:
%%time

def _load_covisit_table(path, keep):
    """Load a pickled co-visitation table and truncate every neighbour list.

    Parameters
    ----------
    path : str
        Location of a pickle holding a dict ``aid -> sequence of neighbour aids``.
    keep : int
        Maximum number of neighbours retained per aid (``v[:keep]``).

    Returns
    -------
    dict
        The loaded dict, modified in place so that each value is a list of
        ``np.int32`` with at most ``keep`` entries.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load artifacts
    # produced by this project's own pipeline.
    # The context manager closes the file handle (the bare open() calls used
    # previously leaked one handle per table).
    with open(path, 'rb') as f:
        table = pickle.load(f)
    # Re-assigning existing keys while iterating is safe: the dict size never changes.
    for k, v in table.items():
        table[k] = [np.int32(x) for x in v[:keep]]
    return table


# Each table maps aid -> list of co-visited aids; the second argument is the
# number of neighbours kept per aid. Trailing "#NN" notes are carried over
# unchanged from the original cell.
top_20 = _load_covisit_table('../../data/covisit_matrices/top_40_aids_v104.pkl', 80) #23

top_20b = _load_covisit_table('../../data/covisit_matrices/top_40_aids_v23.pkl', 30) #23
top_20c = _load_covisit_table('../../data/covisit_matrices/top_80_aids_v24.pkl', 55) #55
top_20d = _load_covisit_table('../../data/covisit_matrices/top_30_aids_v28.pkl', 20) #20

top_20e = _load_covisit_table('../../data/covisit_matrices/top_80_aids_v130.pkl', 55) #55 #UPGRADED

top_20f = _load_covisit_table('../../data/covisit_matrices/top_80_aids_v132.pkl', 20) #10 #NEW

top_20_test2 = _load_covisit_table('../../data/covisit_matrices/top_40_aids_v34.pkl', 40) #23

CPU times: user 5min 29s, sys: 13.2 s, total: 5min 43s
Wall time: 5min 42s


In [12]:
%%time
# Load the co-visitation table used for single-item sessions and keep at most
# 30 neighbours per aid, stored as np.int32.
# SECURITY: pickle.load can execute arbitrary code; only load trusted artifacts.
# The context manager closes the file handle, which the bare open() call leaked.
with open('../../data/covisit_matrices/top_40_aids_v801_0.pkl', 'rb') as f:
    top_20_new2 = pickle.load(f)
for k, v in top_20_new2.items():
    top_20_new2[k] = [np.int32(x) for x in v[:30]]

In [14]:
# Sanity check: number of aids that have an entry in the main co-visitation table.
len(top_20)

1837262

In [15]:
import itertools

# Score multiplier per event type code.
# NOTE(review): presumably 0 = click, 1 = cart, 2 = order — confirm against the data prep.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}


def _chain_neighbours(table, source_aids, limit):
    """Concatenate the first `limit` neighbours of every aid of `source_aids` present in `table`."""
    return list(itertools.chain.from_iterable(
        table[aid][:limit] for aid in source_aids if aid in table
    ))


def _add_constant(scores, candidates, weight):
    """Add `weight` to the score of every candidate (duplicates accumulate)."""
    for aid in candidates:
        scores[aid] += weight


def _add_chunked(scores, candidates, chunk, weight_at=None):
    """Add a weight per candidate, counted twice at every `chunk`-th flat position.

    `weight_at(i)` gives the weight for flat position `i`; when omitted the
    weight is 1. The two separate additions are kept (instead of `2 * w`) so
    floating-point accumulation is unchanged.

    NOTE(review): positions are flat indices into the concatenated list, so the
    "first neighbour of each source aid" bonus only lines up when every source
    aid contributed exactly `chunk` neighbours.
    """
    for i, aid in enumerate(candidates):
        w = 1 if weight_at is None else weight_at(i)
        scores[aid] += w
        if i % chunk == 0:
            scores[aid] += w


def _rank_unseen(session, scores, seen):
    """Return `(session, candidates)`: top-scored aids not in `seen`, padded with `top_clicks`.

    NOTE(review): the `seen` filter is applied AFTER truncating to the top
    ITEM_CT2 scores, so fewer than ITEM_CT2 scored candidates may survive even
    when more exist; behaviour kept as in the original.
    """
    result = [aid for aid, _ in scores.most_common(ITEM_CT2) if aid not in seen]
    return session, (result + top_clicks[:ITEM_CT2 - len(result)])[:ITEM_CT2]


def suggest_aids(df):
    """Generate up to ITEM_CT2 candidate aids for one session.

    Parameters
    ----------
    df : indexable
        One session record. Only positions 0, 1, 2, 4 and 6 are read:
        ``df[0]`` session id, ``df[1]`` aids, ``df[2]`` event types (keys of
        ``type_weight_multipliers``), ``df[4]`` a per-event value whose maximum
        selects a subset of events, ``df[6]`` a per-event flag (events with
        flag ``== 1`` are selected).
        NOTE(review): the removed commented-out pandas code suggests ``df[4]``
        is a per-event sub-session/day index ``d`` and ``df[6]`` marks the last
        event of each ``d`` — confirm against the list-building code.

    Returns
    -------
    tuple
        ``(session, candidates)`` where ``candidates`` is a list of at most
        ITEM_CT2 aids that do not occur in the session, padded with the head
        of ``top_clicks`` when too few scored candidates remain.

    Notes
    -----
    Reads the module-level co-visitation tables ``top_20``, ``top_20b``,
    ``top_20c``, ``top_20e``, ``top_20f``, ``top_20_test2``, ``top_20_new2``
    and the constants ``ITEM_CT2`` / ``top_clicks``.
    """
    session = df[0]
    aids = df[1]
    types = df[2]
    ds = df[4]
    ds2 = df[6]

    # Session aids, most recent first, de-duplicated.
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # Set view for O(1) "already in session" checks (was a list scan per candidate).
    seen = set(unique_aids)

    # Aids of events flagged with ds2 == 1, most recent first.
    unique_aids3 = list(dict.fromkeys([aid for i, aid in enumerate(aids) if ds2[i] == 1][::-1]))

    # Aids of events whose ds value equals the session maximum, most recent first.
    mx = np.max(ds)
    unique_aids4 = list(dict.fromkeys([aid for i, aid in enumerate(aids) if ds[i] == mx][::-1]))

    # Aids of events with type 1 or 2, most recent first.
    # NOTE(review): computed but never used below.
    unique_buys = list(dict.fromkeys([aid for i, aid in enumerate(aids) if types[i] in [1, 2]][::-1]))

    ln = len(unique_aids)

    # ---- Long sessions (>= 15 distinct aids) --------------------------------
    if ln >= 15:
        # Recency weights grow from 2**0.1 - 1 (oldest) to 1 (newest).
        weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        scores = Counter()
        for aid, w, t in zip(aids, weights, types):
            scores[aid] += w * type_weight_multipliers[t]
        _add_constant(scores, _chain_neighbours(top_20c, unique_aids[:2], 20), 0.6)
        _add_constant(scores, _chain_neighbours(top_20b, unique_aids3, 15), 0.3)
        _add_constant(scores, _chain_neighbours(top_20_test2, unique_aids[:2], 20), 0.6)
        return _rank_unseen(session, scores, seen)

    # ---- Short sessions ------------------------------------------------------
    scores = Counter()

    if ln == 1:
        # First two neighbours count double. zip() truncates to the shorter
        # sequence, so aids with fewer than 30 neighbours are still scored
        # (previously the weight list collapsed to [] and they were dropped).
        single_item_weights = [2, 2] + [1] * 28
        for aid, w in zip(_chain_neighbours(top_20_new2, unique_aids[-1:], 30), single_item_weights):
            scores[aid] += w

    # Neighbours of every session aid, weighted down for older source aids.
    _add_chunked(
        scores, _chain_neighbours(top_20, unique_aids, 20), 20,
        weight_at=lambda i: 0.1 + 0.9 * (ln - (i // 20)) / ln,
    )
    # Neighbours of the two most recent aids from two further tables.
    _add_chunked(scores, _chain_neighbours(top_20b, unique_aids[:2], 20), 20)
    _add_chunked(scores, _chain_neighbours(top_20_test2, unique_aids[:2], 20), 20)
    # Neighbours of the aids with maximal ds, decaying by 0.1 per chunk of 10.
    _add_chunked(
        scores, _chain_neighbours(top_20f, unique_aids4, 10), 10,
        weight_at=lambda i: 1 - (i // 10) * 0.1,
    )
    # Neighbours of the ds2-flagged aids.
    _add_chunked(scores, _chain_neighbours(top_20e, unique_aids3, 20), 20)

    # Second hop: expand the single best candidate so far (if it is not a session aid).
    best_so_far = [aid for aid, _ in scores.most_common(1) if aid not in seen]
    _add_chunked(scores, _chain_neighbours(top_20c, best_so_far, 10), 10)

    return _rank_unseen(session, scores, seen)

In [16]:
import psutil
# Report the CPU count detected by psutil on this machine.
# NOTE(review): a later cell overwrites N_CORES with a fixed value, so this
# assignment is informational only.
N_CORES = psutil.cpu_count()     
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool

N Cores : 40


In [17]:
# Upper bound on the number of worker processes used for inference.
N_CORES = 20
def df_parallelize_run(func, t_split):
    """Apply `func` to every element of `t_split` in a process pool.

    Parameters
    ----------
    func : callable
        Picklable function of one argument.
    t_split : sequence
        Work items; one call to `func` per item.

    Returns
    -------
    list
        Results in the same order as `t_split`. An empty input returns `[]`
        without starting a pool (previously `Pool(0)` raised ValueError).
    """
    if len(t_split) == 0:
        return []

    # Never start more workers than there are work items.
    num_cores = min(N_CORES, len(t_split))
    pool = Pool(num_cores)
    try:
        results = pool.map(func, t_split)
    finally:
        # Release worker processes even when func raises in a worker.
        pool.close()
        pool.join()

    return results

In [18]:
%%time
# The test sessions are stored as PIECES pickled lists; read them in order and
# concatenate them into one flat list of per-session records.
PIECES = 10
valid_bysession_list = []
for PART in range(PIECES):
    part_path = f'../../data/infer_data/lists/test_group_tolist_{PART}_1.pkl'
    with open(part_path, 'rb') as f:
        valid_bysession_list += pickle.load(f)
print(len(valid_bysession_list))

1671803
CPU times: user 47.2 s, sys: 67.8 ms, total: 47.3 s
Wall time: 47.2 s


In [19]:
%%time
# Score every test session in parallel; each result is a (session, candidates) pair.
temp = df_parallelize_run(suggest_aids, valid_bysession_list)
# Series of candidate lists indexed by session id.
val_clicks = pd.Series(
    data=[candidates for _, candidates in temp],
    index=[session for session, _ in temp],
)

CPU times: user 56.3 s, sys: 14.8 s, total: 1min 11s
Wall time: 1min 26s


In [20]:
%%time
# Despite the name, click_df is a plain dict: session id -> list of candidate aids.
click_df = val_clicks.to_dict()

CPU times: user 1.37 s, sys: 8.07 ms, total: 1.38 s
Wall time: 1.37 s


In [21]:
import pickle
# Persist the session -> candidates mapping; the file name carries VER and POSTFIX.
candidates_path = f'../../data/candidate_scores/clicks_v{VER}{POSTFIX}.pkl'
with open(candidates_path, 'wb') as f:
    pickle.dump(click_df, f)