# OTTO: Co-visitation Matrix

There exist products that are frequently viewed and bought together. Here we leverage this idea by computing a co-visitation matrix of products. It's done in the following way:

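1. First we look at all pairs of events within the same session that are close to each other in time (< 1 day). We compute the co-visitation matrix $M_{aid1,aid2}$ by counting, across all sessions, how many times each pair of products occurs together. Note that the code below takes the pairs from the last 30 events of each session and, instead of counting each pair as 1, weights it by a time decay and by the type of the second event.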
2. For each $aid1$ we find the top 20 most frequent $aid2$: `aid2 = argsort(M[aid1])[-20:]`
3. We produce test results by concatenating the `tail(20)` of the test session events (see https://www.kaggle.com/code/simamumu/old-test-data-last-20-aid-get-lb0-947) with the most likely recommendations from the co-visitation matrix. These recommendations are generated from the session AIDs and the `aid2` candidates from step 2.


**Please, smash that thumbs up button and subscribe if you like this notebook!**

## Utils, imports

In [1]:
# Version tag; interpolated into the filename of the pickle written when the
# co-visitation result is saved.
VER = 118

In [2]:
### import numpy as np
from collections import defaultdict
import pandas as pd
from tqdm.notebook import tqdm
import glob
import numpy as np, gc
import multiprocessing
import os
import pickle

import glob
from collections import Counter

# When True, only the first 10000 rows of each chunk are used and processing
# stops after three chunks.
DEBUG=False
# Session subsampling modulus: only sessions with `session % SAMPLING == 0` are
# kept. 1 keeps every session; larger values keep fewer sessions and run faster.
SAMPLING = 1

In [3]:
TOP_20_CACHE = 'top_20_aids.pkl'

try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("gcloud")

    with open('/tmp/json', 'w+') as f:
        f.write(secret_value_0)
        
    !gcloud auth login --cred-file /tmp/json    
    !gsutil cp gs://nesp/top_20_aids.pkl .        
        
except Exception  as ex:
    pass

## Generate AID pairs

In [4]:
import sys
def gen_pairs(df, sampling=None, max_events=30):
    """Build ordered (aid_x, aid_y) pairs of events that share a session.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with at least the columns ``session``, ``aid``, ``ts`` and ``type``.
    sampling : int, optional
        Keep only sessions with ``session % sampling == 0``. Defaults to the
        module-level ``SAMPLING`` constant.
    max_events : int, default 30
        Only the last ``max_events`` rows of each session are paired.

    Returns
    -------
    numpy.ndarray
        One row per distinct ``(session, aid_x, aid_y)`` combination with
        ``aid_x != aid_y``; the columns are ``aid_x, aid_y, ts_x, ts_y, type_y``
        taken from the first occurrence of that combination. Shape is ``(0, 5)``
        when there is nothing to pair.
    """
    if sampling is None:
        sampling = SAMPLING

    sampled = df[df['session'] % sampling == 0]
    if sampled.empty:
        # np.array_split can hand out empty pieces; return a well-formed empty result.
        return np.empty((0, 5), dtype=object)

    # groupby.tail is the built-in equivalent of apply(lambda g: g.tail(n)) and
    # avoids calling a Python function once per session.
    recent = sampled.groupby('session', sort=False).tail(max_events).reset_index(drop=True)

    # Self-join on session yields every ordered combination of two events in a session.
    merged = pd.merge(recent, recent, on='session')
    pairs = merged[merged['aid_x'] != merged['aid_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    return pairs[['aid_x', 'aid_y', 'ts_x', 'ts_y', 'type_y']].values

In [5]:
%%time

#BOOST = 'orders'
#WEIGHT = 5
   
def gen_aid_pairs(all_pairs, chunk_glob='../../data/infer_data/*_parquet/*', n_workers=20, n_splits=120):
    """Accumulate weighted co-visitation scores from every parquet chunk.

    Each chunk is split into ``n_splits`` pieces, ``gen_pairs`` is run on the
    pieces in a process pool, and every returned pair adds ``w * m`` to
    ``all_pairs[aid1][aid2]`` where

    * ``w`` is 4 when the second event's type is ``'carts'`` or ``'orders'``,
      otherwise 1;
    * ``m = 0.5 ** (|ts2 - ts| / 60 / 60 / 1000)``, i.e. the weight halves for
      every 3,600,000 units of timestamp distance (one hour if ``ts`` is in
      milliseconds -- TODO confirm the unit).

    Parameters
    ----------
    all_pairs : mapping of aid -> Counter-like
        Accumulator updated in place; looking up a missing ``aid1`` must create
        an entry (e.g. ``defaultdict(Counter)``).
    chunk_glob : str
        Glob pattern of the parquet files to read.
    n_workers : int
        Size of the multiprocessing pool.
    n_splits : int
        Number of pieces each chunk is split into before being mapped.

    Returns
    -------
    The same ``all_pairs`` object.
    """
    # Sorted so that the set of chunks processed in DEBUG mode is reproducible.
    chunk_files = sorted(glob.glob(chunk_glob))
    max_rows = 10000 if DEBUG else 100000000

    with tqdm(chunk_files, desc='Chunks') as prog:
        with multiprocessing.Pool(n_workers) as p:
            for idx, chunk_file in enumerate(prog):
                chunk = pd.read_parquet(chunk_file)
                pair_chunks = p.map(gen_pairs, np.array_split(chunk.head(max_rows), n_splits))
                for pairs in pair_chunks:
                    for aid1, aid2, ts, ts2, typ in pairs:
                        w = 4 if typ in ('carts', 'orders') else 1
                        m = (1/2)**(abs(ts2 - ts)/60/60/1000)
                        all_pairs[aid1][aid2] += w*m

                # Shallow size of the outer mapping only (nested counters are not
                # included); it still grows with the number of distinct aid1 keys.
                prog.set_description(f'Mem: {sys.getsizeof(all_pairs) // (2 ** 20)}MB')

                # Release the chunk before deciding whether to stop.
                del chunk, pair_chunks
                gc.collect()

                if DEBUG and idx >= 2:
                    break
    return all_pairs
        
# Either reuse a cached top-20 mapping or compute the raw pair scores.
if os.path.exists(TOP_20_CACHE):
    print('Reading top20 AIDs from cache')
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # comes from a trusted source.
    with open(TOP_20_CACHE, 'rb') as cache_file:
        top_20 = pickle.load(cache_file)
    # NOTE(review): this branch does not define `all_pairs`, so the later cell
    # that iterates over `all_pairs` only works when the cache file is absent.
else:
    all_pairs = defaultdict(lambda: Counter())
    all_pairs = gen_aid_pairs(all_pairs)

Chunks:   0%|          | 0/146 [00:00<?, ?it/s]

CPU times: user 2h 36min 34s, sys: 1min 19s, total: 2h 37min 53s
Wall time: 2h 38min 36s


In [6]:
%%time
# Keep, for every aid1, the 40 aid2 with the highest accumulated score.
# A Series-level `astype('int32')` cannot cast cells that hold lists, so the ids
# are normalised to plain ints while each list is built.
top_40_rows = []
for aid, cnt in tqdm(all_pairs.items()):
    top_40_rows.append({'aid1': aid, 'aid2': [int(aid2) for aid2, freq in cnt.most_common(40)]})

# Explicit columns keep set_index working even when there are no rows.
df_top_40 = pd.DataFrame(top_40_rows, columns=['aid1', 'aid2']).set_index('aid1')
top_40 = df_top_40.aid2.to_dict()

top_40_path = f'../../data/covisit_matrices/top_40_orders_carts_v{VER}.pkl'
# Create the target directory up front so a long computation is not lost to a
# missing folder.
os.makedirs(os.path.dirname(top_40_path), exist_ok=True)
with open(top_40_path, 'wb') as f:
    pickle.dump(top_40, f)

  0%|          | 0/1842637 [00:00<?, ?it/s]

CPU times: user 8min 2s, sys: 3.14 s, total: 8min 5s
Wall time: 8min 3s
