# OTTO: Co-visitation Matrix

There exist products that are frequently viewed and bought together. Here we leverage this idea by computing a co-visitation matrix of products. It's done in the following way:

1. First we look at all pairs of events within the same session that are close to each other in time (< 1 day). We compute co-visitation matrix $M_{aid1,aid2}$ by counting global number of event pairs for each pair across all sessions.
2. For each $aid1$ we find top 20 most frequent aid2:  `aid2=argsort(M[aid])[-20:]`
3. We produce test results by concatenating `tail(20)` of test session events (see https://www.kaggle.com/code/simamumu/old-test-data-last-20-aid-get-lb0-947) with the most likely recommendations from co-visitation matrix. These recommendations are generated from session AIDs and `aid2` from the step 2


**Please, smash that thumbs up button and subscribe if you like this notebook!**

## Utils, imports

In [1]:
VER = 709

OFFSET = ''

ITEM_CT = 100
ITEM_CT2 = 100
ITEM_CT3 = 10

In [2]:
### import numpy as np
from collections import defaultdict
import pandas as pd
from tqdm.notebook import tqdm
import glob
import numpy as np, gc
import multiprocessing
import os
import pickle

import glob
from collections import Counter

## Test set inference

In [3]:
def load_test():    
    dfs = []
    for e, chunk_file in enumerate(tqdm(glob.glob('/raid/Kaggle/otto/valid/test_parquet/*'))):
        chunk = pd.read_parquet(chunk_file)
        #chunk.ts *= 1000
        dfs.append(chunk)

    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

In [4]:
#%%time
#test_df = load_test()

#test_df = test_df.sort_values(["session", "ts"])
#test_df['d'] = test_df.groupby('session').ts.diff()
#test_df.d = (test_df.d > 60*60*2).astype('int16').fillna(0)
#test_df.d = test_df.groupby('session').d.cumsum()

#test_df.to_parquet('../../Nov-2022/nov-12-22-otto/test.parquet',index=False)

In [5]:
test_df = pd.read_parquet('test_with_d.parquet')

In [6]:
#test_df = test_df.iloc[:4096]

In [7]:
#MN = 1661119200
MN = test_df.ts.min()
MN

1661119200

In [8]:
test_df['day'] = (test_df.ts - MN) // (60*60*24)

In [9]:
test_df.head()

Unnamed: 0,session,aid,ts,type,d,day
0,11098528,11830,1661119200,0,0,0
1,11098529,1105029,1661119200,0,0,0
2,11098530,264500,1661119200,0,0,0
3,11098530,264500,1661119288,0,0,0
4,11098530,409236,1661119369,0,0,0


In [10]:
# COMPUTED FROM TEST DATA
top_orders = [ 876493, 1406660, 1236775,  166037, 1460571, 1531805,  836852,
        634452,  923948, 1043508,  832192,  801774,  258353,  332654,
       1596897,   29735,  544144, 1581568, 1006198, 1462420,  331708,
        231487,  642804,  986164, 1116095,  329725, 1022566, 1604220,
       1733943,  289406,  631899,  162064,  122983,  247240, 1436280,
        954951, 1445562,  373490, 1568011,  172856,  756588,  184976,
       1534690,  508883, 1310373,  530377, 1457846, 1196256,  321547,
        982423,  892871, 1441266, 1294924, 1685214, 1609228,  576949,
        714524,   56279, 1102089,  314210, 1125638, 1768724, 1734305,
        898836, 1188425,  145332,  688602,  527209, 1647157,   42628,
       1628069, 1722991,  409620,  102416, 1476166, 1734475, 1182614,
        926412,  988767,  137514,  500609,  479970,  165160,  443425,
        302096, 1257293,  823143,  856779, 1030009,  304807, 1164832,
         78262, 1025795, 1133209,  881286, 1620020,  742601,  108125,
        558573, 1107897]
top_carts = [ 485256,   33343,  613493,  152547,  876493, 1406660, 1736857,
        166037, 1460571, 1236775,  554660,  660655, 1531805, 1022566,
        231487,  122983, 1562705,  923948,  332654,  544144,  832192,
        634452,  322370,  836852,   29735, 1006198, 1043508,  801774,
       1462420,  258353,  530377,  558573, 1116095,  756588,  986164,
        329725, 1568011, 1581568, 1025795, 1445562,   42628, 1733943,
        642804, 1604220,  184976,  892871,  162064,  501077, 1182614,
       1647157,  331708, 1310373, 1534690, 1111967, 1476166, 1257293,
         19665, 1768724,  108125, 1502122,  172856,  982423, 1734475,
        409620, 1586171, 1685214, 1662401,  881286, 1164832,  373490,
        637827, 1125638, 1702657, 1281615,  714524,  500609,  508883,
         78262, 1359971, 1102089, 1734305, 1628069, 1436280, 1596897,
        631899, 1030009, 1497089,  988767,  670066,  289406,  450505,
       1196256,  247240,   16151, 1497245, 1689044,  479970, 1609228,
       1853288, 1636724]
top_clicks = [ 485256, 1460571,  108125, 1551213,   33343,  613493,  876493,
        152547,  184976, 1236775, 1406660,  836852,  331708, 1531805,
         29735,  554660,  634452,  986164,  959208,  832192,  332654,
        620545, 1645990,  166037,  102345, 1116095,  811371,  322370,
       1733943, 1043508, 1019736,  756588,  801774,  231487,  861401,
       1502122, 1030009,  329725, 1624436, 1022566, 1006198,  496180,
       1685214, 1853288,  530377,  659399, 1604220, 1102089, 1586171,
       1497089,  754412, 1765072, 1109824, 1264313,  881286, 1294924,
        199409, 1581568,  385065,  195381,  558573, 1196256, 1782099,
        612920,  435253,  923948,  508883, 1126038, 1647563, 1508062,
       1182614,  544144,  247240,  982423,   57315, 1628069,  674590,
       1798916,  892871,  909449,  337471,  721034, 1734305,   95488,
       1722991, 1754984,  819288, 1462420,  836707, 1796103,   42628,
        642804,  122983,  321547, 1278671,  714524, 1603001,  409620,
        636101, 1702657]

In [11]:
print('Reading top20 AIDs from cache')
top_20 = pickle.load(open('../../data/covisit_matrices/top_80_aids_v93.pkl', 'rb')) #time weight. All2all #added time decay

top_20b = pickle.load(open('../../data/covisit_matrices/top_40_aids_v217_0.pkl', 'rb')) # both ways 1 #td did not help
top_20c = pickle.load(open('../../data/covisit_matrices/top_40_aids_v220_0.pkl', 'rb')) # both ways 2 #td did not help
top_20d = pickle.load(open('../../data/covisit_matrices/top_40_aids_v226_0.pkl', 'rb')) # forward 3 buy #td did not help

top_20e = pickle.load(open('../../data/covisit_matrices/top_40_aids_v232_0.pkl', 'rb')) # both ways 3 #added time decay

top_20f = pickle.load(open('../../data/covisit_matrices/top_40_aids_v235_0.pkl', 'rb')) # both ways 6 #time decay
top_20f.update(pickle.load(open('../../data/covisit_matrices/top_40_aids_v235_1.pkl', 'rb'))) # both ways 6 #time decay

top_20_orders = pickle.load(open('../../data/covisit_matrices/top_40_orders_carts_v95.pkl', 'rb')) #added time decay
top_20_carts = top_20_orders

top_20_buy2buy = pickle.load(open('../../data/covisit_matrices/top_40_buy2buy_v90.pkl', 'rb')) #td did not help
top_20_buy2buy2 = pickle.load(open('../../data/covisit_matrices/top_40_buy2buy_v99.pkl', 'rb')) #time decay

top_20_test = pickle.load(open('../../data/covisit_matrices/top_40_aids_v116_0.pkl', 'rb')) #time decay
for k in range(1,3): # recent 3 week time delta decay. all to buy
    top_20_test.update( pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v116_{k}.pkl', 'rb')) )
    
top_20_test2 = pickle.load(open('../../data/covisit_matrices/top_40_aids_v115_0.pkl', 'rb')) #time decay
for k in range(1,3): # recent 2 week time delta decay weight
    top_20_test2.update( pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v115_{k}.pkl', 'rb')) )
    
top_20_buy = pickle.load(open('../../data/covisit_matrices/top_40_aids_v239_0.pkl', 'rb'))

Reading top20 AIDs from cache
CPU times: user 41.4 s, sys: 6.23 s, total: 47.7 s
Wall time: 47.6 s


In [14]:
top_20_new = pickle.load(open('../../data/covisit_matrices/top_40_aids_v700_0.pkl', 'rb'))
top_20_new2 = pickle.load(open('../../data/covisit_matrices/top_40_aids_v701_0.pkl', 'rb'))

In [15]:
%%time
top_40_day = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v155_d0_0.pkl', 'rb')) 
top_40_day2 = pickle.load(open(f'../../data/covisit_matrices/top_80_aids_v157_d0_0.pkl', 'rb'))

CPU times: user 13.6 s, sys: 467 ms, total: 14.1 s
Wall time: 14.1 s


In [16]:
%%time
top_40_less = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v167_0.pkl', 'rb')) 
top_40_more = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v168_0.pkl', 'rb'))

CPU times: user 4.01 s, sys: 1.34 s, total: 5.35 s
Wall time: 5.35 s


In [17]:
%%time
top_40_less2 = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v165_0.pkl', 'rb')) 
top_40_more2 = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v166_0.pkl', 'rb'))

CPU times: user 16.8 s, sys: 858 ms, total: 17.6 s
Wall time: 17.6 s


In [18]:
len( top_20), len( top_20_orders ), len( top_20_buy2buy )

(1812132, 1819763, 1072368)

In [19]:
import itertools

type_weight_multipliers = {0: 1, 1: 6, 2: 3}

def suggest_aids(df):
    
    #aids=df.aid.tolist()
    #types = df.type.tolist()
    
    session = df[0]
    aids = df[1]
    types = df[2]
    tss = df[3]
    ds = df[4]
    ds2 = df[6]
    #days = df[7]
    
    top_day = top_40_day2
    click_aids = click_df[session][:ITEM_CT3]
    
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    
    #df2 = df.sort_values('ts',ascending=False).drop_duplicates('d')
    #aids2 = df2.aid.tolist()
    #unique_aids3 = list(dict.fromkeys(aids2[::-1] )) #last of each session
    unique_aids3 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds2[i] == 1][::-1] ))
    
    #mx = df.d.max()
    #aids2 = df.loc[df.d==mx].aid.tolist()
    #unique_aids4 = list(dict.fromkeys(aids2[::-1] ))
    mx = np.max(ds)
    unique_aids4 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds[i] == mx][::-1] ))
    
    #aids2 = df.loc[df.ts >= mx - 60*60*24].aid.tolist()
    #unique_aids6 = list(dict.fromkeys(aids2[::-1] )) #recent 1 day 
    mx = np.max(tss)
    unique_aids6 = list(dict.fromkeys( [f for i, f in enumerate(aids) if tss[i] >= mx - 60*60*24 ][::-1] ))
    
    #df = df.loc[ df['type'].isin([1,2]) ]
    #unique_buys = list(dict.fromkeys( df.aid.tolist()[::-1] ))
    unique_buys = list(dict.fromkeys( [f for i, f in enumerate(aids) if types[i] in [1, 2]][::-1] ))
    
    ln = len(unique_aids)
 
    if len(unique_aids)>=15:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        aids3 = list(itertools.chain(*[top_20c[aid][:20*2] for aid in unique_aids[:2] if aid in top_20c]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.6
        aids3 = list(itertools.chain(*[top_20b[aid][:15*2] for aid in unique_aids3 if aid in top_20b]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.3
        aids3 = list(itertools.chain(*[top_20_test2[aid][:20*2] for aid in unique_aids[:2] if aid in top_20_test2]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.6
            
        #aids3 = list(itertools.chain(*[top_20[aid][:10] for aid in click_aids[:5] if aid in top_20]))
        #for i,aid in enumerate(aids3):
        #    aids_temp[aid] += 0.3
                
        result = [k for k,v in aids_temp.most_common(ITEM_CT)]
        return session, (result + top_clicks[:ITEM_CT-len(result)])[:ITEM_CT]
        #return sorted_aids 
    
    aids_temp = Counter() 
    
    # NEW
    MM = 4
    aids2 = list(itertools.chain(*[top_day[aid][:10*MM] for aid in unique_aids6 if aid in top_day]))     
    for i,aid in enumerate( aids2 ):
        aids_temp[aid] += 1  
        
    # NEW NEW
    #ln0 = len(click_aids)
    aids2 = list(itertools.chain(*[top_20[aid][:20] for aid in click_aids if aid in top_20]))
    for i,aid in enumerate(aids2):
        aids_temp[aid] += 0.5
        #if i%20==0: aids_temp[aid] += 0.5 
    
    weights3 = [2,2] + [1]*28 
    if len(unique_aids)==1:
        aids5 = list(itertools.chain(*[top_20_new2[aid][:30] for aid in unique_aids[-1:] if aid in top_20_new2]))
        w5 = weights3* int(len(aids5)//30)
        for aid,w in zip(aids5,w5):
            aids_temp[aid] += w
            
    #aids2 = list(itertools.chain(*[top_20[aid][:20*2] for aid in unique_aids if aid in top_20]))
    #for i,aid in enumerate(aids2):
    #    m = 0.1 + 0.9*(ln-(i//(20*2)))/ln
    #    aids_temp[aid] += m
    #    if i%(20*2)==0: aids_temp[aid] += m
         
    # FROM GIBA
    for i, a in enumerate(unique_aids):
        w0 = np.max([1 - (0.35 * i), 0.001]) #Weight aid order starting from the last one. 
        if a in top_20:
            for j, aj in enumerate(top_20[a]):
                w1 = np.max([1 - (0.005 * j), 0.01]) #Weight the candidate aid from the dict
                aids_temp[aj] += (w0*w1)
                            
    aids3 = list(itertools.chain(*[top_20b[aid][:20*2] for aid in unique_aids[:2] if aid in top_20b]))
    for i,aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i%(20*2)==0: aids_temp[aid] += 1
            
    aids3 = list(itertools.chain(*[top_20_test2[aid][:20*2] for aid in unique_aids[:2] if aid in top_20_test2]))
    for i,aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i%(20*2)==0: aids_temp[aid] += 1
            
    # TRY GIBA HERE
    aids4 = list(itertools.chain(*[top_20f[aid][:10*2] for aid in unique_aids4 if aid in top_20f]))
    for i,aid in enumerate(aids4):
        w = i//(10*2)
        aids_temp[aid] += 1 -w*0.1
        if i%(10*2)==0: aids_temp[aid] += 1 -w*0.1
            
    aids5 = list(itertools.chain(*[top_20e[aid][:20*2] for aid in unique_aids3 if aid in top_20e]))
    for i,aid in enumerate(aids5):
        aids_temp[aid] += 1
        if i%(20*2)==0: aids_temp[aid] += 1
    top_aids2 = [k for k,v in aids_temp.most_common(1) if k not in unique_aids]
    
    aids3 = list(itertools.chain(*[top_20c[aid][:10*2] for aid in top_aids2 if aid in top_20c]))
    for i,aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i%(10*2)==0: aids_temp[aid] += 1
    top_aids2 = [k for k,v in aids_temp.most_common(ITEM_CT) if k not in unique_aids]
    
    result = unique_aids + top_aids2[:ITEM_CT - len(unique_aids)]
    return session, (result + top_clicks[:ITEM_CT-len(result)])[:ITEM_CT]

def suggest_orders(df):
    
    session = df[0]
    aids = df[1]
    aids9 = aids.copy()
    types = df[2]
    tss = df[3]
    ds = df[4]
    ds1 = df[5]
    ds2 = df[6]
    days = df[7]
    
    #top_day = top_40_day[ df.day.values[0] ]
    top_day = top_40_day
    #click_aids = click_df[df.session.values[0]][:ITEM_CT2]
    click_aids = click_df[session][:ITEM_CT3]
    
    #aids = df.aid.tolist()
    #types = df.type.tolist()
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    
    #mx = df.d.max()
    #aids2 = df.loc[df.d==mx].aid.tolist()
    #unique_aids4 = list(dict.fromkeys(aids2[::-1] )) # last session
    mx = np.max(ds)
    unique_aids4 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds[i] == mx][::-1] ))
    
    #mx = df.ts.max()
    #aids2 = df.loc[df.ts >= mx - 60*60/2].aid.tolist()
    #unique_aids5 = list(dict.fromkeys(aids2[::-1] )) #recent 1 hour  
    mx = np.max(tss)
    unique_aids5 = list(dict.fromkeys( [f for i, f in enumerate(aids) if tss[i] >= mx - 60*60/2 ][::-1] ))
    
    #aids2 = df.loc[df.ts >= mx - 60*60*24].aid.tolist()
    #unique_aids6 = list(dict.fromkeys(aids2[::-1] )) #recent 1 day 
    unique_aids6 = list(dict.fromkeys( [f for i, f in enumerate(aids) if tss[i] >= mx - 60*60*24 ][::-1] ))
    
    #df2 = df.drop_duplicates('d')
    #aids2 = df2.aid.tolist()
    #unique_aids2 = list(dict.fromkeys(aids2[::-1] )) #first of each session
    unique_aids2 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds1[i] == 1][::-1] ))
    
    #df2 = df.sort_values('ts',ascending=False).drop_duplicates('d')
    #aids2 = df2.aid.tolist()
    #unique_aids3 = list(dict.fromkeys(aids2 )) #last of each session
    unique_aids3 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds2[i] == 1][::-1] ))
    
    #df = df.loc[ df['type'].isin([1,2]) ]
    #unique_buys = list(dict.fromkeys( df.aid.tolist()[::-1] ))
    unique_buys = list(dict.fromkeys( [f for i, f in enumerate(aids) if types[i] in [1, 2]][::-1] ))
    
    if len(unique_aids)>=20:

        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        for aid in unique_aids2: 
            aids_temp[aid] += 0.5
        for aid in unique_aids3: 
            aids_temp[aid] += 0.5
            
        aids3 = list(itertools.chain(*[top_20_buy2buy[aid][:40] for aid in unique_buys if aid in top_20_buy2buy]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.05
            if i%40==0: aids_temp[aid] += 0.05
        aids3 = list(itertools.chain(*[top_20_buy2buy2[aid][:40] for aid in unique_buys if aid in top_20_buy2buy2]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.1
            if i%40==0: aids_temp[aid] += 0.1
                
        aids4 = list(itertools.chain(*[top_20_test[aid][:40] for aid in unique_aids if aid in top_20_test]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i%40==0: aids_temp[aid] += 0.05
        aids5 = list(itertools.chain(*[top_20c[aid][:20] for aid in unique_aids[:1] if aid in top_20c]))
        for i,aid in enumerate(aids5):
            aids_temp[aid] += 0.05
            if i%20==0: aids_temp[aid] += 0.05
        aids6 = list(itertools.chain(*[top_20d[aid][:20] for aid in unique_buys[:1] if aid in top_20d]))
        for i,aid in enumerate(aids6):
            aids_temp[aid] += 0.05
            if i%20==0: aids_temp[aid] += 0.05
                
        aids7 = list(itertools.chain(*[top_20b[aid][:5] for aid in unique_aids3 if aid in top_20b]))
        for i,aid in enumerate(aids7):
            aids_temp[aid] += 0.25
            if i%5==0: aids_temp[aid] += 0.25
        aids7 = list(itertools.chain(*[top_20b[aid][:5] for aid in unique_aids2 if aid in top_20b]))
        for i,aid in enumerate(aids7):
            aids_temp[aid] += 0.125
            if i%5==0: aids_temp[aid] += 0.125
                
           
        aids4 = list(itertools.chain(*[top_day[aid][:40] for aid in unique_aids6 if aid in top_day]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i%40==0: aids_temp[aid] += 0.05
        aids4 = list(itertools.chain(*[top_20_test[aid][:20] for aid in click_aids if aid in top_20_test]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i%20==0: aids_temp[aid] += 0.05
        aids4 = list(itertools.chain(*[top_20_buy[aid][:20] for aid in click_aids if aid in top_20_buy]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i%20==0: aids_temp[aid] += 0.05
        for aid in click_aids:
            aids_temp[aid] += 0.05
            
        # NEW STUFF 
        aids4 = list(itertools.chain(*[top_40_more[aid][:5] for aid in unique_aids if aid in top_40_more]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.03
            if i%5==0: aids_temp[aid] += 0.03
        aids4 = list(itertools.chain(*[top_40_more2[aid][:5] for aid in unique_aids if aid in top_40_more2]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.03
            if i%5==0: aids_temp[aid] += 0.03
        aids4 = list(itertools.chain(*[top_40_less[aid][:5] for aid in unique_aids if aid in top_40_less]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.03
            if i%5==0: aids_temp[aid] += 0.03
        aids4 = list(itertools.chain(*[top_40_less2[aid][:5] for aid in unique_aids if aid in top_40_less2]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.03
            if i%5==0: aids_temp[aid] += 0.03
            
        result = [k for k,v in aids_temp.most_common(ITEM_CT)]
        return session, (result + top_orders[:ITEM_CT-len(result)])[:ITEM_CT]
        #return sorted_aids 
    
    weights = [2,2] + [1]*8 #+ [0]*30
    weights2 = [2,2] + [1]*53 #+ [0]*25
    weights3 = [2,2] + [1]*18 #+ [0]*70
    weights4 = [2,2] + [1]*38 #+ [0]*70
    weights5 = [2,2] + [1]*28 #+ [0]*70
    
    ln = len(unique_aids)
    
    MM = 3
    aids_temp = Counter() 
    aids2 = list(itertools.chain(*[top_20_orders[aid][:10*MM] for aid in unique_aids if aid in top_20_orders]))
    w2 = weights5* int(len(aids2)//(10*MM))
    aids3 = list(itertools.chain(*[top_20_buy2buy[aid][:10*MM] for aid in unique_buys if aid in top_20_buy2buy]))
    w3 = weights5* int(len(aids3)//(10*MM))
    aids4 = list(itertools.chain(*[top_20_test[aid][:10*MM] for aid in unique_aids if aid in top_20_test]))
    w4 = weights5* int(len(aids4)//(10*MM))
    aids5 = list(itertools.chain(*[top_20_buy2buy2[aid][:10*MM] for aid in unique_buys if aid in top_20_buy2buy2]))
    w5 = weights5* int(len(aids5)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
    for i,(aid,w) in enumerate(zip(aids3,w3)):
        aids_temp[aid] += w/2
    for i,(aid,w) in enumerate(zip(aids4,w4)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
    for i,(aid,w) in enumerate(zip(aids5,w5)):
        aids_temp[aid] += w/2
      
    # NEW
    MM = 1
    top_40_use = top_40_more
    aids2 = list(itertools.chain(*[top_40_use[aid][:10*MM] for aid in unique_aids if aid in top_40_use]))
    w2 = weights* int(len(aids2)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
    MM = 1
    top_40_use = top_40_less
    aids2 = list(itertools.chain(*[top_40_use[aid][:10*MM] for aid in unique_aids if aid in top_40_use]))
    w2 = weights* int(len(aids2)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
        
    # NEW
    MM = 1
    top_40_use = top_40_more2
    aids2 = list(itertools.chain(*[top_40_use[aid][:10*MM] for aid in unique_aids if aid in top_40_use]))
    w2 = weights* int(len(aids2)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
    MM = 1
    top_40_use = top_40_less2
    aids2 = list(itertools.chain(*[top_40_use[aid][:10*MM] for aid in unique_aids if aid in top_40_use]))
    w2 = weights* int(len(aids2)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
                      
    MM = 4
    aids2 = list(itertools.chain(*[top_day[aid][:10*MM] for aid in unique_aids6 if aid in top_day]))
    w2 = weights4* int(len(aids2)//(10*MM))        
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += 1 #w*m     
        
    ln0 = len(click_aids)
    aids4 = list(itertools.chain(*[top_20_test[aid][:20] for aid in click_aids if aid in top_20_test]))
    w4 = weights3* int(len(aids4)//(20))
    for i,(aid,w) in enumerate(zip(aids4,w4)):
        m = 0.25 + 0.75*(ln0-(i//(20)))/ln0
        aids_temp[aid] += w*m
    aids4 = list(itertools.chain(*[top_20_buy[aid][:20] for aid in click_aids if aid in top_20_buy]))
    w4 = weights3* int(len(aids4)//(20))
    for i,(aid,w) in enumerate(zip(aids4,w4)):
        m = 0.25 + 0.75*(ln0-(i//(20)))/ln0
        aids_temp[aid] += w*m
    for aid in click_aids:
        aids_temp[aid] += 1
        
    aids5 = list(itertools.chain(*[top_20c[aid][:55] for aid in unique_aids[:1] if aid in top_20c]))
    w5 = weights2* int(len(aids5)//55)
    for aid,w in zip(aids5,w5):
        aids_temp[aid] += w
        
    if len(unique_aids)==1:
        aids5 = list(itertools.chain(*[top_20_new2[aid][:20] for aid in unique_aids[-1:] if aid in top_20_new2]))
        w5 = weights3* int(len(aids5)//20)
        for aid,w in zip(aids5,w5):
            aids_temp[aid] += w
        aids5 = list(itertools.chain(*[top_20_new[aid][:20] for aid in unique_aids[-1:] if aid in top_20_new]))
        w5 = weights3* int(len(aids5)//20)
        for aid,w in zip(aids5,w5):
            aids_temp[aid] += w
        
    aids5 = list(itertools.chain(*[top_20d[aid][:20] for aid in unique_buys[:1] if aid in top_20d]))
    w5 = weights3* int(len(aids5)//20)
    for aid,w in zip(aids5,w5):
        aids_temp[aid] += w
        
    ln2 = len(unique_aids5)
    aids5 = list(itertools.chain(*[top_20_buy[aid][:20] for aid in unique_aids5 if aid in top_20_buy]))
    w5 = weights3* int(len(aids5)//20)
    for aid,w in zip(aids5,w5):
        aids_temp[aid] += 2*w/ln2
        
    aids4 = list(itertools.chain(*[top_20f[aid][:5] for aid in unique_aids4 if aid in top_20f]))
    for i,aid in enumerate(aids4):
        w = i//5
        aids_temp[aid] += 1/2 -w*0.05
        if i%5==0: aids_temp[aid] += 1/2 -w*0.05
    aids5 = list(itertools.chain(*[top_20e[aid][:55] for aid in unique_aids3 if aid in top_20e]))
    w5 = weights2* int(len(aids5)//55)
    for i,(aid,w) in enumerate(zip(aids5,w5)):
        w2 = i//55
        aids_temp[aid] += w -w2*0.1
    aids5 = list(itertools.chain(*[top_20e[aid][:10] for aid in unique_aids2 if aid in top_20e]))
    w5 = weights* int(len(aids5)//10)
    for i,(aid,w) in enumerate(zip(aids5,w5)):
        w2 = i//10
        aids_temp[aid] += w/2. -w2*0.05
                            
    sorted_aids = [k for k,v in aids_temp.most_common(ITEM_CT) if k not in unique_aids]
    
    result = unique_aids + sorted_aids[:ITEM_CT - len(unique_aids)]
    return session, (result + top_orders[:ITEM_CT-len(result)])[:ITEM_CT]

In [20]:
import psutil
N_CORES = psutil.cpu_count()     
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool

N Cores : 40


In [21]:
N_CORES = 20
def df_parallelize_run(func, t_split):
    
    num_cores = np.min([N_CORES, len(t_split)])
    pool = Pool(num_cores)
    df = pool.map(func, t_split)
    pool.close()
    pool.join()
    
    return df

In [22]:
%%time
PIECES = 10
valid_bysession_list = []
for PART in range(PIECES):
    with open(f'../../data/train_data/lists/valid_group_tolist_{PART}_1.pkl', 'rb') as f:
        valid_bysession_list.extend(pickle.load(f))
print(len(valid_bysession_list))

1801251
CPU times: user 36.8 s, sys: 574 ms, total: 37.3 s
Wall time: 37.3 s


In [23]:
import pickle
click_df = pickle.load(open('../../data/candidate_scores/clicks_v620_F.pkl', 'rb'))

In [24]:
%%time
temp = df_parallelize_run(suggest_aids, valid_bysession_list)
val_clicks = pd.Series([f[1]  for f in temp], index=[f[0] for f in temp])

CPU times: user 24.1 s, sys: 14.4 s, total: 38.5 s
Wall time: 3min 3s


In [25]:
%%time
temp = df_parallelize_run(suggest_orders, valid_bysession_list)
val_buys = pd.Series([f[1]  for f in temp], index=[f[0] for f in temp])

CPU times: user 1min 6s, sys: 18.8 s, total: 1min 25s
Wall time: 4min 31s


In [26]:
weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}

valid_labels = pd.read_parquet('../../data/train_data/test_labels.parquet')

def hits(b):
    # b[0] : session id
    # b[1] : ground truth
    # b[2] : aids prediction 
    return b[0], len(set(b[1]).intersection(set(b[2]))), np.clip(len(b[1]), 0, 20)

def otto_metric_piece(values, typ, verbose=True):
    
    c1 = pd.DataFrame(values, columns=["labels"]).reset_index().rename({"index":"session"}, axis=1)
    a = valid_labels.loc[valid_labels['type']==typ].merge(c1, how='left', on=['session'])
    
    a = a.loc[a.labels.notna()]

    b=[[a0, a1, a2] for a0, a1, a2 in zip(a["session"], a["ground_truth"], a["labels"])]
    c = df_parallelize_run(hits, b)
    c = np.array(c)
    
    recall = c[:,1].sum() / c[:,2].sum()
    
    print('{} recall = {:.5f}'.format(typ ,recall))
    
    return recall

def otto_metric(clicks, carts, orders, verbose = True):
    
    score = 0
    score += weights["clicks"] * otto_metric_piece(clicks, "clicks", verbose = verbose)
    score += weights["carts"] * otto_metric_piece(carts, "carts", verbose = verbose)
    score += weights["orders"] * otto_metric_piece(orders, "orders", verbose = verbose)
    
    if verbose:
        print('=============')
        print('Overall Recall = {:.5f}'.format(score))
        print('=============')
    
    return score

In [27]:
#%%time
_ = otto_metric_piece(val_clicks, 'clicks')

#clicks recall = 0.62412 v701

#### recall @100
# clicks recall = 0.67459

clicks recall = 0.67459


In [28]:
%%time
_ = otto_metric_piece(val_buys, 'carts')
_ = otto_metric_piece(val_buys, 'orders')

#carts recall = 0.48626 v708_0
#orders recall = 0.69640

##### recall @100
#carts recall = 0.52667
#orders recall = 0.71981

carts recall = 0.52667
orders recall = 0.71981
CPU times: user 57.6 s, sys: 32.4 s, total: 1min 29s
Wall time: 1min 32s


In [32]:
0.1*0.67459 + 0.3*0.52667 + 0.6*0.71981

0.657346

# Create Submission

In [29]:
pred_df = val_clicks
pred_df_orders = val_buys

In [30]:
clicks_pred_df = pd.DataFrame(pred_df.add_suffix("_clicks"), columns=["labels"]).reset_index()
orders_pred_df = pd.DataFrame(pred_df_orders.add_suffix("_orders"), columns=["labels"]).reset_index()
carts_pred_df = pd.DataFrame(pred_df_orders.add_suffix("_carts"), columns=["labels"]).reset_index()

In [31]:
%%time
pred_df = pd.concat(
    [clicks_pred_df, orders_pred_df, carts_pred_df]
)
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv(f"../../data/candidate_scores/submission_v{VER}{OFFSET}.csv", index=False)

CPU times: user 3min 48s, sys: 5.65 s, total: 3min 54s
Wall time: 3min 55s
