# OTTO: Co-visitation Matrix

There exist products that are frequently viewed and bought together. Here we leverage this idea by computing a co-visitation matrix of products. It's done in the following way:

1. First we look at all pairs of events within the same session that are close to each other in time (< 1 day). We compute co-visitation matrix $M_{aid1,aid2}$ by counting global number of event pairs for each pair across all sessions.
2. For each $aid1$ we find top 20 most frequent aid2:  `aid2=argsort(M[aid])[-20:]`
3. We produce test results by concatenating `tail(20)` of test session events (see https://www.kaggle.com/code/simamumu/old-test-data-last-20-aid-get-lb0-947) with the most likely recommendations from co-visitation matrix. These recommendations are generated from session AIDs and `aid2` from the step 2


**Please, smash that thumbs up button and subscribe if you like this notebook!**

## Utils, imports

In [1]:
VER = 711
POSTFIX = '_LB'

OFFSET = ''

ITEM_CT = 100
ITEM_CT2 = 100
ITEM_CT3 = 10

In [2]:
### import numpy as np
from collections import defaultdict
import pandas as pd
from tqdm.notebook import tqdm
import glob
import numpy as np, gc
import multiprocessing
import os
import pickle

import glob
from collections import Counter

## Test set inference

In [10]:
# COMPUTED FROM TEST DATA
top_orders = [ 986164, 1460571,  329725, 1043508,  332654,  688602,   29735,
       1495817,  579690, 1022566, 1006198,  471073,  832192,  544144,
       1825743,  836852,  147526, 1236775,  166037, 1030009, 1609228,
        508883,  923948, 1462420,  892871,  554660, 1457846,  258353,
       1734475,  480314,  145332,  108125, 1286213, 1336175, 1359971,
        137514,  714524,  558573,  172856,  585186,  352192, 1176975,
       1146575,  954951, 1496287,  823143, 1699089,   25964, 1257293,
        399315, 1441266, 1196256, 1294924, 1603001, 1274545,  414968,
       1581568,  247240, 1116095,  383437,  530377,  272744, 1445562,
        269257,  791627, 1140985, 1708326,  631899,  670066,  122983,
        223273,  165160,  881286, 1768724,  868327, 1604220,  406358,
       1722991, 1568011, 1025795, 1647563,  835431, 1531805,  714968,
        500609, 1217083, 1668343, 1159757, 1610239, 1647157, 1264313,
       1798916,  423558,  752652,  184976, 1255910, 1413049,  801774,
        615566, 1034578]
top_carts = [ 485256,   33343, 1460571,  986164,  554660,  660655, 1116095,
        152547, 1022566,  544144,  832192,  579690,  329725, 1043508,
       1006198,  558573,  471073,  332654,  688602,   29735,  508883,
        258353, 1736857, 1462420,  166037, 1609228, 1778843,  108125,
       1495817, 1604220, 1825743, 1562705,  147526,  836852, 1286213,
         25964, 1236775,  923948, 1281615, 1257293,  917587,  835431,
       1439409,  892871,  125957,  122983, 1097061, 1449873, 1568011,
       1030727, 1146575, 1731920,  326904, 1196256,  714524, 1768724,
        480314, 1800674, 1662401, 1359971,  455191,  496180,  145332,
        616283, 1708326, 1294924, 1270528,  944778, 1223508,  881286,
        165160,  272744,  670066,  868327, 1734475,  137514,  172856,
       1122221,  442293, 1685214,  823143, 1413049, 1722991, 1647157,
        406358, 1733943,  700995, 1025795,  754412,  530377,  102416,
        184976, 1445562, 1565495, 1019736, 1274545, 1083665,  667563,
       1264313,  563117]
top_clicks = [1460571,  485256,  108125,  986164, 1551213,  754412,  554660,
        832192,  579690,   33343, 1006198,  688602,   29735,  329725,
        184976, 1019736,  496180,  861401,  944778,  659399, 1043508,
       1022566,  811371, 1604220,  836852,  471073,  819288, 1264313,
        508883, 1751274,  620545,  959208,  717965,  332654, 1731920,
        544144,  147526, 1116095, 1294924,  102345, 1645990, 1497089,
        558573,   95488, 1196256,  199409, 1110150, 1146575, 1236775,
        137514, 1030009,  435253, 1800674,  881286, 1609228, 1286213,
        337471,  670066,  831165, 1685214, 1673641,  909449, 1260564,
       1099100,  995962,  612920, 1647563, 1462420, 1741695, 1281615,
       1603001, 1722991,  442293,  206735, 1219503,  166037,  799923,
       1469891,  557072, 1156699,  111891, 1624436, 1782099, 1639229,
        530377, 1197632, 1140985,  152547,  247240, 1449873, 1825743,
        901817, 1420240, 1733943,  542343,  680375,  406358,  147278,
       1627951,  836707]

In [12]:
%%time

top_20 = pickle.load(open('../../data/covisit_matrices/top_40_aids_v104.pkl', 'rb')) #23
for k,v in top_20.items():
    top_20[k] = [np.int32(x) for x in v[:80]]

top_20b = pickle.load(open('../../data/covisit_matrices/top_40_aids_v23.pkl', 'rb')) #23
for k,v in top_20b.items():
    top_20b[k] = [np.int32(x) for x in v[:30]]
top_20c = pickle.load(open('../../data/covisit_matrices/top_80_aids_v24.pkl', 'rb')) #55
for k,v in top_20c.items():
    top_20c[k] = [np.int32(x) for x in v[:55]]
top_20d = pickle.load(open('../../data/covisit_matrices/top_30_aids_v28.pkl', 'rb')) #20
for k,v in top_20d.items():
    top_20d[k] = [np.int32(x) for x in v[:20]]
    
top_20e = pickle.load(open('../../data/covisit_matrices/top_80_aids_v130.pkl', 'rb')) #55 #UPGRADED
for k,v in top_20e.items():
    top_20e[k] = [np.int32(x) for x in v[:55]]
    
top_20f = pickle.load(open('../../data/covisit_matrices/top_80_aids_v132.pkl', 'rb')) #10 #NEW
for k,v in top_20f.items():
    top_20f[k] = [np.int32(x) for x in v[:20]]
    
top_20_test2 = pickle.load(open('../../data/covisit_matrices/top_40_aids_v34.pkl', 'rb')) #23
for k,v in top_20_test2.items():
    top_20_test2[k] = [np.int32(x) for x in v[:40]]

CPU times: user 5min 30s, sys: 13.7 s, total: 5min 44s
Wall time: 5min 44s


In [13]:
%%time
top_20_test2 = pickle.load(open('../../data/covisit_matrices/top_40_aids_v34.pkl', 'rb')) #23
for k,v in top_20_test2.items():
    top_20_test2[k] = [np.int32(x) for x in v[:40]]

CPU times: user 1min 4s, sys: 1.14 s, total: 1min 5s
Wall time: 1min 5s


In [14]:
%%time
top_20_new2 = pickle.load(open('../../data/covisit_matrices/top_40_aids_v801_0.pkl', 'rb'))
for k,v in top_20_new2.items():
    top_20_new2[k] = [np.int32(x) for x in v[:30]]

CPU times: user 18.5 s, sys: 12.2 ms, total: 18.5 s
Wall time: 18.5 s


In [15]:
%%time
top_40_day = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v162_d0_0_LB.pkl', 'rb'))
for k,v in top_40_day.items():
    top_40_day[k] = [np.int32(x) for x in v[:40]]

CPU times: user 1.83 s, sys: 19.8 ms, total: 1.85 s
Wall time: 1.85 s


In [16]:
%%time
top_40_day2 = pickle.load(open(f'../../data/covisit_matrices/top_80_aids_v163_d0_0_LB.pkl', 'rb'))
for k,v in top_40_day2.items():
    top_40_day2[k] = [np.int32(x) for x in v[:40]]

CPU times: user 8.33 s, sys: 27.9 ms, total: 8.36 s
Wall time: 8.37 s


In [17]:
len( top_20) #, len( top_20_orders ), len( top_20_buy2buy )

1837262

In [18]:
import itertools

type_weight_multipliers = {0: 1, 1: 6, 2: 3}

def suggest_aids(df):
    
    #aids=df.aid.tolist()
    #types = df.type.tolist()
    
    session = df[0]
    aids = df[1]
    types = df[2]
    tss = df[3]
    ds = df[4]
    ds2 = df[6]
    #days = df[7]
    
    top_day = top_40_day2
    click_aids = click_df[session][:ITEM_CT3]
    
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    
    #df2 = df.sort_values('ts',ascending=False).drop_duplicates('d')
    #aids2 = df2.aid.tolist()
    #unique_aids3 = list(dict.fromkeys(aids2[::-1] )) #last of each session
    unique_aids3 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds2[i] == 1][::-1] ))
    
    #mx = df.d.max()
    #aids2 = df.loc[df.d==mx].aid.tolist()
    #unique_aids4 = list(dict.fromkeys(aids2[::-1] ))
    mx = np.max(ds)
    unique_aids4 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds[i] == mx][::-1] ))
    
    #aids2 = df.loc[df.ts >= mx - 60*60*24].aid.tolist()
    #unique_aids6 = list(dict.fromkeys(aids2[::-1] )) #recent 1 day 
    mx = np.max(tss)
    unique_aids6 = list(dict.fromkeys( [f for i, f in enumerate(aids) if tss[i] >= mx - 60*60*24 ][::-1] ))
    
    #df = df.loc[ df['type'].isin([1,2]) ]
    #unique_buys = list(dict.fromkeys( df.aid.tolist()[::-1] ))
    unique_buys = list(dict.fromkeys( [f for i, f in enumerate(aids) if types[i] in [1, 2]][::-1] ))
    
    ln = len(unique_aids)
 
    if len(unique_aids)>=15:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        aids3 = list(itertools.chain(*[top_20c[aid][:20*2] for aid in unique_aids[:2] if aid in top_20c]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.6
        aids3 = list(itertools.chain(*[top_20b[aid][:15*2] for aid in unique_aids3 if aid in top_20b]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.3
        aids3 = list(itertools.chain(*[top_20_test2[aid][:20*2] for aid in unique_aids[:2] if aid in top_20_test2]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.6
            
        #aids3 = list(itertools.chain(*[top_20[aid][:10] for aid in click_aids[:5] if aid in top_20]))
        #for i,aid in enumerate(aids3):
        #    aids_temp[aid] += 0.3
                
        result = [k for k,v in aids_temp.most_common(ITEM_CT)]
        return session, (result + top_clicks[:ITEM_CT-len(result)])[:ITEM_CT]
        #return sorted_aids 
    
    aids_temp = Counter() 
    
    # NEW
    MM = 4
    aids2 = list(itertools.chain(*[top_day[aid][:10*MM] for aid in unique_aids6 if aid in top_day]))     
    for i,aid in enumerate( aids2 ):
        aids_temp[aid] += 1  
        
    # NEW NEW
    #ln0 = len(click_aids)
    aids2 = list(itertools.chain(*[top_20[aid][:20] for aid in click_aids if aid in top_20]))
    for i,aid in enumerate(aids2):
        aids_temp[aid] += 0.5
        #if i%20==0: aids_temp[aid] += 0.5 
    
    weights3 = [2,2] + [1]*28 
    if len(unique_aids)==1:
        aids5 = list(itertools.chain(*[top_20_new2[aid][:30] for aid in unique_aids[-1:] if aid in top_20_new2]))
        w5 = weights3* int(len(aids5)//30)
        for aid,w in zip(aids5,w5):
            aids_temp[aid] += w
            
    #aids2 = list(itertools.chain(*[top_20[aid][:20*2] for aid in unique_aids if aid in top_20]))
    #for i,aid in enumerate(aids2):
    #    m = 0.1 + 0.9*(ln-(i//(20*2)))/ln
    #    aids_temp[aid] += m
    #    if i%(20*2)==0: aids_temp[aid] += m
         
    # FROM GIBA
    for i, a in enumerate(unique_aids):
        w0 = np.max([1 - (0.35 * i), 0.001]) #Weight aid order starting from the last one. 
        if a in top_20:
            for j, aj in enumerate(top_20[a]):
                w1 = np.max([1 - (0.005 * j), 0.01]) #Weight the candidate aid from the dict
                aids_temp[aj] += (w0*w1)
                            
    aids3 = list(itertools.chain(*[top_20b[aid][:20*2] for aid in unique_aids[:2] if aid in top_20b]))
    for i,aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i%(20*2)==0: aids_temp[aid] += 1
            
    aids3 = list(itertools.chain(*[top_20_test2[aid][:20*2] for aid in unique_aids[:2] if aid in top_20_test2]))
    for i,aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i%(20*2)==0: aids_temp[aid] += 1
            
    # TRY GIBA HERE
    aids4 = list(itertools.chain(*[top_20f[aid][:10*2] for aid in unique_aids4 if aid in top_20f]))
    for i,aid in enumerate(aids4):
        w = i//(10*2)
        aids_temp[aid] += 1 -w*0.1
        if i%(10*2)==0: aids_temp[aid] += 1 -w*0.1
            
    aids5 = list(itertools.chain(*[top_20e[aid][:20*2] for aid in unique_aids3 if aid in top_20e]))
    for i,aid in enumerate(aids5):
        aids_temp[aid] += 1
        if i%(20*2)==0: aids_temp[aid] += 1
    top_aids2 = [k for k,v in aids_temp.most_common(1) if k not in unique_aids]
    
    aids3 = list(itertools.chain(*[top_20c[aid][:10*2] for aid in top_aids2 if aid in top_20c]))
    for i,aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i%(10*2)==0: aids_temp[aid] += 1
    top_aids2 = [k for k,v in aids_temp.most_common(ITEM_CT) if k not in unique_aids]
    
    result = unique_aids + top_aids2[:ITEM_CT - len(unique_aids)]
    return session, (result + top_clicks[:ITEM_CT-len(result)])[:ITEM_CT]

def suggest_orders(df):
    
    session = df[0]
    aids = df[1]
    aids9 = aids.copy()
    types = df[2]
    tss = df[3]
    ds = df[4]
    ds1 = df[5]
    ds2 = df[6]
    days = df[7]
    
    #top_day = top_40_day[ df.day.values[0] ]
    top_day = top_40_day
    #click_aids = click_df[df.session.values[0]][:ITEM_CT2]
    click_aids = click_df[session][:ITEM_CT3]
    
    #aids = df.aid.tolist()
    #types = df.type.tolist()
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    
    #mx = df.d.max()
    #aids2 = df.loc[df.d==mx].aid.tolist()
    #unique_aids4 = list(dict.fromkeys(aids2[::-1] )) # last session
    mx = np.max(ds)
    unique_aids4 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds[i] == mx][::-1] ))
    
    #mx = df.ts.max()
    #aids2 = df.loc[df.ts >= mx - 60*60/2].aid.tolist()
    #unique_aids5 = list(dict.fromkeys(aids2[::-1] )) #recent 1 hour  
    mx = np.max(tss)
    unique_aids5 = list(dict.fromkeys( [f for i, f in enumerate(aids) if tss[i] >= mx - 60*60/2 ][::-1] ))
    
    #aids2 = df.loc[df.ts >= mx - 60*60*24].aid.tolist()
    #unique_aids6 = list(dict.fromkeys(aids2[::-1] )) #recent 1 day 
    unique_aids6 = list(dict.fromkeys( [f for i, f in enumerate(aids) if tss[i] >= mx - 60*60*24 ][::-1] ))
    
    #df2 = df.drop_duplicates('d')
    #aids2 = df2.aid.tolist()
    #unique_aids2 = list(dict.fromkeys(aids2[::-1] )) #first of each session
    unique_aids2 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds1[i] == 1][::-1] ))
    
    #df2 = df.sort_values('ts',ascending=False).drop_duplicates('d')
    #aids2 = df2.aid.tolist()
    #unique_aids3 = list(dict.fromkeys(aids2 )) #last of each session
    unique_aids3 = list(dict.fromkeys( [f for i, f in enumerate(aids) if ds2[i] == 1][::-1] ))
    
    #df = df.loc[ df['type'].isin([1,2]) ]
    #unique_buys = list(dict.fromkeys( df.aid.tolist()[::-1] ))
    unique_buys = list(dict.fromkeys( [f for i, f in enumerate(aids) if types[i] in [1, 2]][::-1] ))
    
    if len(unique_aids)>=20:

        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        for aid in unique_aids2: 
            aids_temp[aid] += 0.5
        for aid in unique_aids3: 
            aids_temp[aid] += 0.5
            
        aids3 = list(itertools.chain(*[top_20_buy2buy[aid][:40] for aid in unique_buys if aid in top_20_buy2buy]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.05
            if i%40==0: aids_temp[aid] += 0.05
        aids3 = list(itertools.chain(*[top_20_buy2buy2[aid][:40] for aid in unique_buys if aid in top_20_buy2buy2]))
        for i,aid in enumerate(aids3):
            aids_temp[aid] += 0.1
            if i%40==0: aids_temp[aid] += 0.1
                
        aids4 = list(itertools.chain(*[top_20_test[aid][:40] for aid in unique_aids if aid in top_20_test]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i%40==0: aids_temp[aid] += 0.05
        aids5 = list(itertools.chain(*[top_20c[aid][:20] for aid in unique_aids[:1] if aid in top_20c]))
        for i,aid in enumerate(aids5):
            aids_temp[aid] += 0.05
            if i%20==0: aids_temp[aid] += 0.05
        aids6 = list(itertools.chain(*[top_20d[aid][:20] for aid in unique_buys[:1] if aid in top_20d]))
        for i,aid in enumerate(aids6):
            aids_temp[aid] += 0.05
            if i%20==0: aids_temp[aid] += 0.05
                
        aids7 = list(itertools.chain(*[top_20b[aid][:5] for aid in unique_aids3 if aid in top_20b]))
        for i,aid in enumerate(aids7):
            aids_temp[aid] += 0.25
            if i%5==0: aids_temp[aid] += 0.25
        aids7 = list(itertools.chain(*[top_20b[aid][:5] for aid in unique_aids2 if aid in top_20b]))
        for i,aid in enumerate(aids7):
            aids_temp[aid] += 0.125
            if i%5==0: aids_temp[aid] += 0.125
                
           
        aids4 = list(itertools.chain(*[top_day[aid][:40] for aid in unique_aids6 if aid in top_day]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i%40==0: aids_temp[aid] += 0.05
        aids4 = list(itertools.chain(*[top_20_test[aid][:20] for aid in click_aids if aid in top_20_test]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i%20==0: aids_temp[aid] += 0.05
        aids4 = list(itertools.chain(*[top_20_buy[aid][:20] for aid in click_aids if aid in top_20_buy]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i%20==0: aids_temp[aid] += 0.05
        for aid in click_aids:
            aids_temp[aid] += 0.05
            
        # NEW STUFF 
        aids4 = list(itertools.chain(*[top_40_more[aid][:5] for aid in unique_aids if aid in top_40_more]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.03
            if i%5==0: aids_temp[aid] += 0.03
        aids4 = list(itertools.chain(*[top_40_more2[aid][:5] for aid in unique_aids if aid in top_40_more2]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.03
            if i%5==0: aids_temp[aid] += 0.03
        aids4 = list(itertools.chain(*[top_40_less[aid][:5] for aid in unique_aids if aid in top_40_less]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.03
            if i%5==0: aids_temp[aid] += 0.03
        aids4 = list(itertools.chain(*[top_40_less2[aid][:5] for aid in unique_aids if aid in top_40_less2]))
        for i,aid in enumerate(aids4):
            aids_temp[aid] += 0.03
            if i%5==0: aids_temp[aid] += 0.03
            
        result = [k for k,v in aids_temp.most_common(ITEM_CT)]
        return session, (result + top_orders[:ITEM_CT-len(result)])[:ITEM_CT]
        #return sorted_aids 
    
    weights = [2,2] + [1]*8 #+ [0]*30
    weights2 = [2,2] + [1]*53 #+ [0]*25
    weights3 = [2,2] + [1]*18 #+ [0]*70
    weights4 = [2,2] + [1]*38 #+ [0]*70
    weights5 = [2,2] + [1]*28 #+ [0]*70
    
    ln = len(unique_aids)
    
    MM = 3
    aids_temp = Counter() 
    aids2 = list(itertools.chain(*[top_20_orders[aid][:10*MM] for aid in unique_aids if aid in top_20_orders]))
    w2 = weights5* int(len(aids2)//(10*MM))
    aids3 = list(itertools.chain(*[top_20_buy2buy[aid][:10*MM] for aid in unique_buys if aid in top_20_buy2buy]))
    w3 = weights5* int(len(aids3)//(10*MM))
    aids4 = list(itertools.chain(*[top_20_test[aid][:10*MM] for aid in unique_aids if aid in top_20_test]))
    w4 = weights5* int(len(aids4)//(10*MM))
    aids5 = list(itertools.chain(*[top_20_buy2buy2[aid][:10*MM] for aid in unique_buys if aid in top_20_buy2buy2]))
    w5 = weights5* int(len(aids5)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
    for i,(aid,w) in enumerate(zip(aids3,w3)):
        aids_temp[aid] += w/2
    for i,(aid,w) in enumerate(zip(aids4,w4)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
    for i,(aid,w) in enumerate(zip(aids5,w5)):
        aids_temp[aid] += w/2
      
    # NEW
    MM = 1
    top_40_use = top_40_more
    aids2 = list(itertools.chain(*[top_40_use[aid][:10*MM] for aid in unique_aids if aid in top_40_use]))
    w2 = weights* int(len(aids2)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
    MM = 1
    top_40_use = top_40_less
    aids2 = list(itertools.chain(*[top_40_use[aid][:10*MM] for aid in unique_aids if aid in top_40_use]))
    w2 = weights* int(len(aids2)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
        
    # NEW
    MM = 1
    top_40_use = top_40_more2
    aids2 = list(itertools.chain(*[top_40_use[aid][:10*MM] for aid in unique_aids if aid in top_40_use]))
    w2 = weights* int(len(aids2)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
    MM = 1
    top_40_use = top_40_less2
    aids2 = list(itertools.chain(*[top_40_use[aid][:10*MM] for aid in unique_aids if aid in top_40_use]))
    w2 = weights* int(len(aids2)//(10*MM))
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += w*m
                      
    MM = 4
    aids2 = list(itertools.chain(*[top_day[aid][:10*MM] for aid in unique_aids6 if aid in top_day]))
    w2 = weights4* int(len(aids2)//(10*MM))        
    for i,(aid,w) in enumerate(zip(aids2,w2)):
        m = 0.25 + 0.75*(ln-(i//(10*MM)))/ln
        aids_temp[aid] += 1 #w*m     
        
    ln0 = len(click_aids)
    aids4 = list(itertools.chain(*[top_20_test[aid][:20] for aid in click_aids if aid in top_20_test]))
    w4 = weights3* int(len(aids4)//(20))
    for i,(aid,w) in enumerate(zip(aids4,w4)):
        m = 0.25 + 0.75*(ln0-(i//(20)))/ln0
        aids_temp[aid] += w*m
    aids4 = list(itertools.chain(*[top_20_buy[aid][:20] for aid in click_aids if aid in top_20_buy]))
    w4 = weights3* int(len(aids4)//(20))
    for i,(aid,w) in enumerate(zip(aids4,w4)):
        m = 0.25 + 0.75*(ln0-(i//(20)))/ln0
        aids_temp[aid] += w*m
    for aid in click_aids:
        aids_temp[aid] += 1
        
    aids5 = list(itertools.chain(*[top_20c[aid][:55] for aid in unique_aids[:1] if aid in top_20c]))
    w5 = weights2* int(len(aids5)//55)
    for aid,w in zip(aids5,w5):
        aids_temp[aid] += w
        
    if len(unique_aids)==1:
        aids5 = list(itertools.chain(*[top_20_new2[aid][:20] for aid in unique_aids[-1:] if aid in top_20_new2]))
        w5 = weights3* int(len(aids5)//20)
        for aid,w in zip(aids5,w5):
            aids_temp[aid] += w
        aids5 = list(itertools.chain(*[top_20_new[aid][:20] for aid in unique_aids[-1:] if aid in top_20_new]))
        w5 = weights3* int(len(aids5)//20)
        for aid,w in zip(aids5,w5):
            aids_temp[aid] += w
        
    aids5 = list(itertools.chain(*[top_20d[aid][:20] for aid in unique_buys[:1] if aid in top_20d]))
    w5 = weights3* int(len(aids5)//20)
    for aid,w in zip(aids5,w5):
        aids_temp[aid] += w
        
    ln2 = len(unique_aids5)
    aids5 = list(itertools.chain(*[top_20_buy[aid][:20] for aid in unique_aids5 if aid in top_20_buy]))
    w5 = weights3* int(len(aids5)//20)
    for aid,w in zip(aids5,w5):
        aids_temp[aid] += 2*w/ln2
        
    aids4 = list(itertools.chain(*[top_20f[aid][:5] for aid in unique_aids4 if aid in top_20f]))
    for i,aid in enumerate(aids4):
        w = i//5
        aids_temp[aid] += 1/2 -w*0.05
        if i%5==0: aids_temp[aid] += 1/2 -w*0.05
    aids5 = list(itertools.chain(*[top_20e[aid][:55] for aid in unique_aids3 if aid in top_20e]))
    w5 = weights2* int(len(aids5)//55)
    for i,(aid,w) in enumerate(zip(aids5,w5)):
        w2 = i//55
        aids_temp[aid] += w -w2*0.1
    aids5 = list(itertools.chain(*[top_20e[aid][:10] for aid in unique_aids2 if aid in top_20e]))
    w5 = weights* int(len(aids5)//10)
    for i,(aid,w) in enumerate(zip(aids5,w5)):
        w2 = i//10
        aids_temp[aid] += w/2. -w2*0.05
                            
    sorted_aids = [k for k,v in aids_temp.most_common(ITEM_CT) if k not in unique_aids]
    
    result = unique_aids + sorted_aids[:ITEM_CT - len(unique_aids)]
    return session, (result + top_orders[:ITEM_CT-len(result)])[:ITEM_CT]

In [19]:
import psutil
N_CORES = psutil.cpu_count()     
print(f"N Cores : {N_CORES}")
from multiprocessing import Pool

N Cores : 40


In [20]:
N_CORES = 20
def df_parallelize_run(func, t_split):
    
    num_cores = np.min([N_CORES, len(t_split)])
    pool = Pool(num_cores)
    df = pool.map(func, t_split)
    pool.close()
    pool.join()
    
    return df

In [21]:
%%time
PIECES = 10
valid_bysession_list = []
for PART in range(PIECES):
    with open(f'../../data/infer_data/lists/test_group_tolist_{PART}_1.pkl', 'rb') as f:
        valid_bysession_list.extend(pickle.load(f))
print(len(valid_bysession_list))

1671803
CPU times: user 50.2 s, sys: 140 ms, total: 50.4 s
Wall time: 50.3 s


In [22]:
%%time
import pickle
click_df = pickle.load(open('../../data/candidate_scores/clicks_v620_LB.pkl', 'rb'))

CPU times: user 17.1 s, sys: 1.72 s, total: 18.9 s
Wall time: 18.9 s


In [23]:
%%time
temp = df_parallelize_run(suggest_aids, valid_bysession_list)
val_clicks = pd.Series([f[1]  for f in temp], index=[f[0] for f in temp])

CPU times: user 59.7 s, sys: 14.7 s, total: 1min 14s
Wall time: 3min


In [24]:
del top_20
gc.collect()

0

In [25]:
%%time
top_20_orders = pickle.load(open('../../data/covisit_matrices/top_40_orders_carts_v118.pkl', 'rb')) #13 #UPGRADED
for k,v in top_20_orders.items():
    top_20_orders[k] = [np.int32(x) for x in v[:30]]
top_20_carts = top_20_orders 

top_20_buy2buy = pickle.load(open('../../data/covisit_matrices/top_40_buy2buy_v16.pkl', 'rb'))
for k,v in top_20_buy2buy.items():
    top_20_buy2buy[k] = [np.int32(x) for x in v]
    
top_20_buy2buy2 = pickle.load(open('../../data/covisit_matrices/top_40_buy2buy_v119.pkl', 'rb')) #NEW
for k,v in top_20_buy2buy2.items():
    top_20_buy2buy2[k] = [np.int32(x) for x in v]
    
top_20_test = pickle.load(open('../../data/covisit_matrices/top_40_aids_v35.pkl', 'rb'))
for k,v in top_20_test.items():
    top_20_test[k] = [np.int32(x) for x in v]

top_20_buy = pickle.load(open('../../data/covisit_matrices/top_20_aids_v37.pkl', 'rb'))
for k,v in top_20_buy.items():
    top_20_buy[k] = [np.int32(x) for x in v[:20]]
    
top_20_new = pickle.load(open('../../data/covisit_matrices/top_40_aids_v800_0.pkl', 'rb'))
for k,v in top_20_new.items():
    top_20_new[k] = [np.int32(x) for x in v[:20]]
    
gc.collect()

CPU times: user 3min 1s, sys: 3.87 s, total: 3min 5s
Wall time: 3min 5s


0

In [30]:
%%time
top_40_less = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v178_0.pkl', 'rb')) 
for k,v in top_40_less.items(): top_40_less[k] = [np.int32(x) for x in v[:10]]
    
top_40_more = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v179_0.pkl', 'rb'))
top_40_more.update( pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v179_1.pkl', 'rb')) )
for k,v in top_40_more.items(): top_40_more[k] = [np.int32(x) for x in v[:10]]

CPU times: user 11.1 s, sys: 1.04 s, total: 12.1 s
Wall time: 12.1 s


In [31]:
%%time
top_40_less2 = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v180_0.pkl', 'rb')) 
for k,v in top_40_less2.items(): top_40_less2[k] = [np.int32(x) for x in v[:10]]
    
top_40_more2 = pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v181_0.pkl', 'rb'))
top_40_more2.update( pickle.load(open(f'../../data/covisit_matrices/top_40_aids_v181_1.pkl', 'rb')) )
for k,v in top_40_more2.items(): top_40_more2[k] = [np.int32(x) for x in v[:10]]

CPU times: user 48.2 s, sys: 4.71 s, total: 52.9 s
Wall time: 53.7 s


In [32]:
%%time
temp = df_parallelize_run(suggest_orders, valid_bysession_list)
val_buys = pd.Series([f[1]  for f in temp], index=[f[0] for f in temp])

CPU times: user 1min 3s, sys: 19.3 s, total: 1min 22s
Wall time: 2min 47s


# Create Submission File

In [33]:
pred_df = val_clicks
pred_df_orders = val_buys

In [34]:
clicks_pred_df = pd.DataFrame(pred_df.add_suffix("_clicks"), columns=["labels"]).reset_index()
orders_pred_df = pd.DataFrame(pred_df_orders.add_suffix("_orders"), columns=["labels"]).reset_index()
carts_pred_df = pd.DataFrame(pred_df_orders.add_suffix("_carts"), columns=["labels"]).reset_index()

In [35]:
%%time
pred_df = pd.concat(
    [clicks_pred_df, orders_pred_df, carts_pred_df]
)
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv(f"../../data/candidate_scores/submission_v{VER}{OFFSET}{POSTFIX}.csv", index=False)

CPU times: user 5min 48s, sys: 4.37 s, total: 5min 52s
Wall time: 5min 53s
