# Covisitations matrices


In [1]:
VALIDA = True

In [3]:
DEBUG = False

LEAK_DATA = True
SAVE_DICT_FORMAT = False

TYPE_WEIGHT_FULLTYPE = {0:1, 1:6, 2:3}
TYPE_WEIGHT_PURCHASE = {0:1, 1:1, 2:1}
TYPE_WEIGHT_CLICK = None # use time weight

DELTA_TS_FULLTYPE = 24
DELTA_TS_CLICK = 24 # hours
DELTA_TS_BUY = 24*7

LENGTH_TAIL_SESSION = 30
TOP_K = 20

MIN_CLICK = 1.0
MIN_CO = 1.0
MIN_TS = 1_659_304_800

# Train max_ts: 1_662_328_651
# Valida max_ts: 1_661_723_996
# Test max_ts: 1_662_328_651
if VALIDA:
    MAX_TS = 1_661_723_996 if LEAK_DATA else 1_661_119_199
else:
    MAX_TS = 1_662_328_791 if LEAK_DATA else 1_661_723_999

CUTOFF_TS = True # if set equals to true, cut off the first week of training set

In [4]:
import pandas as pd 
import numpy as np
import datetime
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

We will use RAPIDS version 21.10.01


In [36]:
print("Starting day [TRAIN]: ", datetime.datetime.fromtimestamp(MIN_TS))
print("Ending day [TRAIN]: ", datetime.datetime.fromtimestamp(MAX_TS), '-', datetime.datetime.fromtimestamp(1661723999)) 

max_ts_first_week = int((1_661_723_999 - 1_659_304_800+1)/4 + 1_659_304_800)
print("Ending day at the first week[TRAIN]: ", datetime.datetime.fromtimestamp(max_ts_first_week))

Starting day [TRAIN]:  2022-07-31 22:00:00
Ending day [TRAIN]:  2022-08-28 21:59:56 - 2022-08-28 21:59:59
Ending day at the first week[TRAIN]:  2022-08-07 22:00:00


In [6]:
%%time
# CACHE FUNCTIONS
def read_file(f):
    return cudf.DataFrame( data_cache[f] )

def read_file_to_cache(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(type_labels).astype('int8')
    return df

# CACHE THE DATA ON CPU BEFORE PROCESSING ON GPU
data_cache = {}
type_labels = {'clicks':0, 'carts':1, 'orders':2}
if LEAK_DATA:
    files = glob.glob('/kaggle/input/otto-chunk-data-inparquet-format/*_parquet/*')
else:
    files = glob.glob('/kaggle/input/otto-chunk-data-inparquet-format/train_parquet/*')

        
for f in tqdm(files): 
    data_cache[f] = read_file_to_cache(f)

# CHUNK PARAMETERS
READ_CT = 5 # sub-chunk
CHUNK = int( np.ceil( len(files)/6 ))
print(f'We will process {len(files)} files, in groups of {READ_CT} and chunks of {CHUNK}.')

  0%|          | 0/146 [00:00<?, ?it/s]

We will process 146 files, in groups of 5 and chunks of 25.
CPU times: user 1min 7s, sys: 11.4 s, total: 1min 19s
Wall time: 1min 25s


In [7]:
suffixes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
def humansize(nbytes):
    i = 0
    while nbytes >= 1024 and i < len(suffixes)-1:
        nbytes /= 1024.
        i += 1
    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])

In [8]:
def building_covisitations(df, TYPE_COO,
                           PART, SIZE,
                           TYPE_WEIGHT=None, DELTA_TS = None, MIN_TS = None, MAX_TS = None):
    
    df = df.loc[df.aid_x != df.aid_y]
    
    if TYPE_COO == "fulltype":
        df = df.loc[(df.ts_x - df.ts_y).abs() < DELTA_TS * 60 * 60]
    
    elif TYPE_COO == 'click':
        df = df.loc[((df.ts_x - df.ts_y).abs() < DELTA_TS * 60 * 60) & \
                    (df.type_y == 0)]
        
        df['wgt'] = ((df.ts_x - MIN_TS)/(MAX_TS - MIN_TS)).astype('float32')
            
    elif TYPE_COO == 'cart':
        df = df.loc[((df.ts_x - df.ts_y).abs()< DELTA_TS_BUY * 60 * 60) & \
                    (df.aid_y != 0)]
        
    elif TYPE_COO == 'purchase':            
        df = df.loc[((df.ts_x - df.ts_y).abs()< DELTA_TS_BUY * 60 * 60) & \
                    (df.aid_y != 0) & (df.aid_y != 1)]
    
    if TYPE_COO != 'click':
        df['wgt'] = df.type_y.map(TYPE_WEIGHT).astype('float32')
        
    df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
    df = df.drop_duplicates(['session', 'aid_x', 'aid_y'])
    
    df = df[['aid_x','aid_y','wgt']].groupby(['aid_x','aid_y']).wgt.sum()
    
    return df

In [9]:
def save_full_data(OUTPUT_NAME, DISK_PIECES):
    print("Saving full_parquet and full_dict...", end='')
    full_parquet = []
    full_dict = dict()
    for part in range(DISK_PIECES):
        sub_parquet = pd.read_parquet(f'/kaggle/working/{OUTPUT_NAME}_{part}.pqt')
        full_parquet.append(sub_parquet)
        if SAVE_DICT_FORMAT:
            full_dict.update(sub_parquet.groupby("aid_x").apply(lambda df: Counter(dict(zip(df.aid_y, df.wgt)))))
        os.remove(f'/kaggle/working/{OUTPUT_NAME}_{part}.pqt')
        if DEBUG:
            break
        del sub_parquet
        gc.collect()
    full_parquet = pd.concat(full_parquet, axis=0, ignore_index=True)
    full_parquet.to_parquet(f'/kaggle/working/{OUTPUT_NAME}_full.pqt')
    if SAVE_DICT_FORMAT:
        with open(f'/kaggle/working/{OUTPUT_NAME}_full.pickle', 'wb') as file:
            pickle.dump(full_dict, file)
    print("Saved!")

    del full_parquet, full_dict
    gc.collect()

In [10]:
def create_covisitation_matrices(TYPE_COO,
                                 TYPE_WEIGHT=None, CUTOFF_TS=False, DELTA_TS=None, 
                                 MIN_TS=None, MAX_TS=None, DISK_PIECES=5):
    
    SIZE = 1.86e6/DISK_PIECES
    OUTPUT_NAME = f"top_{TOP_K}_{TYPE_COO}_{DELTA_TS}hours"
    
    # COMPUTE IN PARTS FOR MEMORY MANGEMENT
    # 146 files -> 6 CHUNK SIZE 25 -> 1 CHUNK: 5 GROUP SIZE 5
    # 1.86e6 aids -> 4 PART

    for PART in range(DISK_PIECES):
        print()
        print('### DISK PART',PART+1)

        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # => OUTER CHUNKS
        for j in range(6):
            a, b = j*CHUNK, min( (j+1)*CHUNK, len(files) )
            print(f'\nProcessing files {a} thru {b-1} in groups of {READ_CT}...', end=' ')

            # => INNER CHUNKS
            for k in range(a,b,READ_CT):

                df_list = [read_file(files[k])]
                for i in range(1,READ_CT): 
                    if k+i<b: df_list.append( read_file(files[k+i]) )
                df = cudf.concat(df_list,ignore_index=True,axis=0)

                if CUTOFF_TS:
                    df = df.loc[df.ts >= max_ts_first_week]
                
                if TYPE_COO == 'cart':
                    df = df.loc[df['type'].isin([0,1])]
                elif TYPE_COO == 'purchase':
                    pass
                    #df = df.loc[df['type'].isin([1,2])]
                
                df = (
                    df
                    .sort_values(['session','ts'],ascending=[True,False])
                    .reset_index(drop=True)
                )

                # USE TAIL OF SESSION
                df['n'] = df.groupby('session').cumcount()
                df = df.loc[df.n<LENGTH_TAIL_SESSION].drop('n',axis=1)
                
                df = df.merge(df,on='session')

                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = building_covisitations(df, TYPE_COO, PART, SIZE,
                                            TYPE_WEIGHT=TYPE_WEIGHT, 
                                            DELTA_TS = DELTA_TS, 
                                            MIN_TS = MIN_TS, MAX_TS = MAX_TS)
                
                # COMBINE INNER CHUNKS
                if k==a: tmp2 = df
                else: tmp2 = tmp2.add(df, fill_value=0)
                print(k,', ',end='')

            # COMBINE OUTER CHUNKS
            if a==0: tmp = tmp2
            else: tmp = tmp.add(tmp2, fill_value=0)
            del tmp2, df
            gc.collect()

            if DEBUG and j >= 1:
                break
        # CONVERT MATRIX TO DICTIONARY
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])

        # SAVE TOP K
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        tmp = (
            tmp.loc[tmp.n<TOP_K]
            .drop('n',axis=1)
            .reset_index(drop=True)
        )

        # SAVE PART TO DISK (convert to pandas first uses less memory)
        print("\nSaving sub_parquet...", f'{OUTPUT_NAME}_{PART}.pqt', end ='')
        tmp.to_pandas().to_parquet(f'/kaggle/working/{OUTPUT_NAME}_{PART}.pqt')
        print("Saved!")
        del tmp
        gc.collect()
        if DEBUG and PART >= 1:
            break
        
    save_full_data(OUTPUT_NAME, DISK_PIECES)

# Co-visitation: co-occurence
- duration < 1 day
- top 20/40
- the last 30/? action per session

In [11]:
%%time
print('------------ FULL_TYPE ------------')
TYPE_COO = 'fulltype'
DELTA_TS = DELTA_TS_FULLTYPE

create_covisitation_matrices(
    TYPE_COO = TYPE_COO, 
    CUTOFF_TS = CUTOFF_TS,
    TYPE_WEIGHT = TYPE_WEIGHT_FULLTYPE,
    DELTA_TS = DELTA_TS_FULLTYPE)

------------ FULL_TYPE ------------

### DISK PART 1

Processing files 0 thru 24 in groups of 5... 

  "When using a sequence of booleans for `ascending`, "


0 , 5 , 10 , 15 , 20 , 
Processing files 25 thru 49 in groups of 5... 25 , 30 , 35 , 40 , 45 , 
Processing files 50 thru 74 in groups of 5... 50 , 55 , 60 , 65 , 70 , 
Processing files 75 thru 99 in groups of 5... 75 , 80 , 85 , 90 , 95 , 
Processing files 100 thru 124 in groups of 5... 100 , 105 , 110 , 115 , 120 , 
Processing files 125 thru 145 in groups of 5... 125 , 130 , 135 , 140 , 145 , 
Saving sub_parquet... top_20_fulltype_24hours_0.pqtSaved!

### DISK PART 2

Processing files 0 thru 24 in groups of 5... 0 , 5 , 10 , 15 , 20 , 
Processing files 25 thru 49 in groups of 5... 25 , 30 , 35 , 40 , 45 , 
Processing files 50 thru 74 in groups of 5... 50 , 55 , 60 , 65 , 70 , 
Processing files 75 thru 99 in groups of 5... 75 , 80 , 85 , 90 , 95 , 
Processing files 100 thru 124 in groups of 5... 100 , 105 , 110 , 115 , 120 , 
Processing files 125 thru 145 in groups of 5... 125 , 130 , 135 , 140 , 145 , 
Saving sub_parquet... top_20_fulltype_24hours_1.pqtSaved!

### DISK PART 3

Process

# Co-visitation: click
- type_x = 0 & type_y = 0
- delta ts = 1 day

In [12]:
%%time
print('------------ CLICK ------------')
TYPE_COO = 'click'
DELTA_TS = DELTA_TS_CLICK

create_covisitation_matrices(
    TYPE_COO = TYPE_COO, 
    CUTOFF_TS = CUTOFF_TS,
    DELTA_TS = DELTA_TS_CLICK,
    MIN_TS = MIN_TS,
    MAX_TS = MAX_TS)

------------ CLICK ------------

### DISK PART 1

Processing files 0 thru 24 in groups of 5... 0 , 5 , 10 , 15 , 20 , 
Processing files 25 thru 49 in groups of 5... 25 , 30 , 35 , 40 , 45 , 
Processing files 50 thru 74 in groups of 5... 50 , 55 , 60 , 65 , 70 , 
Processing files 75 thru 99 in groups of 5... 75 , 80 , 85 , 90 , 95 , 
Processing files 100 thru 124 in groups of 5... 100 , 105 , 110 , 115 , 120 , 
Processing files 125 thru 145 in groups of 5... 125 , 130 , 135 , 140 , 145 , 
Saving sub_parquet... top_20_click_24hours_0.pqtSaved!

### DISK PART 2

Processing files 0 thru 24 in groups of 5... 0 , 5 , 10 , 15 , 20 , 
Processing files 25 thru 49 in groups of 5... 25 , 30 , 35 , 40 , 45 , 
Processing files 50 thru 74 in groups of 5... 50 , 55 , 60 , 65 , 70 , 
Processing files 75 thru 99 in groups of 5... 75 , 80 , 85 , 90 , 95 , 
Processing files 100 thru 124 in groups of 5... 100 , 105 , 110 , 115 , 120 , 
Processing files 125 thru 145 in groups of 5... 125 , 130 , 135 , 140 

# Co-visitation: cart

In [13]:
%%time
print('------------ CART ------------')
TYPE_COO = 'cart'
DELTA_TS = DELTA_TS_BUY

create_covisitation_matrices(
    TYPE_COO = TYPE_COO, 
    CUTOFF_TS = CUTOFF_TS,
    DELTA_TS = DELTA_TS_BUY,
    TYPE_WEIGHT = TYPE_WEIGHT_PURCHASE)

------------ CART ------------

### DISK PART 1

Processing files 0 thru 24 in groups of 5... 0 , 5 , 10 , 15 , 20 , 
Processing files 25 thru 49 in groups of 5... 25 , 30 , 35 , 40 , 45 , 
Processing files 50 thru 74 in groups of 5... 50 , 55 , 60 , 65 , 70 , 
Processing files 75 thru 99 in groups of 5... 75 , 80 , 85 , 90 , 95 , 
Processing files 100 thru 124 in groups of 5... 100 , 105 , 110 , 115 , 120 , 
Processing files 125 thru 145 in groups of 5... 125 , 130 , 135 , 140 , 145 , 
Saving sub_parquet... top_20_cart_168hours_0.pqtSaved!

### DISK PART 2

Processing files 0 thru 24 in groups of 5... 0 , 5 , 10 , 15 , 20 , 
Processing files 25 thru 49 in groups of 5... 25 , 30 , 35 , 40 , 45 , 
Processing files 50 thru 74 in groups of 5... 50 , 55 , 60 , 65 , 70 , 
Processing files 75 thru 99 in groups of 5... 75 , 80 , 85 , 90 , 95 , 
Processing files 100 thru 124 in groups of 5... 100 , 105 , 110 , 115 , 120 , 
Processing files 125 thru 145 in groups of 5... 125 , 130 , 135 , 140 ,

# Co-visitation: purchase
- delta ts = 1 day

In [None]:
%%time
print('------------ PURCHASE ------------')
TYPE_COO = 'purchase'
DELTA_TS = DELTA_TS_BUY

create_covisitation_matrices(
    TYPE_COO = TYPE_COO,
    CUTOFF_TS = CUTOFF_TS,
    DELTA_TS = DELTA_TS_BUY,
    TYPE_WEIGHT = TYPE_WEIGHT_PURCHASE)

------------ PURCHASE ------------

### DISK PART 1

Processing files 0 thru 24 in groups of 5... 0 , 5 , 10 , 15 , 20 , 
Processing files 25 thru 49 in groups of 5... 25 , 30 , 35 , 40 , 45 , 
Processing files 50 thru 74 in groups of 5... 50 , 55 , 60 , 65 , 70 , 
Processing files 75 thru 99 in groups of 5... 75 , 80 , 85 , 90 , 95 , 
Processing files 100 thru 124 in groups of 5... 100 , 105 , 110 , 115 , 120 , 
Processing files 125 thru 145 in groups of 5... 125 , 130 , 135 , 140 , 145 , 
Saving sub_parquet... top_20_purchase_168hours_0.pqtSaved!

### DISK PART 2

Processing files 0 thru 24 in groups of 5... 0 , 5 , 10 , 15 , 20 , 
Processing files 25 thru 49 in groups of 5... 25 , 30 , 35 , 40 , 45 , 
Processing files 50 thru 74 in groups of 5... 50 , 55 , 60 , 65 , 70 , 
Processing files 75 thru 99 in groups of 5... 75 , 80 , 85 , 90 , 95 , 
Processing files 100 thru 124 in groups of 5... 100 , 105 , 110 , 115 , 120 , 
Processing files 125 thru 145 in groups of 5... 125 , 130 , 135

In [28]:
# Fulltype
pd.read_parquet('/kaggle/working/top_20_fulltype_24hours_full.pqt').head()

Unnamed: 0,aid_x,aid_y,wgt
0,0,532042,23.0
1,0,1735605,10.0
2,0,1363081,8.0
3,0,643097,7.0
4,0,1211854,7.0


In [29]:
# Click
pd.read_parquet('/kaggle/working/top_20_click_24hours_full.pqt').head()

Unnamed: 0,aid_x,aid_y,wgt
0,0,532042,5.477554
1,0,643097,3.881037
2,0,706401,2.778403
3,0,1735605,2.75511
4,0,1848174,2.063208


In [31]:
# Cart
pd.read_parquet('/kaggle/working/top_20_cart_168hours_full.pqt').head()

Unnamed: 0,aid_x,aid_y,wgt
0,0,532042,8.0
1,0,643097,7.0
2,0,1735605,5.0
3,0,1848174,5.0
4,0,706401,3.0


In [32]:
# Purchase
pd.read_parquet('/kaggle/working/top_20_purchase_168hours_full.pqt').head()

Unnamed: 0,aid_x,aid_y,wgt
0,0,532042,8.0
1,0,643097,7.0
2,0,1735605,5.0
3,0,1848174,5.0
4,0,706401,3.0
