In [1]:
import os

# Expose only CUDA device 0 to this process; set here, before cudf is imported.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [2]:
# Dependency: pandarallel. If it is missing, install it into the kernel's environment with: %pip install pandarallel

In [3]:
# Result directories under ./hpo_results, keyed by a running index 0..4
# (the index is simply the position in the list below).
model_dicts = dict(enumerate([
    './hpo_results/hpo4/29422980-5938-4f70-9030-d5ce69d08da3/',
    './hpo_results/hpo5/1e45f568-fced-4da9-a54b-109756358ce7/',
    './hpo_results/hpo5/c98d747e-b09f-4a15-8004-0cd1a138b093/',
    './hpo_results/hpo5/ffe7024d-64b6-4489-8542-3fba3063d27d/',
    './hpo_results/hpo5/3a62c807-971e-4517-9dbb-f67f296873e6/',
]))

In [4]:
import cudf
import pandas as pd
import numpy as np

import glob
import gc
import os

from pandarallel import pandarallel
# Configure pandarallel for the `parallel_apply` calls used later in this notebook.
pandarallel.initialize(
    nb_workers=20,
    use_memory_fs=True,
    progress_bar=False,
)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
from numpy import dot
from numpy.linalg import norm

def calc_cosine(a,b):
    """Return the cosine similarity of two numeric vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length (the callers in this notebook pass two
        embedding rows).

    Returns
    -------
    float
        ``dot(a, b) / (norm(a) * norm(b))``. Returns ``0.0`` when either
        argument is ``None`` or when either vector has zero norm, so the
        result is never NaN and no divide-by-zero warning is emitted.
    """
    # The previous guard (`isinstance(a, dict)` followed by `a in emb.keys()`)
    # could never succeed: a dict is unhashable, so the membership test raised
    # TypeError, and it also depended on a global `emb`. Missing keys have to
    # be handled by the caller before the vectors are looked up.
    if a is None or b is None:
        return 0.0

    denom = norm(a) * norm(b)
    if denom == 0:
        # At least one all-zero (or empty) vector: similarity is undefined,
        # report "no similarity" instead of NaN.
        return 0.0

    return dot(a, b) / denom

In [7]:
import pickle

In [8]:
# Lookup table used to replace the event-type name ('type') by an integer
# code ('type_'): clicks -> 0, carts -> 1, orders -> 2.
df_type = cudf.DataFrame(
    {
        'type': ['clicks', 'carts', 'orders'],
        'type_': list(range(3)),
    }
)

In [9]:
# Build `df_session_ts`: one row per (session, aid) event of the training
# input, carrying the per-event weights/flags that the similarity loop joins
# onto the candidate chunks.
df = cudf.read_parquet('./data/xgb_train_x.parquet')
# Attach the integer type code ('type_') from the df_type lookup table.
df = df.merge(
    df_type,
    how='left',
    on='type'
)
df['session'] = df['session'].astype('int32')
df['aid'] = df['aid'].astype('int32')
# presumably converts millisecond timestamps to seconds - TODO confirm unit of raw ts
df.ts = (df.ts/1000).astype('int32')
# Replace the string 'type' column by its integer code.
df.drop(['type'], axis=1, inplace=True)
df = df.rename(columns={'type_': 'type'})
# session_len: number of event rows in the session.
df = df.merge(
    df[['session']].groupby(['session']).size().reset_index().rename(columns={0: 'session_len'}),
    how='left',
    on='session'
)
# ts_max: largest timestamp in the session.
df = df.merge(
    df[['session', 'ts']].groupby(['session']).max().reset_index().rename(columns={'ts': 'ts_max'}),
    how='left',
    on='session'
)
# rank: running count within each session after sorting by ts descending,
# so rank 1 is a row with the session's largest ts.
df = df.sort_values(['session', 'ts'], ascending=[True, False])
df['dummy'] = 1
df['rank'] = df.groupby(['session']).dummy.cumsum()
df.drop(['dummy'], axis=1, inplace=True)
# wgt_1 / wgt_2: linear ramp in (rank-1)/session_len, which lies in [0, 1).
# The ramp starts at min_val for rank 1 and grows towards max_val for higher ranks.
# NOTE(review): this gives the most recent event the smallest weight -
# confirm that this direction is intended.
min_val = (2 ** 0.1-1)
max_val = (2 ** 1-1)
df['wgt_1'] = (min_val+(max_val-min_val)*(df['rank']-1)/(df['session_len']))
min_val = (2 ** 0.5-1)
max_val = (2 ** 1-1)
df['wgt_2'] = (min_val+(max_val-min_val)*(df['rank']-1)/(df['session_len']))
# d: index of the "sub-session" within a session. Events are ordered by ts,
# a gap larger than 60*60*2 (two hours, if ts is in seconds) starts a new
# sub-session, and the cumulative sum of those gap flags numbers them.
df = df.sort_values(["session", "ts"])
df['d'] = df.groupby('session').ts.diff()
# Null results (presumably the first event of each session, whose diff is null) become 0.
df.d = (df.d > 60*60*2).astype('int16').fillna(0)
df.d = df.groupby('session').d.cumsum()
# ts_max_2: largest timestamp within each (session, d) sub-session.
# NOTE(review): ts_max_2 is dropped below without being read in between -
# this merge looks unused.
df = df.merge(
    df.groupby(['session', 'd']).ts.max().reset_index().rename(columns={'ts': 'ts_max_2'}),
    how='left',
    on=['session', 'd']
)
# type_w: True for type codes 1 and 2 (carts and orders in df_type).
df['type_w'] = df['type'].isin([1,2])
# last: True for events whose ts equals the session's largest ts.
df['last'] = df['ts']==df['ts_max']
# Drop the helper columns; wgt_1, wgt_2, d, type_w and last are kept
# alongside whatever other columns the parquet file provided.
df.drop(['ts', 'type', 'session_len', 'ts_max', 'rank', 'ts_max_2'], axis=1, inplace=True)
# Keep an independent copy, because `df` is reused as a scratch name by later cells.
df_session_ts = df.copy()

In [11]:
%%time

# For every candidate chunk of every fold and event type, add word2vec
# cosine-similarity features between each candidate (`cand`) and the items
# (`aid`) of its session, then write the enriched chunk back to the same file.
#
# NOTE(review): the chunk files are overwritten in place, so running this cell
# a second time merges the sim_* columns again onto files that already contain
# them - confirm before re-running.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# embeddings that were produced by this project.

# Fold whose embeddings are currently held in `emb`. Files are visited fold by
# fold, so the (large) pickle only has to be read when the fold changes.
loaded_fold = None

for ty in ['clicks', 'carts', 'orders']:
    files = [y for x in [
        sorted(glob.glob('./data_folds/fold_' + str(igfold2) + '/split/chunks/' + ty + '/chunk*.parquet'))
        for igfold2 in range(0,5)
    ] for y in x ]
    for file in files:
        print(file)
        # The fold number is the third path component: ./data_folds/fold_<n>/...
        igfold = int(file.split('/')[2].replace('fold_', ''))
        print(igfold)
        if igfold != loaded_fold:
            # Context manager so the file handle is closed right after loading.
            with open('./data_folds/fold_' + str(igfold) + '/word2vec.emb', 'rb') as emb_file:
                emb = pickle.load(emb_file)
            loaded_fold = igfold

        # Pair every candidate with every event of its session.
        df = cudf.read_parquet(file, columns=['session', 'cand'])
        df = df.merge(
            df_session_ts,
            how='left',
            on=['session']
        )

        # Cosine similarity is computed once per distinct (cand, aid) pair on
        # the CPU. Self-pairs (cand == aid) are deliberately kept.
        df_pairs = df[['cand', 'aid']].drop_duplicates().to_pandas()
        df_pairs['sim'] = df_pairs.parallel_apply(
            lambda x: calc_cosine(emb[x['cand']], emb[x['aid']]), axis=1)
        # Only pairs with similarity above 0.5 contribute; the rest become 0 below.
        df_pairs = df_pairs[df_pairs['sim']>0.5]
        df = df.merge(
            cudf.from_pandas(df_pairs),
            how='left',
            on=['cand', 'aid']
        )
        df['sim'] = df['sim'].fillna(0)

        # Per-row ingredients of the aggregated features.
        df['sim_1'] = df['sim'].values                 # aggregated with max
        df['sim_2'] = df['sim'].values                 # aggregated with sum
        df['sim_3'] = df['sim']>0                      # summed: number of similar session items
        df['sim_wgt_1'] = df['sim']*df['wgt_1']
        df['sim_wgt_2'] = df['sim']*df['wgt_2']
        df['sim_last'] = df['sim']*df['last']          # similarity to the session's last event(s)
        df['sim_type_1'] = df['sim']*df['type_w']      # similarity to cart/order events only

        df = df.groupby([
            'session',
            'cand'
        ]).agg({
            'sim_1': 'max',
            'sim_2': 'sum',
            'sim_3': 'sum',
            'sim_wgt_1': 'sum',
            'sim_wgt_2': 'sum',
            'sim_last': 'max',
            'sim_type_1': 'sum'
        })
        # Turn the sums into smoothed averages over the number of similar items.
        for col in ['sim_2', 'sim_wgt_1', 'sim_wgt_2', 'sim_type_1']:
            df[col] = df[col]/(1+df['sim_3'])
        df = df.reset_index()

        # Join the features onto the full chunk and overwrite the chunk file.
        df_full = cudf.read_parquet(file)
        df_full = df_full.merge(
            df,
            how='left',
            on=['session', 'cand']
        )
        if ty == 'clicks':
            # The clicks chunks are written through pandas rather than cudf
            # (kept from the original; presumably a workaround - TODO confirm).
            df_full.to_pandas().to_parquet(file)
        else:
            df_full.to_parquet(file)
        del df_full
        del df
        gc.collect()

./data_folds/fold_0/split/chunks/carts/chunk_0.parquet
0
./data_folds/fold_0/split/chunks/carts/chunk_1.parquet
0
./data_folds/fold_1/split/chunks/carts/chunk_0.parquet
1
./data_folds/fold_1/split/chunks/carts/chunk_1.parquet
1
./data_folds/fold_2/split/chunks/carts/chunk_0.parquet
2
./data_folds/fold_2/split/chunks/carts/chunk_1.parquet
2
./data_folds/fold_3/split/chunks/carts/chunk_0.parquet
3
./data_folds/fold_3/split/chunks/carts/chunk_1.parquet
3
./data_folds/fold_4/split/chunks/carts/chunk_0.parquet
4
./data_folds/fold_4/split/chunks/carts/chunk_1.parquet
4
CPU times: user 1min 37s, sys: 1min 53s, total: 3min 30s
Wall time: 17min 21s


In [11]:
# Rebuild `df_session_ts` from the test data: one row per (session, aid)
# event with the per-event weights/flags joined onto the submission chunks.
# This replaces the `df_session_ts` built earlier from the training input.
df = cudf.read_parquet('../../data/test.parquet')
# Attach the integer type code ('type_') from the df_type lookup table.
df = df.merge(
    df_type,
    how='left',
    on='type'
)
df['session'] = df['session'].astype('int32')
df['aid'] = df['aid'].astype('int32')
# presumably converts millisecond timestamps to seconds - TODO confirm unit of raw ts
df.ts = (df.ts/1000).astype('int32')
# Replace the string 'type' column by its integer code.
df.drop(['type'], axis=1, inplace=True)
df = df.rename(columns={'type_': 'type'})
# session_len: number of event rows in the session.
df = df.merge(
    df[['session']].groupby(['session']).size().reset_index().rename(columns={0: 'session_len'}),
    how='left',
    on='session'
)
# ts_max: largest timestamp in the session.
df = df.merge(
    df[['session', 'ts']].groupby(['session']).max().reset_index().rename(columns={'ts': 'ts_max'}),
    how='left',
    on='session'
)
# rank: running count within each session after sorting by ts descending,
# so rank 1 is a row with the session's largest ts.
df = df.sort_values(['session', 'ts'], ascending=[True, False])
df['dummy'] = 1
df['rank'] = df.groupby(['session']).dummy.cumsum()
df.drop(['dummy'], axis=1, inplace=True)
# wgt_1 / wgt_2: linear ramp in (rank-1)/session_len, which lies in [0, 1).
# The ramp starts at min_val for rank 1 and grows towards max_val for higher ranks.
# NOTE(review): this gives the most recent event the smallest weight -
# confirm that this direction is intended.
min_val = (2 ** 0.1-1)
max_val = (2 ** 1-1)
df['wgt_1'] = (min_val+(max_val-min_val)*(df['rank']-1)/(df['session_len']))
min_val = (2 ** 0.5-1)
max_val = (2 ** 1-1)
df['wgt_2'] = (min_val+(max_val-min_val)*(df['rank']-1)/(df['session_len']))
# d: index of the "sub-session" within a session. Events are ordered by ts,
# a gap larger than 60*60*2 (two hours, if ts is in seconds) starts a new
# sub-session, and the cumulative sum of those gap flags numbers them.
df = df.sort_values(["session", "ts"])
df['d'] = df.groupby('session').ts.diff()
# Null results (presumably the first event of each session, whose diff is null) become 0.
df.d = (df.d > 60*60*2).astype('int16').fillna(0)
df.d = df.groupby('session').d.cumsum()
# ts_max_2: largest timestamp within each (session, d) sub-session.
# NOTE(review): ts_max_2 is dropped below without being read in between -
# this merge looks unused.
df = df.merge(
    df.groupby(['session', 'd']).ts.max().reset_index().rename(columns={'ts': 'ts_max_2'}),
    how='left',
    on=['session', 'd']
)
# type_w: True for type codes 1 and 2 (carts and orders in df_type).
df['type_w'] = df['type'].isin([1,2])
# last: True for events whose ts equals the session's largest ts.
df['last'] = df['ts']==df['ts_max']
# Drop the helper columns; wgt_1, wgt_2, d, type_w and last are kept
# alongside whatever other columns the parquet file provided.
df.drop(['ts', 'type', 'session_len', 'ts_max', 'rank', 'ts_max_2'], axis=1, inplace=True)
# Keep an independent copy, because `df` is reused as a scratch name by later cells.
df_session_ts = df.copy()

In [12]:
# Submission chunk files of all five folds: folds in ascending order, and the
# paths of each fold in sorted (lexicographic) order.
files = [
    chunk_path
    for fold_id in range(0, 5)
    for chunk_path in sorted(
        glob.glob('./data_folds/fold_' + str(fold_id) + '/split/sub/*.parquet')
    )
]

In [13]:
# For every submission chunk in `files`, add word2vec cosine-similarity
# features between each candidate (`cand`) and the items (`aid`) of its
# session, then write the enriched chunk back to the same file.
#
# NOTE(review): the chunk files are overwritten in place, so running this cell
# a second time merges the sim_* columns again onto files that already contain
# them - confirm before re-running.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# embeddings that were produced by this project.

# Fold whose embeddings are currently held in `emb`. `files` is grouped by
# fold, so the (large) pickle only has to be read when the fold changes.
loaded_fold = None

for file in files:
    print(file)
    # The fold number is the third path component: ./data_folds/fold_<n>/...
    igfold = int(file.split('/')[2].replace('fold_', ''))
    print(igfold)
    if igfold != loaded_fold:
        # Context manager so the file handle is closed right after loading.
        with open('./data_folds/fold_' + str(igfold) + '/word2vec.emb', 'rb') as emb_file:
            emb = pickle.load(emb_file)
        loaded_fold = igfold

    # Pair every candidate with every event of its session.
    df = cudf.read_parquet(file, columns=['session', 'cand'])
    df = df.merge(
        df_session_ts,
        how='left',
        on=['session']
    )

    # Cosine similarity is computed once per distinct (cand, aid) pair on the
    # CPU. Self-pairs (cand == aid) are deliberately kept.
    df_pairs = df[['cand', 'aid']].drop_duplicates().to_pandas()
    df_pairs['sim'] = df_pairs.parallel_apply(
        lambda x: calc_cosine(emb[x['cand']], emb[x['aid']]), axis=1)
    # Only pairs with similarity above 0.5 contribute; the rest become 0 below.
    df_pairs = df_pairs[df_pairs['sim']>0.5]
    df = df.merge(
        cudf.from_pandas(df_pairs),
        how='left',
        on=['cand', 'aid']
    )
    df['sim'] = df['sim'].fillna(0)

    # Per-row ingredients of the aggregated features.
    df['sim_1'] = df['sim'].values                 # aggregated with max
    df['sim_2'] = df['sim'].values                 # aggregated with sum
    df['sim_3'] = df['sim']>0                      # summed: number of similar session items
    df['sim_wgt_1'] = df['sim']*df['wgt_1']
    df['sim_wgt_2'] = df['sim']*df['wgt_2']
    df['sim_last'] = df['sim']*df['last']          # similarity to the session's last event(s)
    df['sim_type_1'] = df['sim']*df['type_w']      # similarity to cart/order events only

    df = df.groupby([
        'session',
        'cand'
    ]).agg({
        'sim_1': 'max',
        'sim_2': 'sum',
        'sim_3': 'sum',
        'sim_wgt_1': 'sum',
        'sim_wgt_2': 'sum',
        'sim_last': 'max',
        'sim_type_1': 'sum'
    })
    # Turn the sums into smoothed averages over the number of similar items.
    for col in ['sim_2', 'sim_wgt_1', 'sim_wgt_2', 'sim_type_1']:
        df[col] = df[col]/(1+df['sim_3'])
    df = df.reset_index()

    # Join the features onto the full chunk and overwrite the chunk file.
    df_full = cudf.read_parquet(file)
    df_full = df_full.merge(
        df,
        how='left',
        on=['session', 'cand']
    )
    df_full.to_parquet(file)
    del df_full
    del df
    gc.collect()

./data_folds/fold_0/split/sub/chunk_0.parquet
0
./data_folds/fold_0/split/sub/chunk_1.parquet
0
./data_folds/fold_0/split/sub/chunk_10.parquet
0
./data_folds/fold_0/split/sub/chunk_11.parquet
0
./data_folds/fold_0/split/sub/chunk_12.parquet
0
./data_folds/fold_0/split/sub/chunk_13.parquet
0
./data_folds/fold_0/split/sub/chunk_14.parquet
0
./data_folds/fold_0/split/sub/chunk_15.parquet
0
./data_folds/fold_0/split/sub/chunk_16.parquet
0
./data_folds/fold_0/split/sub/chunk_17.parquet
0
./data_folds/fold_0/split/sub/chunk_18.parquet
0
./data_folds/fold_0/split/sub/chunk_19.parquet
0
./data_folds/fold_0/split/sub/chunk_2.parquet
0
./data_folds/fold_0/split/sub/chunk_20.parquet
0
./data_folds/fold_0/split/sub/chunk_21.parquet
0
./data_folds/fold_0/split/sub/chunk_22.parquet
0
./data_folds/fold_0/split/sub/chunk_23.parquet
0
./data_folds/fold_0/split/sub/chunk_24.parquet
0
./data_folds/fold_0/split/sub/chunk_3.parquet
0
./data_folds/fold_0/split/sub/chunk_4.parquet
0
./data_folds/fold_0/split