In [1]:
import cudf
import numpy as np
import gc

import glob

In [2]:
# Lookup table joined onto the event log by `type`:
#   type_   - integer code for the event type
#   type_mp - multiplier applied to an event's weight in calc_visits
df_type = cudf.DataFrame(
    dict(
        type=['clicks', 'carts', 'orders'],
        type_=[1, 2, 3],
        type_mp=[0.5, 9, 0.5],
    )
)


In [3]:
def calc_visits(df, type_map=None):
    """Aggregate session events into per-(session, item) weighted visit scores.

    Every event is given a 0-based position inside its session, counted from
    the most recent event (largest ``ts``) towards the oldest. Two linear
    ramps over ``position / session_len`` -- starting at ``2**0.1 - 1`` and
    ``2**0.5 - 1`` respectively and growing towards ``1`` -- are multiplied by
    the event type's ``type_mp`` multiplier and summed per ``(session, aid)``.

    Parameters
    ----------
    df : DataFrame
        Event log with ``session``, ``aid``, ``ts`` and ``type`` columns.
        ``ts`` is divided by 1000 before ranking (presumably milliseconds to
        seconds -- confirm against the data source). The input frame is not
        modified.
    type_map : DataFrame, optional
        Lookup table with a ``type`` key column and a ``type_mp`` multiplier
        column. Defaults to the notebook-level ``df_type`` table. Events whose
        type is missing from the table get a null multiplier (left join).

    Returns
    -------
    DataFrame
        One row per ``(session, cand)`` with the columns ``session``, ``cand``
        (the original ``aid``), ``pv_2_wgt_1`` and ``pv_2_wgt_2``.
    """
    if type_map is None:
        type_map = df_type

    # Only the multiplier is needed downstream, so join just that column and
    # drop the string key immediately to keep the working frame small.
    events = df[['session', 'aid', 'ts', 'type']].merge(
        type_map[['type', 'type_mp']],
        how='left',
        on='type',
    ).drop(columns=['type'])

    events['session'] = events['session'].astype('int32')
    events['aid'] = events['aid'].astype('int32')
    events['ts'] = (events['ts'] / 1000).astype('int32')

    # Number of events per session, named explicitly instead of relying on the
    # default column label produced by reset_index().
    session_len = (
        events.groupby('session').size().rename('session_len').reset_index()
    )
    events = events.merge(session_len, how='left', on='session')

    # Newest event first inside each session; position 0 is the most recent.
    events = events.sort_values(['session', 'ts'], ascending=[True, False])
    events['position'] = events.groupby('session').cumcount()

    upper = 2 ** 1 - 1
    for column, exponent in (('wgt_1', 0.1), ('wgt_2', 0.5)):
        lower = 2 ** exponent - 1
        events[column] = (
            lower + (upper - lower) * events['position'] / events['session_len']
        ) * events['type_mp']

    return (
        events.groupby(['session', 'aid'])
        .agg({'wgt_1': 'sum', 'wgt_2': 'sum'})
        .reset_index()
        .rename(columns={
            'wgt_1': 'pv_2_wgt_1', 'wgt_2': 'pv_2_wgt_2', 'aid': 'cand'
        })
    )

In [4]:
# Create the output directory for the train pv_2 candidates (no error if it already exists).
!mkdir -p ./data/candidates/train/pv_2/

In [5]:
# Load the event log for the train candidates onto the GPU.
# calc_visits reads the session, aid, ts and type columns from this frame.
df = cudf.read_parquet('./data/xgb_train_x.parquet')

In [6]:
# Collapse the events into one row per (session, cand) carrying pv_2_wgt_1 / pv_2_wgt_2.
# NOTE: this rebinds df, so the raw event log is no longer available afterwards.
df = calc_visits(df)

In [7]:
# Store both weight columns as float32 before writing them out.
for weight_col in ('pv_2_wgt_1', 'pv_2_wgt_2'):
    df[weight_col] = df[weight_col].astype('float32')

In [8]:
# Persist the train candidates into the directory created earlier in the notebook.
df.to_parquet('./data/candidates/train/pv_2/cand.parquet')

In [9]:
# Create the output directory for the submission (sub) pv_2 candidates (no error if it already exists).
!mkdir -p ./data/candidates/sub/pv_2/

In [10]:
# Load the test event log used to build the submission candidates.
# NOTE(review): this path is rooted two levels up ('../../data'), unlike the
# './data' root used for the training input -- confirm this is intended.
df = cudf.read_parquet('../../data/test.parquet')

In [11]:
# Same aggregation as for the train split: one row per (session, cand) with the two pv_2 weights.
df = calc_visits(df)

In [12]:
# Store both weight columns as float32 before writing them out.
for weight_col in ('pv_2_wgt_1', 'pv_2_wgt_2'):
    df[weight_col] = df[weight_col].astype('float32')

In [13]:
# Persist the submission candidates into the directory created earlier in the notebook.
df.to_parquet('./data/candidates/sub/pv_2/cand.parquet')

In [14]:
# Mirror the train pv_2 candidates into each of the five fold directories.
# `mkdir -p` plus copying `pv_2/.` (the directory's contents) keeps this cell
# safe to re-run: a plain `cp -r src/ dst/` nests a second pv_2/ inside dst
# when dst already exists, and fails when dst's parent directories are missing.
for fold in range(5):
    !mkdir -p ./data_folds/fold_{fold}/candidates/train/pv_2/
    !cp -r ./data/candidates/train/pv_2/. ./data_folds/fold_{fold}/candidates/train/pv_2/

In [16]:
# Mirror the submission pv_2 candidates into each of the five fold directories.
# `mkdir -p` plus copying `pv_2/.` (the directory's contents) keeps this cell
# safe to re-run: a plain `cp -r src/ dst/` nests a second pv_2/ inside dst
# when dst already exists, and fails when dst's parent directories are missing.
for fold in range(5):
    !mkdir -p ./data_folds/fold_{fold}/candidates/sub/pv_2/
    !cp -r ./data/candidates/sub/pv_2/. ./data_folds/fold_{fold}/candidates/sub/pv_2/