# Feature Engineering

In [27]:
# Mode switch for the whole notebook.
# True  -> read the otto-validation inputs (test parquet, ground-truth labels, validation candidates).
# False -> read the test-set inputs (otto-chunk-data test parquet, otto-test-candidates).
# It also selects which timestamp window (MIN_TS / MAX_TS) is used for scaling.
VALIDA = True

In [28]:
!pip install pyarrow fastparquet

[0m

In [29]:
import pandas as pd
import numpy as np

import time
import datetime
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools

# from multiprocessing import Pool
# import psutil
# N_CPU = psutil.cpu_count()
# print("Number of cpu:", N_CPU)

In [30]:
# When True, the validation feature files keep only sessions for which at
# least one candidate matches the ground truth.
USE_ONLY_GT_SESSION = True
# Number of chunks the candidates are split into (by session id modulo).
NUM_CHUNK = 5

# Unix-second bounds of the period being processed; later cells use them to
# min-max scale every timestamp feature.
MIN_TS, MAX_TS = (
    (1_661_119_200, 1_661_723_999)
    if VALIDA
    else (1_661_724_000, 1_662_328_791)
)

# fromtimestamp() renders in the machine's local timezone.
print("Starting point of valida:", datetime.datetime.fromtimestamp(MIN_TS))
print("Ending point of valida:", datetime.datetime.fromtimestamp(MAX_TS))

Starting point of validA: 2022-08-21 22:00:00
Ending point of validA: 2022-08-28 21:59:59


In [31]:
# Integer codes for the three event types.
type_labels = {'clicks':0, 'carts':1, 'orders':2}


def read_file_to_cache(f):
    """Load one parquet file of events and compact its dtypes.

    Parameters
    ----------
    f : path of the parquet file to read.

    Returns
    -------
    pandas.DataFrame
        The file's rows with ``ts`` divided by 1000 and truncated to int32,
        and ``type`` replaced by its ``type_labels`` code stored as int8.
    """
    events = pd.read_parquet(f)
    # /1000 then int32 cast: the fractional part is dropped.
    events['ts'] = (events['ts'] / 1000).astype('int32')
    events['type'] = events['type'].map(type_labels).astype('int8')
    return events

# Both modes list every parquet part of their test split; only the dataset
# directory differs.
test_parquet_pattern = (
    '/kaggle/input/otto-validation/test_parquet/*'
    if VALIDA
    else '/kaggle/input/otto-chunk-data-inparquet-format/test_parquet/*'
)
test_files = sorted(glob.glob(test_parquet_pattern))
files = list(test_files)

In [32]:
# The event log is needed in both modes: the history and item feature cells
# below read `test_df` unconditionally, so it must not be gated on VALIDA.
test_df = pd.concat([read_file_to_cache(f) for f in files], axis=0)
_ = gc.collect()

# Ground-truth labels only exist for the validation split.
if VALIDA:
    label_df = pd.read_parquet("/kaggle/input/otto-validation/test_labels.parquet")

# Historic User and Item features

In [33]:
# Per (session, aid) history features: how many clicks / carts / orders the
# session has on the item, plus its first and last interaction time scaled
# with the MIN_TS..MAX_TS window.
his_df = (
    pd.get_dummies(data=test_df, columns=['type'])
    .groupby(['session', 'aid'])
    .agg(
        his_num_clicks=('type_0', 'sum'),
        his_num_carts=('type_1', 'sum'),
        his_num_orders=('type_2', 'sum'),
        his_min_ts=('ts', 'min'),
        his_max_ts=('ts', 'max'),
    )
    .reset_index()
    .astype({
        'his_num_clicks': 'int32',
        'his_num_carts': 'int32',
        'his_num_orders': 'int32',
    })
    .assign(
        his_min_ts=lambda d: ((d['his_min_ts'] - MIN_TS) / (MAX_TS - MIN_TS)).astype('float32'),
        his_max_ts=lambda d: ((d['his_max_ts'] - MIN_TS) / (MAX_TS - MIN_TS)).astype('float32'),
        # total number of events of any type for the pair
        his_num_actions=lambda d: d['his_num_clicks'] + d['his_num_carts'] + d['his_num_orders'],
    )
)

print(his_df.shape)
display(his_df.head())

(5535990, 8)


Unnamed: 0,session,aid,his_num_clicks,his_num_carts,his_num_orders,his_min_ts,his_max_ts,his_num_actions
0,11098528,11830,1,0,0,0.0,0.0,1
1,11098529,1105029,1,0,0,0.0,0.0,1
2,11098530,264500,2,0,0,0.0,0.000146,2
3,11098530,409236,3,1,0,0.000279,0.002202,4
4,11098531,396199,2,0,1,0.000112,0.000903,3


# User features

In [34]:
# Session-level timestamp statistics from the precomputed feature file,
# scaled with the MIN_TS..MAX_TS window and stored as float32.
# Every column whose name does not already contain "sess" gets a "sess_" prefix
# ("session" itself is therefore left untouched).
session_features = (
    pd.read_parquet("/kaggle/input/100x-faster-feature-generation/sess_feature.parquet")
    [['session', 'ts_max', 'ts_min', 'ts_mean']]
    .assign(
        ts_max=lambda d: ((d['ts_max'] - MIN_TS) / (MAX_TS - MIN_TS)).astype('float32'),
        ts_min=lambda d: ((d['ts_min'] - MIN_TS) / (MAX_TS - MIN_TS)).astype('float32'),
        ts_mean=lambda d: ((d['ts_mean'] - MIN_TS) / (MAX_TS - MIN_TS)).astype('float32'),
    )
    .rename(columns=lambda name: name if "sess" in name else "sess_" + name)
)

print(session_features.columns)
print(session_features.shape)
display(session_features.head())

Index(['session', 'sess_ts_max', 'sess_ts_min', 'sess_ts_mean'], dtype='object')
(1801251, 4)


Unnamed: 0,session,sess_ts_max,sess_ts_min,sess_ts_mean
0,11098528,0.0,0.0,0.0
1,11098529,0.0,0.0,0.0
2,11098530,0.002202,0.0,0.00077
3,11098531,0.000903,0.0,0.00038
4,11098532,0.001316,2e-06,0.000659


# Item features

In [35]:
# Per-item activity inside the loaded test events: event counts by type and
# first / last time the item was seen, scaled with the MIN_TS..MAX_TS window.
test_item_features = (
    pd.get_dummies(data=test_df, columns=['type'])
    .groupby(['aid'])
    .agg(
        aid_test_num_clicks=('type_0', 'sum'),
        aid_test_num_carts=('type_1', 'sum'),
        aid_test_num_orders=('type_2', 'sum'),
        aid_test_min_ts=('ts', 'min'),
        aid_test_max_ts=('ts', 'max'),
    )
    .reset_index()
    .astype({
        'aid_test_num_clicks': 'int32',
        'aid_test_num_carts': 'int32',
        'aid_test_num_orders': 'int32',
    })
    .assign(
        aid_test_min_ts=lambda d: ((d['aid_test_min_ts'] - MIN_TS) / (MAX_TS - MIN_TS)).astype('float32'),
        aid_test_max_ts=lambda d: ((d['aid_test_max_ts'] - MIN_TS) / (MAX_TS - MIN_TS)).astype('float32'),
        # total number of events of any type on the item
        aid_test_num_actions=lambda d: (
            d['aid_test_num_clicks'] + d['aid_test_num_carts'] + d['aid_test_num_orders']
        ),
    )
)

print(test_item_features.shape)
display(test_item_features.head())

(874852, 7)


Unnamed: 0,aid,aid_test_num_clicks,aid_test_num_carts,aid_test_num_orders,aid_test_min_ts,aid_test_max_ts,aid_test_num_actions
0,0,5,0,0,0.319207,0.975737,5
1,2,4,0,0,0.076072,0.801714,4
2,3,295,27,2,0.002889,0.997386,324
3,4,8,0,0,0.4163,0.925934,8
4,11,3,1,0,0.3692,0.953773,4


In [36]:
# Precomputed item features (the original author notes they come from
# train + test). Show the full table once, then keep only the item id and
# the three conversion-ratio columns.
item_features = pd.read_parquet("/kaggle/input/100x-faster-feature-generation/aid_features.parquet")
display(item_features.head())

ratio_columns = ['aid_ca_cl_ratio', 'aid_or_cl_ratio', 'aid_or_ca_ratio']
item_features = item_features[['aid', *ratio_columns]]
print(item_features.shape)
display(item_features.head())

Unnamed: 0,aid,aid_cnt,aid_hm_mean,aid_hm_median,aid_hm_std,aid_day0cnt,aid_day1cnt,aid_day2cnt,aid_day3cnt,aid_day4cnt,...,aid_hour23cnt,aid_cl_cnt,aid_ca_cnt,aid_or_cnt,aid_ca_cl_ratio,aid_or_cl_ratio,aid_or_ca_ratio,aid_ts_max,aid_ts_min,aid_ts_mean
0,433024,15,1552.0,1425.0,327.536476,0.0,0.0,0.133333,0.4,0.2,...,0.0,14,1,0,0.071429,0.0,0.0,1660831365,1659644168,1660261000.0
1,1367552,188,1431.43617,1436.5,459.042483,0.079787,0.143617,0.085106,0.148936,0.170213,...,0.005319,170,16,2,0.094118,0.011765,0.125,1661670899,1659378849,1660688000.0
2,566528,7,1332.285714,1263.0,448.759295,0.0,0.142857,0.142857,0.142857,0.142857,...,0.0,5,1,1,0.2,0.2,1.0,1660839815,1659724192,1660372000.0
3,1406976,2,1501.5,1501.5,549.421969,0.0,0.0,0.0,0.0,0.5,...,0.0,2,0,0,0.0,0.0,-1.0,1660388918,1659725652,1660057000.0
4,1333120,6,833.166667,613.5,782.116466,0.333333,0.166667,0.0,0.0,0.5,...,0.0,6,0,0,0.0,0.0,-1.0,1660903120,1659713564,1660336000.0


(1844284, 4)


Unnamed: 0,aid,aid_ca_cl_ratio,aid_or_cl_ratio,aid_or_ca_ratio
0,433024,0.071429,0.0,0.0
1,1367552,0.094118,0.011765,0.125
2,566528,0.2,0.2,1.0
3,1406976,0.0,0.0,-1.0
4,1333120,0.0,0.0,-1.0


In [37]:
# Attach the test-period item statistics to the item table. Items that do
# not occur in the test events have no match in the left join; their
# missing values are filled with 0.
item_features = pd.merge(item_features, test_item_features, on='aid', how='left').fillna(0)

# The per-test table is no longer needed once merged.
del test_item_features
_ = gc.collect()

print(item_features.columns)

Index(['aid', 'aid_ca_cl_ratio', 'aid_or_cl_ratio', 'aid_or_ca_ratio',
       'aid_test_num_clicks', 'aid_test_num_carts', 'aid_test_num_orders',
       'aid_test_min_ts', 'aid_test_max_ts', 'aid_test_num_actions'],
      dtype='object')


# Interaction features

In [41]:
%%time
# Build the per-(session, candidate aid) feature tables.
# For every event type the candidate file is split into "history" candidates
# (type_candidate == 1, enriched with the session/item history features in
# his_df) and the remaining candidates (type_candidate == 2). Each chunk of
# sessions is then joined with the item and session features and written to
# /kaggle/working/{type}_features_{chunk}.pqt. In validation mode the ground
# truth is attached as the `gt` column as well.
if VALIDA:
    _dir = '/kaggle/input/otto-validation-candidates/'
else:
    _dir = '/kaggle/input/otto-test-candidates/'

for _type in ['click','cart','order']:
    # import history candidates from covi
    candidates = pd.read_parquet(_dir+f"{_type}s_candidates.pqt")

    history_candidates = (
        candidates.query('type_candidate == 1')
        .rename(columns={'score':'his_covi_score'})
        .reset_index(drop=True)
        .merge(his_df, on=['session','aid'], how='left')
    )

    common_test_cands = (
        candidates.query('type_candidate == 2')
        .rename(columns={'score':'his_covi_score'})
        .reset_index(drop=True)
    )
    del candidates
    _ = gc.collect()

    if VALIDA:
        # Ground truth for this event type, one row per (session, aid).
        # It only depends on _type, so it is built once instead of per chunk.
        gt_df = (
            label_df.loc[label_df.type == _type+"s"]
            .explode('ground_truth')
            .drop(columns='type')
            .rename(columns={'ground_truth':'aid'})
            .assign(gt=1)
        )

        for chunk in range(NUM_CHUNK):
            print(f"{_type} | CHUNK {chunk}")
            if _type == 'click' or not USE_ONLY_GT_SESSION:
                sub_his_cands = history_candidates.loc[history_candidates.session % NUM_CHUNK == chunk].reset_index(drop=True)
                sub_common_test_cands = common_test_cands.loc[common_test_cands.session % NUM_CHUNK == chunk].reset_index(drop=True)
            else:
                # Carts/orders are split by session parity only: chunk 0 gets
                # the even session ids and every chunk >= 1 gets the odd ones.
                # NOTE(review): chunks 1..NUM_CHUNK-1 are therefore written
                # with identical content - confirm this is what the
                # downstream consumer expects before changing it.
                chunk_buy = 0 if chunk == 0 else 1
                sub_his_cands = history_candidates.loc[history_candidates.session % 2 == chunk_buy].reset_index(drop=True)
                sub_common_test_cands = common_test_cands.loc[common_test_cands.session % 2 == chunk_buy].reset_index(drop=True)

            # Non-history candidates have no his_* values; fill them with 0.
            candidates = pd.concat([sub_his_cands, sub_common_test_cands],
                                   ignore_index=True, axis = 0).fillna(0).sort_values(by=['session'], ignore_index=True)

            del sub_his_cands, sub_common_test_cands
            gc.collect()

            # merge gt (candidates that are not in the ground truth get gt = 0)
            print("Number of session:", candidates.session.nunique())
            candidates = candidates.merge(gt_df, on=['session','aid'], how='left').fillna(0)

            # remove sessions that have no gt
            if USE_ONLY_GT_SESSION:
                has_gt = candidates.groupby('session')['gt'].transform('max') > 0
                candidates = candidates.loc[has_gt]
                del has_gt
                print("Number of session:", candidates.session.nunique())

            candidates = candidates.merge(item_features, on=['aid'], how='left').merge(session_features, on=['session'], how='left').fillna(0)
            print("Shape:", candidates.shape)
            candidates.to_parquet(f"/kaggle/working/{_type}_features_{chunk}.pqt")

            del candidates
            _ = gc.collect()

        del gt_df
    else:
        for chunk in range(NUM_CHUNK):
            print(f"{_type} | CHUNK {chunk}")

            # Same session-modulo split for both candidate sources.
            sub_his_cands = history_candidates.loc[history_candidates.session % NUM_CHUNK == chunk]
            sub_common_test_cands = common_test_cands.loc[common_test_cands.session % NUM_CHUNK == chunk]
            candidates = pd.concat([sub_his_cands, sub_common_test_cands],
                                   ignore_index=True, axis = 0).sort_values(by=['session'])

            del sub_his_cands, sub_common_test_cands
            gc.collect()

            # The final fillna(0) also covers the missing his_* values of the
            # non-history candidates.
            candidates = candidates.merge(item_features, on=['aid'], how='left').merge(session_features, on=['session'], how='left').fillna(0)
            candidates.to_parquet(f"/kaggle/working/{_type}_features_{chunk}.pqt")

            del candidates
            gc.collect()

    del history_candidates, common_test_cands
    _ = gc.collect()

click | CHUNK 0
Number of session: 360250
Number of session: 360250
Number of session: 113811
Shape: (363559, 23)
click | CHUNK 1
Number of session: 360250
Number of session: 360250
Number of session: 113834
Shape: (360979, 23)
click | CHUNK 2
Number of session: 360250
Number of session: 360250
Number of session: 113018
Shape: (360934, 23)
click | CHUNK 3
Number of session: 360251
Number of session: 360251
Number of session: 113975
Shape: (359851, 23)
click | CHUNK 4
Number of session: 360250
Number of session: 360250
Number of session: 113949
Shape: (363883, 23)
cart | CHUNK 0
Number of session: 900626
Number of session: 900626
Number of session: 81720
Shape: (404598, 23)
cart | CHUNK 1
Number of session: 900625
Number of session: 900625
Number of session: 81379
Shape: (401005, 23)
cart | CHUNK 2
Number of session: 900625
Number of session: 900625
Number of session: 81379
Shape: (401005, 23)
cart | CHUNK 3
Number of session: 900625
Number of session: 900625
Number of session: 81379
Sh

# Compute metric (VALIDATION)

In [43]:
%%time
# Candidate recall per chunk, weighted by event type. Only meaningful when the
# feature files were built from the validation split, so the whole
# computation is skipped otherwise.
if VALIDA:
    # Loop-invariant inputs: read the labels once instead of once per chunk.
    full_test_labels = pd.read_parquet('/kaggle/input/otto-validation/test_labels.parquet')
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}

    chunk_list = range(5) #[0,4] if USE_ONLY_GT_SESSION else [0,2]
    for chunk in chunk_list:
        print("chunk ", chunk)

        # One row per session with the list of its candidate aids.
        candidates = dict()
        for _type in ['click','cart','order']:
            cans = pd.read_parquet(f"/kaggle/working/{_type}_features_{chunk}.pqt")
            candidates[_type+"s"] = (
                cans.groupby('session')['aid']
                .agg(list)
                .rename('labels')
                .reset_index()
            )

        score = 0
        for t in ['clicks','carts','orders']:
            # Inner join: only sessions present in this chunk's candidates are scored.
            test_labels = (
                full_test_labels.loc[full_test_labels['type']==t]
                .merge(candidates[t], how='inner', on=['session'])
            )
            test_labels['hits'] = [
                len(set(truth).intersection(cands))
                for truth, cands in zip(test_labels['ground_truth'], test_labels['labels'])
            ]
            # At most 20 ground-truth items count per session.
            test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)

            recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
            score += weights[t]*recall
            print(f'{t} recall =',recall)
        print("Finall recall:", score)

        del candidates, test_labels
        gc.collect()

    del full_test_labels
    gc.collect()

chunk  0
clicks recall = 1.0
carts recall = 0.5925925925925926
orders recall = 0.7171485215208089
Finall recall: 0.7080668906902632
chunk  1
clicks recall = 1.0
carts recall = 0.5962373562650325
orders recall = 0.7200960251142606
Finall recall: 0.7109288219480661
chunk  2
clicks recall = 1.0
carts recall = 0.5962373562650325
orders recall = 0.7200960251142606
Finall recall: 0.7109288219480661
chunk  3
clicks recall = 1.0
carts recall = 0.5962373562650325
orders recall = 0.7200960251142606
Finall recall: 0.7109288219480661
chunk  4
clicks recall = 1.0
carts recall = 0.5962373562650325
orders recall = 0.7200960251142606
Finall recall: 0.7109288219480661
CPU times: user 1min 30s, sys: 4.27 s, total: 1min 34s
Wall time: 1min 30s
