In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

import lightgbm as lgb

from tqdm import tqdm, trange

In [4]:
DATA_DIR = 'data/'
X_TRAIN_PATH = DATA_DIR + 'X_train.csv.gz'
Y_TRAIN_PATH = DATA_DIR + 'y_train.csv.gz'
X_TEST_PATH = DATA_DIR + 'X_test.csv.gz'
AGG_DATA_PATH = DATA_DIR + 'aggregated_noisy_data_pairs.csv.gz'
AGG_DATA_SINGLE_PATH = DATA_DIR + 'aggregated_noisy_data_singles.csv.gz'


In [59]:
Xy_agg_data_singles = pd.read_csv(
    AGG_DATA_SINGLE_PATH, compression='gzip', 
    dtype={'feature_1_value': str, 'feature_1_id': int, 'count': float, 'nb_clicks': float, 'nb_sales': float}
)
Xy_agg_data = pd.read_csv(
    AGG_DATA_PATH, compression='gzip', 
    dtype={'feature_1_value': str, 'feature_1_id': int, 'feature_2_value': str, 'feature_2_id': int,
           'count': float, 'nb_clicks': float, 'nb_sales': float}
)

X_train = pd.read_csv(X_TRAIN_PATH, compression='gzip', dtype=str)
y_train = pd.read_csv(Y_TRAIN_PATH, compression='gzip')
X_test = pd.read_csv(X_TEST_PATH, compression='gzip', dtype=str)


In [61]:
features = Xy_agg_data_singles['feature_1_id'].unique()
stats = {'count', 'clicks', 'sales', 'freq'}

stat_bins = {}
for stat in stats:
    stat_bins[stat] = {feature_id: {} for feature_id in features}

for feature in features:
    stat_bins['freq'][feature] = X_train[f'hash_{feature}'].value_counts().to_dict()
    
for k, row in Xy_agg_data_singles.iterrows():
    feature_index = row['feature_1_id']
    value_index = row['feature_1_value']

    stat_bins['count'][feature_index][value_index] = row['count']
    stat_bins['clicks'][feature_index][value_index] = row['nb_clicks']
    stat_bins['sales'][feature_index][value_index] = row['nb_sales']
    

In [62]:
for f in trange(0, 19):
    for stat in stats:       
        X_train[f'feature{f}_{stat}'] = X_train[f'hash_{f}'].map(stat_bins[stat][f])
        X_test[f'feature{f}_{stat}'] = X_test[f'hash_{f}'].map(stat_bins[stat][f])


100%|██████████| 19/19 [00:12<00:00,  1.57it/s]


In [64]:
pair_stat_bins = {}
for stat in stats:
    pair_stat_bins[stat] = {(f"hash_{feature_1_id}", f"hash_{feature_2_id}"): {} for feature_1_id in features for feature_2_id in features if feature_1_id < feature_2_id}

for feature_1_id in features:
    for feature_2_id in features:
        if feature_1_id < feature_2_id:
            features_index = (f"hash_{feature_1_id}", f"hash_{feature_2_id}")
            pair_stat_bins['freq'][features_index] = X_train[list(features_index)].value_counts().to_dict()

for k, row in tqdm(Xy_agg_data.iterrows()):
    features_index = (f"hash_{row['feature_1_id']}", f"hash_{row['feature_2_id']}")
    values_index = (row['feature_1_value'], row['feature_2_value'])

    pair_stat_bins['count'][features_index][values_index] = row['count']
    pair_stat_bins['clicks'][features_index][values_index] = row['nb_clicks']
    pair_stat_bins['sales'][features_index][values_index] = row['nb_sales']



18432452it [40:34, 7572.22it/s] 


In [66]:
for f1 in trange(19):
    for f2 in trange(19):
        if f1 < f2:
            hash_index = [f'hash_{f1}', f'hash_{f2}']
            feature_prefix = f'feature{f1}_feature{f2}'
            for stat in stats:       
                X_train[f'{feature_prefix}_{stat}'] = X_train[hash_index].apply(tuple, axis=1).map(pair_stat_bins[stat][tuple(hash_index)])
                X_test[f'{feature_prefix}_{stat}'] = X_test[hash_index].apply(tuple, axis=1).map(pair_stat_bins[stat][tuple(hash_index)])


  0%|          | 0/19 [00:00<?, ?it/s]
  0%|          | 0/19 [00:00<?, ?it/s][A
 11%|█         | 2/19 [01:08<09:43, 34.32s/it][A
 16%|█▌        | 3/19 [02:10<12:15, 46.00s/it][A
 21%|██        | 4/19 [03:16<13:20, 53.34s/it][A
 26%|██▋       | 5/19 [04:27<13:50, 59.30s/it][A
 32%|███▏      | 6/19 [05:33<13:19, 61.52s/it][A
 37%|███▋      | 7/19 [06:41<12:43, 63.59s/it][A
 42%|████▏     | 8/19 [07:41<11:28, 62.63s/it][A
 47%|████▋     | 9/19 [08:45<10:31, 63.14s/it][A
 53%|█████▎    | 10/19 [09:50<09:31, 63.55s/it][A
 58%|█████▊    | 11/19 [11:05<08:56, 67.08s/it][A
 63%|██████▎   | 12/19 [12:15<07:55, 67.95s/it][A
 68%|██████▊   | 13/19 [13:29<06:58, 69.67s/it][A
 74%|███████▎  | 14/19 [14:38<05:48, 69.69s/it][A
 79%|███████▉  | 15/19 [16:02<04:55, 73.79s/it][A
 84%|████████▍ | 16/19 [17:08<03:35, 71.68s/it][A
 89%|████████▉ | 17/19 [18:16<02:20, 70.44s/it][A
 95%|█████████▍| 18/19 [19:33<01:12, 72.32s/it][A
100%|██████████| 19/19 [20:36<00:00, 65.10s/it][A
  5%|▌   

In [67]:
X_train.to_csv('output/X_train_full.csv', index=False)
X_test.to_csv('output/X_test_full.csv', index=False)
