In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import bisect
import gc
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from utils.misc import *

In [3]:
# Directory that every pickle in this notebook is read from and written to.
DATA_DIR = os.path.abspath('../../Data/display_advertising_challenge/processed')

# Split-selection switches consumed by the next cell; the quiz flag is checked
# before the test flag, and with both off the train/valid pair is used.
USE_QUIZ_SET, USE_TEST_SET = False, False

In [4]:
# Choose which split the pipeline is fitted on and which split is only transformed.
if USE_QUIZ_SET:
    train_dataset_type, test_dataset_type = 'train+valid+test', 'quiz'
elif USE_TEST_SET:
    train_dataset_type, test_dataset_type = 'train+valid', 'test'
else:
    train_dataset_type, test_dataset_type = 'train', 'valid'

In [5]:
# Load the frame the preprocessing pipeline will be fitted on
# (file name pattern: df_<train_dataset_type>.pkl).
df_train = pd.read_pickle(os.path.join(DATA_DIR, f'df_{train_dataset_type}.pkl'))

In [6]:
target_name = 'label'

# Feature columns are picked purely by name prefix: 'I…' feeds the numeric
# pipeline, 'C…' feeds the categorical pipeline.
num_feature_names = df_train.columns[df_train.columns.str.startswith('I')]
cat_feature_names = df_train.columns[df_train.columns.str.startswith('C')]

# Numeric names first, then categorical ones; this order is reused as the
# column labels of the transformed frames.
all_feature_names = pd.Index([*num_feature_names, *cat_feature_names])

In [7]:
# Report how many columns fall into each feature group.
print(
    '# of num features:', len(num_feature_names),
    '\n# of cat features:', len(cat_feature_names),
)

# of num features: 13 
# of cat features: 26


In [8]:
# Numeric columns: missing values are filled with 0.0, then each column is
# standardized. copy=False asks both steps to work in place where possible.
num_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=0.0, copy=False),
    StandardScaler(copy=False),
)

# Categorical columns: missing values are filled with the '<unknown>' token,
# then every category is mapped to an integer code.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='<unknown>', copy=False),
    OrdinalEncoder(dtype=int),
)

# Route each feature group to its own sub-pipeline (numeric first, categorical second).
full_pipeline = make_column_transformer(
    (num_pipeline, num_feature_names),
    (cat_pipeline, cat_feature_names),
)

In [9]:
# Split off the target column and persist it as df_y_<train_dataset_type>.pkl.
df_y_train = df_train[target_name]
df_y_train.to_pickle(os.path.join(DATA_DIR, f'df_y_{train_dataset_type}.pkl'))

In [None]:
# Fit the imputers, scaler and ordinal encoder on the training features.
full_pipeline = full_pipeline.fit(df_train[all_feature_names])

# The fitted OrdinalEncoder is the second step of the second (categorical)
# transformer of the column transformer.
cat_encoder = full_pipeline.transformers_[1][1].steps[1][1]

# Make sure every categorical feature has a code for '<unknown>', even when no
# training row was missing for that feature; a later cell maps unseen test
# categories to NaN, which the imputer then turns into '<unknown>'.
for i, fitted_categories in enumerate(cat_encoder.categories_):
    if '<unknown>' not in set(fitted_categories):
        # Work on the fitted array's own ordering: round-tripping through a
        # set would scramble it, and bisect.insort_left is only meaningful on
        # a sorted sequence.
        categories = fitted_categories.tolist()
        bisect.insort_left(categories, '<unknown>')
        # dtype=object keeps the entries as plain Python strings instead of
        # letting NumPy pick a fixed-width unicode dtype.
        cat_encoder.categories_[i] = np.array(categories, dtype=object)

# Number of distinct codes per categorical feature (including '<unknown>');
# saved with the metadata in a later cell.
n_categories = {
    feature: len(categories)
    for feature, categories in zip(full_pipeline.transformers_[1][2], cat_encoder.categories_)
}

In [10]:
# Transform the training features, restore the column labels, cast the
# categorical codes to integers and persist the result as
# df_X_<train_dataset_type>.pkl. The whole step is timed.
with get_elapsed_time():
    df_X_train = pd.DataFrame(
        full_pipeline.transform(df_train[all_feature_names]),
        columns=all_feature_names,
    ).astype(dict.fromkeys(cat_feature_names, 'int'), copy=False)
    df_X_train.to_pickle(os.path.join(DATA_DIR, f'df_X_{train_dataset_type}.pkl'))

Elapsed time: 4815 sec


In [12]:
# Persist the fitted transformer and the feature metadata next to the data.
# The first call was missing a closing parenthesis, which passed `full_pipeline`
# into os.path.join and left the dump_pickle call unterminated (SyntaxError).
# NOTE(review): dump_pickle comes from utils.misc; it is assumed to take
# (path, object), as the metadata call does — confirm.
dump_pickle(os.path.join(DATA_DIR, '_'.join([train_dataset_type, 'pipeline.pkl'])),
            full_pipeline)
dump_pickle(os.path.join(DATA_DIR, '_'.join([train_dataset_type, 'metadata.pkl'])),
            (target_name, num_feature_names, cat_feature_names, n_categories))

In [13]:
# Drop the training frames and force a collection pass before the test split
# is loaded; the result of gc.collect() is bound to `_` so the cell shows no output.
del df_train
del df_y_train
del df_X_train
_ = gc.collect()

In [14]:
# Load the held-out frame (file name pattern: df_<test_dataset_type>.pkl).
df_test = pd.read_pickle(os.path.join(DATA_DIR, f'df_{test_dataset_type}.pkl'))

In [15]:
# Replace every categorical value the fitted encoder does not know with NaN.
cat_encoder = full_pipeline.transformers_[1][1].steps[1][1]
for feature, fitted_categories in zip(full_pipeline.transformers_[1][2], cat_encoder.categories_):
    categories = set(fitted_categories)
    df_test[feature] = df_test[feature].map(
        lambda value: value if value in categories else np.nan
    )

In [16]:
# Persist the held-out target as df_y_<test_dataset_type>.pkl; skipped for the
# quiz split (presumably because it carries no label column — confirm).
if not USE_QUIZ_SET:
    df_y_test = df_test[target_name]
    df_y_test.to_pickle(os.path.join(DATA_DIR, f'df_y_{test_dataset_type}.pkl'))

In [17]:
# Transform the held-out features with the pipeline fitted on the training
# split, restore the column labels, cast the categorical codes to integers and
# persist the result. The whole step is timed.
with get_elapsed_time():
    # Select exactly the feature columns, as is done for fit and for the train
    # transform, so the label column (when present) never reaches the transformer.
    df_X_test = full_pipeline.transform(df_test[all_feature_names])
    df_X_test = pd.DataFrame(df_X_test, columns=all_feature_names)
    df_X_test = df_X_test.astype({feature_name: 'int' for feature_name in cat_feature_names}, copy=False)
    df_X_test.to_pickle(os.path.join(DATA_DIR, '_'.join(['df', 'X', test_dataset_type]) + '.pkl'))

Elapsed time: 126 sec


In [18]:
# Release the transformed test frames; df_y_test only exists when the split
# is not the quiz set.
del df_X_test
if not USE_QUIZ_SET:
    del df_y_test
_ = gc.collect()