In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules (e.g. the `utils` package imported below) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import os
import pickle
import pandas as pd
from utils.data import *
from utils.misc import *

In [3]:
# Directory that holds the pickled inputs read by this notebook and receives the
# .libsvm files it writes.
DATA_DIR = os.path.abspath('../../Data/display_advertising_challenge/processed')
# True -> dump 'train+valid+test' as the training file and 'quiz' as the test file;
# no target frame is loaded for the quiz split. Takes precedence over USE_TEST_SET.
USE_QUIZ_SET = False
# True (and USE_QUIZ_SET False) -> dump 'train+valid' / 'test'.
# When both flags are False the splits are 'train' / 'valid'.
USE_TEST_SET = False
# Selects the output file prefix ('ffm' if True, else 'fm') and is forwarded to
# dump_libsvm_file as `use_field`.
USE_FIELD = False
# Forwarded to dump_libsvm_file as `use_hash`.
USE_HASH = False
# Fraction of rows kept from the train split; sampling only happens when < 1.0.
TRAIN_SAMPLING_RATE = 1.0
# Fraction of rows kept from the test split; only applied when < 1.0 and the quiz
# set is not in use.
TEST_SAMPLING_RATE = 1.0

In [4]:
# Choose which split is dumped as the training file and which as the test file.
# USE_QUIZ_SET wins over USE_TEST_SET; with neither set, the plain train/valid pair is used.
if USE_QUIZ_SET:
    train_dataset_type, test_dataset_type = 'train+valid+test', 'quiz'
elif USE_TEST_SET:
    train_dataset_type, test_dataset_type = 'train+valid', 'test'
else:
    train_dataset_type, test_dataset_type = 'train', 'valid'

In [5]:
# Read the target and feature frames of the selected training split (targets first).
y_train_path = os.path.join(DATA_DIR, '_'.join(['df', 'y', train_dataset_type]) + '.pkl')
X_train_path = os.path.join(DATA_DIR, '_'.join(['df', 'X', train_dataset_type]) + '.pkl')

df_y_train = pd.read_pickle(y_train_path)
df_X_train = pd.read_pickle(X_train_path)

In [6]:
# Features and targets are paired purely by row position, so a length mismatch
# would silently misalign them; fail early instead.
assert df_X_train.shape[0] == df_y_train.shape[0], \
    'train features and targets have different numbers of rows'

# Give both frames the same positional 0..n-1 index. A RangeIndex carries the same
# labels as list(range(n)) without first materialising an n-element Python list,
# which matters at this row count.
df_y_train.index = pd.RangeIndex(df_y_train.shape[0])
df_X_train.index = pd.RangeIndex(df_X_train.shape[0])

if TRAIN_SAMPLING_RATE < 1.0:
    # Sample the targets with a fixed seed, then pull the matching feature rows by label
    # so both frames stay aligned.
    df_y_train = df_y_train.sample(frac=TRAIN_SAMPLING_RATE, random_state=42)
    df_X_train = df_X_train.loc[df_y_train.index, :]

In [7]:
# Row count of the (possibly down-sampled) training features.
print('# of obs in sampled train set:', len(df_X_train))

# of obs in sampled train set: 45840617


In [8]:
# Locations of the pickled pipeline object and the metadata saved for the training split.
pipeline_path = os.path.join(DATA_DIR, '_'.join(['pipeline', train_dataset_type]) + '.pkl')
metadata_path = os.path.join(DATA_DIR, '_'.join([train_dataset_type, 'metadata.pkl']))

full_pipeline = load_pickle(pipeline_path)

# The metadata pickle unpacks into exactly four items; the last three are passed to
# dump_libsvm_file further down.
target_name, num_feature_names, cat_feature_names, n_categories = load_pickle(metadata_path)

In [9]:
with get_elapsed_time():
    # The output file prefix follows USE_FIELD.
    if USE_FIELD:
        model_type = 'ffm'
    else:
        model_type = 'fm'

    train_file_name = '_'.join([model_type, 'dataset', train_dataset_type]) + '.libsvm'
    train_dataset_path = os.path.join(DATA_DIR, train_file_name)

    # Write the training split to disk in libsvm format.
    dump_libsvm_file(
        df_X_train,
        df_y_train,
        train_dataset_path,
        num_feature_names,
        cat_feature_names,
        n_categories,
        use_field=USE_FIELD,
        decimals=6,
        use_hash=USE_HASH,
    )

Elapsed time: 24827 sec


In [10]:
# Drop the training frames before the test split is loaded, then force a collection.
del df_y_train
del df_X_train
_ = gc.collect()  # bound to `_` so the collected-object count is not echoed as cell output

In [11]:
# No target frame is loaded for the quiz split, so df_y_test is None in that case.
if USE_QUIZ_SET:
    df_y_test = None
else:
    y_test_path = os.path.join(DATA_DIR, '_'.join(['df', 'y', test_dataset_type]) + '.pkl')
    df_y_test = pd.read_pickle(y_test_path)

X_test_path = os.path.join(DATA_DIR, '_'.join(['df', 'X', test_dataset_type]) + '.pkl')
df_X_test = pd.read_pickle(X_test_path)

In [12]:
if not USE_QUIZ_SET:
    # Features and targets are paired purely by row position, so a length mismatch
    # would silently misalign them; fail early instead.
    assert df_X_test.shape[0] == df_y_test.shape[0], \
        'test features and targets have different numbers of rows'
    # df_y_test is None for the quiz split, so it is only re-indexed here.
    df_y_test.index = pd.RangeIndex(df_y_test.shape[0])

# Positional 0..n-1 index. A RangeIndex carries the same labels as list(range(n))
# without first materialising an n-element Python list.
df_X_test.index = pd.RangeIndex(df_X_test.shape[0])

if not USE_QUIZ_SET and TEST_SAMPLING_RATE < 1.0:
    # Sample the targets with a fixed seed, then pull the matching feature rows by label
    # so both frames stay aligned.
    df_y_test = df_y_test.sample(frac=TEST_SAMPLING_RATE, random_state=42)
    df_X_test = df_X_test.loc[df_y_test.index, :]

In [13]:
# Row count of the (possibly down-sampled) test features.
print('# of obs in sampled test set:', len(df_X_test))

# of obs in sampled test set: 6042135


In [14]:
with get_elapsed_time():
    # Recompute the file prefix here instead of relying on the value left behind by the
    # train-dump cell, so this cell still works when that (very slow) cell is skipped.
    model_type = 'ffm' if USE_FIELD else 'fm'
    test_dataset_path = os.path.join(DATA_DIR, '_'.join([model_type, 'dataset', test_dataset_type]) + '.libsvm')
    # `decimals` is kept equal to the value used for the train dump (6) so both files
    # are written with the same precision.
    # NOTE(review): df_y_test is None when USE_QUIZ_SET is True -- confirm that
    # dump_libsvm_file accepts a missing target.
    dump_libsvm_file(df_X_test, df_y_test, test_dataset_path, num_feature_names, cat_feature_names,
                     n_categories, use_field=USE_FIELD, decimals=6, use_hash=USE_HASH)

In [15]:
# Release the test frames. The feature frame is always dropped; the target name is
# only deleted when a target frame was actually loaded (i.e. not the quiz split).
del df_X_test
if not USE_QUIZ_SET:
    del df_y_test
_ = gc.collect()  # bound to `_` so the collected-object count is not echoed as cell output