In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import sys
import zipfile

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder
from tqdm.auto import tqdm, trange

from src.model import tscv, ClippedOutputRegressor
from src.data import get_feature_cols, df_to_X_y, drop_non_features, add_lagged_features


%run constants.py

baseline_reg = joblib.load(os.path.join(MODELS_DIR, 'xgb-baseline.model'))

%matplotlib inline
print("Versions:")
print("  Python: %s" % sys.version)
for module in [pd, np, sns, sklearn]:
    print("  %s: %s" %(module.__name__, module.__version__))

Versions:
  Python: 3.8.2 (default, Jul 16 2020, 14:00:26) 
[GCC 9.3.0]
  pandas: 1.1.1
  numpy: 1.19.1
  seaborn: 0.10.1
  sklearn: 0.23.2


For linear models we have to standardize the features and one-hot encode categorical features.

Let's start with one-hot encoding. All feature columns were created with the prefix `f__`, and categorical features additionally carry the prefix `f__cat__`, which makes it easy to pick them out when building the encoder.

In [45]:
def make_encoder_step(train_set, test_set):
    """Build an unfitted one-hot encoding step for the categorical features.

    Categorical features are the feature columns whose name starts with
    ``f__cat__``. For each of them the category list is fixed up front as the
    sorted union of the values found in ``train_set`` and ``test_set``, so
    the encoder yields the same output columns for both sets and never meets
    an unknown category in either of them.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Frames holding the feature columns. Every categorical feature found
        in ``train_set`` is also looked up in ``test_set``.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted transformer that one-hot encodes the categorical columns and
        passes every other column through unchanged.

    Notes
    -----
    Columns are addressed by their position in ``get_feature_cols(train_set)``,
    so the matrix later handed to the transformer is assumed to contain
    exactly those columns in that order -- TODO confirm against ``df_to_X_y``.
    """
    features = get_feature_cols(train_set)

    # Column positions and their fixed category lists, kept in matching order.
    indexes = []
    categories = []

    for i, feature in enumerate(features):
        if not feature.startswith('f__cat__'):
            continue
        # np.union1d returns the sorted unique values present in either frame.
        categories.append(np.union1d(train_set[feature], test_set[feature]).tolist())
        indexes.append(i)

    onehot = OneHotEncoder(categories=categories)
    return ColumnTransformer([("onehot", onehot, indexes)], remainder='passthrough')

In [13]:
# Load the "008" feature files for the train and test splits.
# PROCESSED_DATA_DIR is presumably defined by constants.py -- it is not assigned in this notebook.
train_set_008 = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'train-set-features-008.parquet'))
test_set_008 = pd.read_parquet(os.path.join(PROCESSED_DATA_DIR, 'test-set-features-008.parquet'))

In [14]:
# Summary statistics of the training frame's numeric columns (sanity check of ranges and counts).
train_set_008.describe()

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_month,f__cat__item_id,f__cat__shop_id,f__cat__date_block_num,month_id,year_id,f__cat__month_id,...,f__item_cnt_15,f__item_cnt_16,f__item_cnt_18,f__item_cnt_20,f__item_cnt_24,f__item_cnt_26,f__item_cnt_28,f__item_cnt_30,f__item_cnt_31,f__item_cnt_32
count,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,...,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0,4284000.0
mean,11019.4,31.64286,23.5,0.2267862,11019.4,31.64286,23.5,5.5,1.5,5.5,...,0.1662414,0.1559412,0.1355292,0.115436,0.06934267,0.04976867,0.03522526,0.02219374,0.01736088,0.009984827
std,6252.631,17.56189,5.766282,1.10505,6252.631,17.56189,5.766282,3.041382,0.5,3.041382,...,3.117029,3.060134,2.915083,2.754693,2.054427,1.540952,1.14973,0.5738286,0.5325478,0.3434243
min,30.0,2.0,14.0,0.0,30.0,2.0,14.0,0.0,1.0,0.0,...,-4.0,-4.0,-4.0,-4.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
25%,5381.5,16.0,18.75,0.0,5381.5,16.0,18.75,3.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11203.0,34.5,23.5,0.0,11203.0,34.5,23.5,5.5,1.5,5.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16071.5,47.0,28.25,0.0,16071.5,47.0,28.25,8.0,2.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,22167.0,59.0,33.0,20.0,22167.0,59.0,33.0,11.0,2.0,11.0,...,1305.0,1305.0,1305.0,1305.0,978.0,820.0,766.0,259.0,259.0,169.0


In [46]:
# Build the encoder from both frames so that category values occurring only in
# the test set are part of the fixed category lists as well.
trans = make_encoder_step(train_set_008, test_set_008)
# Bare expression: display the transformer's repr as the cell output.
trans

ColumnTransformer(remainder='passthrough',
                  transformers=[('onehot',
                                 OneHotEncoder(categories=[[30, 31, 32, 33, 38,
                                                            42, 45, 51, 53, 57,
                                                            59, 62, 64, 67, 69,
                                                            70, 72, 76, 80, 83,
                                                            88, 91, 93, 109,
                                                            138, 140, 149, 153,
                                                            154, 166, ...],
                                                           [2, 3, 4, 5, 6, 7,
                                                            10, 12, 14, 15, 16,
                                                            18, 19, 21, 22, 24,
                                                            25, 26, 28, 31, 34,
                                           

In [34]:
# Unpack the two values returned by the project helper for the training frame
# (presumably the feature matrix and the target -- see src.data.df_to_X_y).
Xtrain, ytrain = df_to_X_y(train_set_008)

In [47]:
# Fit the encoder on the training matrix and display the encoded result;
# the output is only shown here, it is not stored in a variable.
trans.fit_transform(Xtrain, ytrain)

<4284000x5288 sparse matrix of type '<class 'numpy.float64'>'
	with 117215571 stored elements in Compressed Sparse Row format>