# Definitions

In [1]:
from pathlib import Path
from itertools import product

from tqdm.notebook import tqdm

import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from category_encoders import OrdinalEncoder

In [2]:
data_path = Path('..', 'data')

In [3]:
def trunc_to_week(dt_series):
    return pd.PeriodIndex(dt_series, freq='W-Sun')

# Data

In [4]:
train = pd.read_csv(data_path.joinpath('train_data.csv'))

In [5]:
train.head()

Unnamed: 0,id,timestamp,rto_day,traffic,region_nm
0,1,2020-08-13,117135.29,638.0,Аваллонэ
1,1,2020-11-26,116102.2,619.0,Аваллонэ
2,2,2021-01-13,102750.37,461.0,Валимар
3,3,2019-07-03,100223.2,603.0,Ильмарин
4,3,2020-12-10,126296.59,624.0,Ильмарин


In [6]:
train.shape

(818000, 5)

In [7]:
train.timestamp = pd.to_datetime(train.timestamp)

In [8]:
train.sort_values(['timestamp', 'id'], inplace=True)

# Model

## Data Preparation

### Aggregates by week-id

In [9]:
aggregates = ['sum', 'min', 'max', 'median', 'std', 'mean']

In [10]:
agg_train = train.groupby([trunc_to_week(train.timestamp), train.id, train.region_nm]).agg(aggregates)

In [11]:
agg_train.columns = ['_'.join(pack) for pack in agg_train.columns]

In [12]:
train = agg_train.reset_index()
train.head()

Unnamed: 0,timestamp,id,region_nm,rto_day_sum,rto_day_min,rto_day_max,rto_day_median,rto_day_std,rto_day_mean,traffic_sum,traffic_min,traffic_max,traffic_median,traffic_std,traffic_mean
0,2018-12-31/2019-01-06,1,Аваллонэ,467010.11,40373.99,104888.52,81102.765,21557.315436,77835.018333,2748.0,254.0,547.0,487.5,106.517604,458.0
1,2018-12-31/2019-01-06,2,Валимар,470225.13,28781.79,106803.71,80882.96,27648.321269,78370.855,2633.0,202.0,543.0,465.0,123.46403,438.833333
2,2018-12-31/2019-01-06,3,Ильмарин,378291.98,30417.66,86967.99,63179.025,20758.499201,63048.663333,2395.0,218.0,481.0,427.0,100.603015,399.166667
3,2018-12-31/2019-01-06,4,Тирион,418346.13,22861.65,98327.38,75424.1,27535.567461,69724.355,1620.0,145.0,326.0,294.0,67.198214,270.0
4,2018-12-31/2019-01-06,5,Аваллонэ,445107.74,39071.07,94935.51,78145.88,19070.442008,74184.623333,2768.0,265.0,550.0,498.5,103.88391,461.333333


In [13]:
train.timestamp = pd.to_datetime(train.timestamp.astype(str).str[:10])

In [14]:
train.head()

Unnamed: 0,timestamp,id,region_nm,rto_day_sum,rto_day_min,rto_day_max,rto_day_median,rto_day_std,rto_day_mean,traffic_sum,traffic_min,traffic_max,traffic_median,traffic_std,traffic_mean
0,2018-12-31,1,Аваллонэ,467010.11,40373.99,104888.52,81102.765,21557.315436,77835.018333,2748.0,254.0,547.0,487.5,106.517604,458.0
1,2018-12-31,2,Валимар,470225.13,28781.79,106803.71,80882.96,27648.321269,78370.855,2633.0,202.0,543.0,465.0,123.46403,438.833333
2,2018-12-31,3,Ильмарин,378291.98,30417.66,86967.99,63179.025,20758.499201,63048.663333,2395.0,218.0,481.0,427.0,100.603015,399.166667
3,2018-12-31,4,Тирион,418346.13,22861.65,98327.38,75424.1,27535.567461,69724.355,1620.0,145.0,326.0,294.0,67.198214,270.0
4,2018-12-31,5,Аваллонэ,445107.74,39071.07,94935.51,78145.88,19070.442008,74184.623333,2768.0,265.0,550.0,498.5,103.88391,461.333333


### Shifts and rollin'

In [15]:
N_window = 10
cols_to_shift = agg_train.columns

In [16]:
for i in range(1, N_window + 1):
    for col in cols_to_shift:
        train[col+f'_{i}'] = train.groupby('id')[col].shift(i)

In [17]:
%%time
for i in range(2, N_window + 1):
    roll_train = train.groupby('id')[cols_to_shift].rolling(i).agg(aggregates)
    roll_train.columns = [f'_roll{i}_id_'.join(pack) for pack in roll_train]
    train[roll_train.columns] = roll_train.reset_index(drop=True)

CPU times: user 4min 23s, sys: 7.4 s, total: 4min 31s
Wall time: 4min 33s


In [18]:
train.shape

(117000, 783)

In [19]:
train = train.drop(train.columns[train.notna().sum() == 0], axis=1)

### targets

In [20]:
for i in range(1, 5):
    train[f'y_{i}'] = train.groupby('id').rto_day_sum.shift(-i)

In [21]:
train[train['id'] == 1]

Unnamed: 0,timestamp,id,region_nm,rto_day_sum,rto_day_min,rto_day_max,rto_day_median,rto_day_std,rto_day_mean,traffic_sum,...,traffic_mean_roll10_id_sum,traffic_mean_roll10_id_min,traffic_mean_roll10_id_max,traffic_mean_roll10_id_median,traffic_mean_roll10_id_std,traffic_mean_roll10_id_mean,y_1,y_2,y_3,y_4
0,2018-12-31,1,Аваллонэ,467010.11,40373.99,104888.52,81102.765,21557.315436,77835.018333,2748.0,...,,,,,,,686159.59,723778.15,707086.52,676487.97
1000,2019-01-07,1,Аваллонэ,686159.59,86364.37,110445.79,98643.730,8278.254464,98022.798571,3932.0,...,2292.000000,207.428571,266.000000,223.428571,19.646098,229.200000,723778.15,707086.52,676487.97,716537.17
2000,2019-01-14,1,Аваллонэ,723778.15,89761.80,117037.51,101985.250,10210.920399,103396.878571,4261.0,...,4706.857143,450.857143,488.428571,473.000000,11.143834,470.685714,707086.52,676487.97,716537.17,721202.65
3000,2019-01-21,1,Аваллонэ,707086.52,92600.56,108242.18,98412.420,6138.624346,101012.360000,4106.0,...,4490.714286,420.571429,484.857143,446.214286,20.084092,449.071429,676487.97,716537.17,721202.65,731562.44
4000,2019-01-28,1,Аваллонэ,676487.97,87147.16,104319.71,96989.430,6704.905268,96641.138571,3935.0,...,3652.000000,314.142857,411.428571,369.214286,31.232375,365.200000,716537.17,721202.65,731562.44,688507.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112000,2021-02-22,1,Аваллонэ,1031926.10,115159.72,180857.52,147722.970,22297.962595,147418.014286,4238.0,...,4157.857143,309.428571,467.285714,411.642857,45.477724,415.785714,1044055.54,972112.55,1024182.15,1007998.40
113000,2021-03-01,1,Аваллонэ,1044055.54,128828.70,170082.25,148302.740,16430.399164,149150.791429,4623.0,...,2793.857143,262.000000,294.428571,281.214286,9.018512,279.385714,972112.55,1024182.15,1007998.40,
114000,2021-03-08,1,Аваллонэ,972112.55,128529.52,152667.25,138640.670,8662.496834,138873.221429,4341.0,...,9135.142857,825.857143,950.285714,925.214286,38.614098,913.514286,1024182.15,1007998.40,,
115000,2021-03-15,1,Аваллонэ,1024182.15,131493.34,160425.55,144470.580,10158.362267,146311.735714,4492.0,...,3104.000000,243.857143,333.285714,322.857143,30.822717,310.400000,1007998.40,,,


In [22]:
x_cols = train.columns[~train.columns.str.startswith('y_')]
x_cols = x_cols[1:]
x_cols

Index(['id', 'region_nm', 'rto_day_sum', 'rto_day_min', 'rto_day_max',
       'rto_day_median', 'rto_day_std', 'rto_day_mean', 'traffic_sum',
       'traffic_min',
       ...
       'traffic_std_roll10_id_max', 'traffic_std_roll10_id_median',
       'traffic_std_roll10_id_std', 'traffic_std_roll10_id_mean',
       'traffic_mean_roll10_id_sum', 'traffic_mean_roll10_id_min',
       'traffic_mean_roll10_id_max', 'traffic_mean_roll10_id_median',
       'traffic_mean_roll10_id_std', 'traffic_mean_roll10_id_mean'],
      dtype='object', length=782)

In [23]:
cat_cols = x_cols[:2]

## Test Routine

In [24]:
Xy = train[x_cols.union(['timestamp', 'y_1'])].dropna()
Xy.head()

Unnamed: 0,id,region_nm,rto_day_max,rto_day_max_1,rto_day_max_10,rto_day_max_2,rto_day_max_3,rto_day_max_4,rto_day_max_5,rto_day_max_6,...,traffic_sum_roll8_id_min,traffic_sum_roll8_id_std,traffic_sum_roll8_id_sum,traffic_sum_roll9_id_max,traffic_sum_roll9_id_mean,traffic_sum_roll9_id_median,traffic_sum_roll9_id_min,traffic_sum_roll9_id_std,traffic_sum_roll9_id_sum,y_1
10000,1,Аваллонэ,128286.3,126755.87,104888.52,111453.61,119860.4,115559.33,108297.48,104319.71,...,2398.0,95.363122,19879.0,2619.0,2464.111111,2415.0,2298.0,108.800786,22177.0,758240.1
10001,2,Валимар,111867.21,156727.4,106803.71,101392.9,116440.6,104756.31,102225.77,104927.83,...,2376.0,100.313793,19840.0,2619.0,2472.777778,2415.0,2376.0,96.303917,22255.0,718593.91
10002,3,Ильмарин,94669.92,111699.43,86967.99,97003.13,91963.81,98301.14,91829.6,93383.4,...,2338.0,93.665724,19559.0,2619.0,2464.222222,2405.0,2338.0,105.097309,22178.0,572710.62
10003,4,Тирион,126704.1,131662.83,98327.38,115617.7,129070.86,118436.5,128661.53,124153.15,...,2338.0,95.522623,19456.0,2605.0,2437.111111,2401.0,2338.0,90.659313,21934.0,826448.47
10004,5,Аваллонэ,114316.88,112685.97,94935.51,117270.26,106767.67,103619.68,111793.83,102883.17,...,2338.0,67.772598,19205.0,2605.0,2423.333333,2398.0,2338.0,93.059121,21810.0,733890.25


In [25]:
train.shape, Xy.shape

((117000, 787), (97846, 784))

## Category Encoding

In [26]:
cols_to_enc = cat_cols.difference(['id'])

In [27]:
cat_enc = OrdinalEncoder(cols=cols_to_enc)

In [28]:
Xy[cols_to_enc] = cat_enc.fit_transform(Xy[cols_to_enc])

  elif pd.api.types.is_categorical(cols):


In [29]:
Xy.head()

Unnamed: 0,id,region_nm,rto_day_max,rto_day_max_1,rto_day_max_10,rto_day_max_2,rto_day_max_3,rto_day_max_4,rto_day_max_5,rto_day_max_6,...,traffic_sum_roll8_id_min,traffic_sum_roll8_id_std,traffic_sum_roll8_id_sum,traffic_sum_roll9_id_max,traffic_sum_roll9_id_mean,traffic_sum_roll9_id_median,traffic_sum_roll9_id_min,traffic_sum_roll9_id_std,traffic_sum_roll9_id_sum,y_1
10000,1,1,128286.3,126755.87,104888.52,111453.61,119860.4,115559.33,108297.48,104319.71,...,2398.0,95.363122,19879.0,2619.0,2464.111111,2415.0,2298.0,108.800786,22177.0,758240.1
10001,2,2,111867.21,156727.4,106803.71,101392.9,116440.6,104756.31,102225.77,104927.83,...,2376.0,100.313793,19840.0,2619.0,2472.777778,2415.0,2376.0,96.303917,22255.0,718593.91
10002,3,3,94669.92,111699.43,86967.99,97003.13,91963.81,98301.14,91829.6,93383.4,...,2338.0,93.665724,19559.0,2619.0,2464.222222,2405.0,2338.0,105.097309,22178.0,572710.62
10003,4,4,126704.1,131662.83,98327.38,115617.7,129070.86,118436.5,128661.53,124153.15,...,2338.0,95.522623,19456.0,2605.0,2437.111111,2401.0,2338.0,90.659313,21934.0,826448.47
10004,5,1,114316.88,112685.97,94935.51,117270.26,106767.67,103619.68,111793.83,102883.17,...,2338.0,67.772598,19205.0,2605.0,2423.333333,2398.0,2338.0,93.059121,21810.0,733890.25


## Data Divide

In [30]:
# es_threshold = '2020-07-01'
test_threshold = '2021-01-01'

In [31]:
# xy_train = Xy[Xy.timestamp < es_threshold]
xy_train = Xy[Xy.timestamp < test_threshold]
xy_train.shape[0] / Xy.shape[0]

0.8962246795985528

In [32]:
xy_test = Xy[Xy.timestamp >= test_threshold]
xy_test.shape[0] / Xy.shape[0]

0.10377532040144717

## Model

In [33]:
model = LGBMRegressor()

In [34]:
int(0.001*len(Xy))

97

In [35]:
params = {'objective': 'mae',
          'bagging_fraction': 1,
          'bagging_freq': 10,
          'feature_fraction': 1,
          'learning_rate': 0.1,
          'max_depth': 15,
          'min_data_in_leaf': int(0.001*len(Xy)),
          'n_estimators': 10_000,
          'n_jobs': 4,
          'num_leaves': 250,
          'reg_alpha': 1,
          'reg_lambda': 1}

In [36]:
model.set_params(**params)

LGBMRegressor(bagging_fraction=1, bagging_freq=10, feature_fraction=1,
              max_depth=15, min_data_in_leaf=97, n_estimators=10000, n_jobs=4,
              num_leaves=250, objective='mae', reg_alpha=1, reg_lambda=1)

In [37]:
def get_xy(df, y_name):
    return df[x_cols], df[y_name]

In [38]:
%%time
model.fit(*get_xy(xy_train, 'y_1'),
#           eval_set=get_xy(xy_es, 'y_1'),
          eval_set=get_xy(xy_test, 'y_1'),
          eval_metric='mae',
          early_stopping_rounds=50,
          categorical_feature=list(cat_cols),
          verbose=50)

New categorical_feature is ['id', 'region_nm']






Training until validation scores don't improve for 50 rounds
[50]	valid_0's l1: 45784.4
[100]	valid_0's l1: 44103.4
[150]	valid_0's l1: 43734.3
[200]	valid_0's l1: 43694.9
[250]	valid_0's l1: 43655.3
Early stopping, best iteration is:
[232]	valid_0's l1: 43634.1
CPU times: user 9min 55s, sys: 11.1 s, total: 10min 7s
Wall time: 2min 53s


LGBMRegressor(bagging_fraction=1, bagging_freq=10, feature_fraction=1,
              max_depth=15, min_data_in_leaf=97, n_estimators=10000, n_jobs=4,
              num_leaves=250, objective='mae', reg_alpha=1, reg_lambda=1)

# Fit Routine

In [39]:
y_list = [f'y_{i}' for i in range(1, 5)]

In [40]:
X = Xy[x_cols]

In [41]:
models_dict = {}
for y_name in y_list:
    Xy = train[x_cols.union([y_name])].dropna()
    Xy[cols_to_enc] = cat_enc.transform(Xy[cols_to_enc])
    X = Xy[x_cols]
    y = Xy[y_name]
    
    model = LGBMRegressor()
    model.set_params(**params)
    model.set_params(n_estimators=200)
    
    model.fit(X, y,
              categorical_feature=list(cat_cols),
              verbose=50)
    models_dict[y_name] =  model



In [42]:
models_dict

{'y_1': LGBMRegressor(bagging_fraction=1, bagging_freq=10, feature_fraction=1,
               max_depth=15, min_data_in_leaf=97, n_estimators=200, n_jobs=4,
               num_leaves=250, objective='mae', reg_alpha=1, reg_lambda=1),
 'y_2': LGBMRegressor(bagging_fraction=1, bagging_freq=10, feature_fraction=1,
               max_depth=15, min_data_in_leaf=97, n_estimators=200, n_jobs=4,
               num_leaves=250, objective='mae', reg_alpha=1, reg_lambda=1),
 'y_3': LGBMRegressor(bagging_fraction=1, bagging_freq=10, feature_fraction=1,
               max_depth=15, min_data_in_leaf=97, n_estimators=200, n_jobs=4,
               num_leaves=250, objective='mae', reg_alpha=1, reg_lambda=1),
 'y_4': LGBMRegressor(bagging_fraction=1, bagging_freq=10, feature_fraction=1,
               max_depth=15, min_data_in_leaf=97, n_estimators=200, n_jobs=4,
               num_leaves=250, objective='mae', reg_alpha=1, reg_lambda=1)}

# Predict

In [49]:
train.shape

(117000, 787)

In [52]:
X_test = train[train.timestamp == train.timestamp.max()][x_cols]

In [54]:
X_test[cols_to_enc] = cat_enc.transform(X_test[cols_to_enc])

In [55]:
predicts = {}
for y_name, model in tqdm(models_dict.items()):
    predicts[y_name] = model.predict(X_test)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [62]:
(X_test['id'] == range(1, 1001)).sum()

1000

In [64]:
submit = pd.DataFrame(predicts)
submit.columns = [f'week_{i}' for i in range(1, 5)]
submit.index = submit.index + 1
submit

Unnamed: 0,week_1,week_2,week_3,week_4
1,1.006679e+06,9.733827e+05,9.982409e+05,9.852648e+05
2,7.300206e+05,7.365802e+05,7.550246e+05,7.475365e+05
3,7.816490e+05,7.425467e+05,7.712100e+05,7.546657e+05
4,1.058174e+06,1.065438e+06,1.091694e+06,1.077743e+06
5,8.021008e+05,7.884684e+05,7.998992e+05,7.932437e+05
...,...,...,...,...
996,1.487675e+06,1.428500e+06,1.495353e+06,1.437696e+06
997,7.516927e+05,7.503687e+05,7.511292e+05,7.425037e+05
998,7.759088e+05,7.688865e+05,7.817272e+05,7.651213e+05
999,1.370663e+06,1.372671e+06,1.298568e+06,1.273041e+06


In [65]:
submit.to_csv(data_path.joinpath('lgbm_submit.csv'), index_label='id')