# Section II. Feature Engineering

### Load libraries, functions, palette, theme

In [1]:
import os
from pathlib import Path

In [None]:
# load functions
from __functions import *

In [None]:
# load libraries and palette
%run __libraries.ipynb

In [None]:
# Start the stopwatch; `start` is handed to stopwatch_stop() in the last cell.
start = stopwatch_start()

### Variables

In [None]:
# Name of this notebook's stage.
# NOTE(review): not referenced in the cells visible here — confirm it is still needed.
dir_current = '02-feature-engineering'

In [None]:
# Directory passed to loadit() when reading the `datasets` object.
dir_load_files = 'files/'

In [None]:
# Directory passed to saveit() when writing the updated `datasets` object.
dir_save_files = 'files/'

In [None]:
# Output directory for figures.
# NOTE(review): no cell visible here saves an image — confirm usage.
dir_save_img = 'docs/img/'

### Load Files

##### Dictionaries

In [None]:
# Load the saved `datasets` container; the raw splits are extracted from it
# in the following cells.
datasets = loadit('datasets', dir=dir_load_files)

##### Datasets

In [None]:
# Fetch the 'train_raw' entry from `datasets` via the project helper.
train = extract_variable('train_raw', datasets)

In [None]:
# Fetch the 'valid_raw' entry from `datasets` via the project helper.
valid = extract_variable('valid_raw', datasets)

In [None]:
# Fetch the 'test_raw' entry from `datasets` via the project helper.
# NOTE(review): `test` is not used again in the cells visible here.
test = extract_variable('test_raw', datasets)

## Feature Engineering

In [None]:
# Preview the first three rows of the training data.
train.head(3)

#### Minute of day

In [None]:
%%time
# Build 'minute_of_day' for train: 0, 10, 20, ... within each day, in row order.
# The step of 10 assumes one row per 10 minutes — TODO confirm sampling interval.
# Each distinct day is visited once; iterating the raw column would repeat the
# identical masked assignment for every single row of that day.
for day_id in train['day_of_year'].unique():
    rows_of_day = train['day_of_year'] == day_id
    n_rows = int(rows_of_day.sum())
    train.loc[rows_of_day, 'minute_of_day'] = arange(0, n_rows * 10, 10)

In [None]:
%%time
# Build 'minute_of_day' for valid: 0, 10, 20, ... within each day, in row order.
# The step of 10 assumes one row per 10 minutes — TODO confirm sampling interval.
# Each distinct day is visited once; iterating the raw column would repeat the
# identical masked assignment for every single row of that day.
for day_id in valid['day_of_year'].unique():
    rows_of_day = valid['day_of_year'] == day_id
    n_rows = int(rows_of_day.sum())
    valid.loc[rows_of_day, 'minute_of_day'] = arange(0, n_rows * 10, 10)

In [None]:
# Display the new column to check the result for train.
train['minute_of_day']

In [None]:
# Display the new column to check the result for valid.
valid['minute_of_day']

In [None]:
# Plot the target against the minute of the day for the training data.
sns.lineplot(x='minute_of_day', y='target', data=train)

#### Features 'is_Friday' and 'is_Sunday'

In [None]:
# 1 where weekday equals 4, else 0.
# Treating 4 as Friday assumes Monday=0 numbering — TODO confirm how 'weekday' was derived.
train['is_Friday'] = train['weekday'].eq(4).astype(int)

In [None]:
# 1 where weekday equals 6, else 0.
# Treating 6 as Sunday assumes Monday=0 numbering — TODO confirm how 'weekday' was derived.
train['is_Sunday'] = train['weekday'].eq(6).astype(int)

In [None]:
# 1 where weekday equals 4, else 0.
# Treating 4 as Friday assumes Monday=0 numbering — TODO confirm how 'weekday' was derived.
valid['is_Friday'] = valid['weekday'].eq(4).astype(int)

In [None]:
# 1 where weekday equals 6, else 0.
# Treating 6 as Sunday assumes Monday=0 numbering — TODO confirm how 'weekday' was derived.
valid['is_Sunday'] = valid['weekday'].eq(6).astype(int)

#### Feature 'diff_Sunday'

If Sunday:  
- 'diff_Sunday' = mean of the target on ordinary (non-Sunday) days minus mean of the target on Sundays, both taken over the previous month

else:  
- 0

In [None]:
# Rebind `train` to the result of the project helper, called with the 'target'
# column name. NOTE(review): presumably this adds the 'diff_Sunday' column
# described above — confirm against the helper's definition.
train = generate_feature_diff_sunday(train, 'target')

In [None]:
# July 2017 rows whose weekday is not 6 (i.e. not Sunday), as an independent copy.
# The mask is computed from the July slice itself, so it never relies on
# re-aligning a mask built from the whole of `train` onto the shorter slice.
train_nos = train.loc['2017-07'].loc[lambda month: month['weekday'] != 6].copy()

In [None]:
# July 2017 rows whose weekday is 6 (i.e. Sunday), as an independent copy.
# The mask is computed from the July slice itself, so it never relies on
# re-aligning a mask built from the whole of `train` onto the shorter slice.
train_s = train.loc['2017-07'].loc[lambda month: month['weekday'] == 6].copy()

In [None]:
# Per (hour, minute) slot: mean target on non-Sunday July days minus mean
# target on July Sundays. 'target' is selected before aggregating so that only
# that column is averaged; averaging the whole frame first is wasteful and
# fails on non-numeric columns in recent pandas versions.
diff_Sunday_valid = (
    train_nos.groupby(['hour', 'minute'])['target'].mean()
    - train_s.groupby(['hour', 'minute'])['target'].mean()
)

In [None]:
# Start with an all-NaN column; Sunday rows are filled in below and the rest
# are set to 0 afterwards. `np.nan` is used because the `np.NaN` alias no
# longer exists in NumPy 2.0.
valid['diff_Sunday'] = np.nan

In [None]:
# Number of whole Sundays in valid: Sunday row count divided by 144, truncated.
# 144 is presumably the number of 10-minute rows per day (24 * 6) — TODO confirm.
valid_s_number = int((valid['weekday'] == 6).sum()) // 144

In [None]:
# Repeat the per-slot differences once for every Sunday in valid.
# Assumes valid's Sunday rows follow the same (hour, minute) order as the
# grouped result — TODO confirm.
diff_Sunday_valid = [
    slot_value
    for _ in range(valid_s_number)
    for slot_value in diff_Sunday_valid
]

In [None]:
# Write the repeated differences into the Sunday rows of valid; the list
# length has to match the number of Sunday rows.
valid.loc[valid['weekday'].eq(6), 'diff_Sunday'] = diff_Sunday_valid

In [None]:
# Every row still NaN becomes 0: all non-Sunday rows, plus any Sunday row
# whose difference was NaN.
valid['diff_Sunday'] = valid['diff_Sunday'].fillna(value=0)

#### Feature 'is_Day'

In [None]:
# 1 where the hour is strictly between 5 and 21, else 0.
train['is_Day'] = (train['hour'].gt(5) & train['hour'].lt(21)).astype(int)

In [None]:
# 1 where the hour is strictly between 5 and 21, else 0.
valid['is_Day'] = (valid['hour'].gt(5) & valid['hour'].lt(21)).astype(int)

#### Feature 'is_31'

In [None]:
# 1 on the 31st day of a month, else 0.
train['is_31'] = train['day'].eq(31).astype(int)

In [None]:
# 1 on the 31st day of a month, else 0.
valid['is_31'] = valid['day'].eq(31).astype(int)

#### Feature 'is_20hours'

In [None]:
# 1 where hour equals 19, else 0.
# NOTE(review): the column is named 'is_20hours' but tests for 19 — presumably
# the 20th hour of the day (19:00-19:59); confirm this is intended.
train['is_20hours'] = train['hour'].eq(19).astype(int)

In [None]:
# 1 where hour equals 19, else 0.
# NOTE(review): the column is named 'is_20hours' but tests for 19 — presumably
# the 20th hour of the day (19:00-19:59); confirm this is intended.
valid['is_20hours'] = valid['hour'].eq(19).astype(int)

#### Cumulative average of Target

In [None]:
# Running sum of the target within each day, divided row by row by
# 'count_elmnts_by_day'. Raw arrays are used so the division is positional.
# NOTE(review): 'count_elmnts_by_day' is presumably the running row count
# within the day, making this a cumulative mean — confirm.
train['target_cum_avg'] = (
    train.groupby('day_of_year')['target'].cumsum().values
    / train['count_elmnts_by_day'].values
)

In [None]:
# Plot the first 1430 rows (143 * 10) of the cumulative average.
plt.plot(train['target_cum_avg'].iloc[:143 * 10]);

In [None]:
# Move the cumulative average down one row so each row holds the value of the
# row before it. The shift runs over the whole column, not per day, so the
# first row of a day receives the previous day's last value.
train['target_cum_avg'] = train['target_cum_avg'].shift(periods=1)

In [None]:
# Display the target next to its shifted cumulative average for inspection.
train[['target', 'target_cum_avg']]

In [None]:
# Pearson correlation between the shifted cumulative average and the target.
# The first row is skipped because the shift left it as NaN.
scipy.stats.pearsonr(train['target_cum_avg'].iloc[1:], train['target'].iloc[1:])

In [None]:
# Pearson correlation of the cumulative average with itself lagged by one row.
# The first two rows are skipped: both are NaN in the lagged series.
scipy.stats.pearsonr(
    train['target_cum_avg'].shift(1).iloc[2:],
    train['target_cum_avg'].iloc[2:],
)

#### Value in previous month

Value of 'target' from previous month minute-by-minute

In [None]:
# Store the project helper's result for 'target' as 'target_previous_month'.
# NOTE(review): presumably the target value at the same position one month
# earlier — confirm against the helper's definition.
train['target_previous_month'] = generate_feature_previous_month(train, 'target')

In [None]:
# Display the new column to check the result.
train['target_previous_month']

In [None]:
# Visual check for 2017-03-05: the actual target (palette[1]) against the
# previous-month feature (palette[2]) on the same axes.
plt.plot(train.loc['2017-03-05', 'target'], color=palette[1])
plt.plot(train.loc['2017-03-05', 'target_previous_month'], color=palette[2])

In [None]:
# Visual check for 2017-07-05: the actual target (palette[1]) against the
# previous-month feature (palette[2]) on the same axes.
plt.plot(train.loc['2017-07-05', 'target'], color=palette[1])
plt.plot(train.loc['2017-07-05', 'target_previous_month'], color=palette[2])

In [None]:
# Copy the train target from 2017-07-01 onwards into valid by position.
# The raw array is used, so there is no index alignment; the slice has to
# contain exactly as many rows as valid.
# NOTE(review): presumably valid covers the month after July 2017 — confirm.
valid['target_previous_month'] = train['target'].loc['2017-07-01':].values

## Save Data

In [None]:
# Register the engineered training data under a new key, next to the raw entry.
datasets['train_preprocessed'] = train

In [None]:
# Register the engineered validation data under a new key, next to the raw entry.
datasets['valid_preprocessed'] = valid

In [None]:
# Save `datasets` under the same name and directory value ('files/') it was
# loaded with. NOTE(review): presumably this overwrites the stored object — confirm.
saveit(datasets, 'datasets', dir_save_files)

### Execution time

In [None]:
# Stop the stopwatch using the `start` value captured at the top of the notebook.
stopwatch_stop(start)