In [154]:
# run stopwatch
from tools import Stopwatch
stopwatch = Stopwatch()
start = stopwatch.start()

### Load libraries, functions, palette, theme

In [155]:
%run _libraries.ipynb

In [156]:
%run _functions.ipynb

# Section I. Overview

## Load Saved Section if exists

## Load Data

In [157]:
data_raw = pd.read_csv('data/powerconsumption.csv')

In [158]:
data_raw.head()

Unnamed: 0,Datetime,Temperature,Humidity,WindSpeed,GeneralDiffuseFlows,DiffuseFlows,PowerConsumption_Zone1,PowerConsumption_Zone2,PowerConsumption_Zone3
0,1/1/2017 0:00,6.559,73.8,0.083,0.051,0.119,34055.6962,16128.87538,20240.96386
1,1/1/2017 0:10,6.414,74.5,0.083,0.07,0.085,29814.68354,19375.07599,20131.08434
2,1/1/2017 0:20,6.313,74.5,0.08,0.062,0.1,29128.10127,19006.68693,19668.43373
3,1/1/2017 0:30,6.121,75.0,0.083,0.091,0.096,28228.86076,18361.09422,18899.27711
4,1/1/2017 0:40,5.921,75.7,0.081,0.048,0.085,27335.6962,17872.34043,18442.40964


#### Transform names

In [159]:
data_raw = data_raw.rename(columns={
    'PowerConsumption_Zone1': 'target1',
    'PowerConsumption_Zone2': 'target2',
    'PowerConsumption_Zone3': 'target3'
})

In [160]:
# Lower-case every column label with the vectorised string accessor rather than
# assigning a one-shot `map` iterator to `.columns`.
data_raw.columns = data_raw.columns.str.lower()

#### Date to index

In [161]:
data_raw['datetime'] = pd.to_datetime(data_raw['datetime'])

In [162]:
data_raw = data_raw.set_index(data_raw['datetime'], drop=True)

In [163]:
data_raw.index.name = None

In [164]:
data_raw = data_raw.drop(columns='datetime')

#### Check duplicates

In [165]:
data_raw[data_raw.duplicated()]

Unnamed: 0,temperature,humidity,windspeed,generaldiffuseflows,diffuseflows,target1,target2,target3


## Data Split

In [166]:
data_raw.shape

(52416, 8)

In [167]:
train = data_raw.loc['2017-04': '2017-07'].copy()

In [168]:
valid = data_raw.loc['2017-08'].copy()

In [169]:
test = data_raw.loc['2017-09'].copy()

In [170]:
train.shape

(17568, 8)

In [171]:
valid.shape

(4464, 8)

In [172]:
test.shape

(4320, 8)

## Explore

In [173]:
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17568 entries, 2017-04-01 00:00:00 to 2017-07-31 23:50:00
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   temperature          17568 non-null  float64
 1   humidity             17568 non-null  float64
 2   windspeed            17568 non-null  float64
 3   generaldiffuseflows  17568 non-null  float64
 4   diffuseflows         17568 non-null  float64
 5   target1              17568 non-null  float64
 6   target2              17568 non-null  float64
 7   target3              17568 non-null  float64
dtypes: float64(8)
memory usage: 1.2 MB


In [174]:
data_describe(train)

Unnamed: 0,Type,Count,Unique,NaN,Percentages
temperature,float64,17568,2389,0,0
humidity,float64,17568,3732,0,0
windspeed,float64,17568,174,0,0
generaldiffuseflows,float64,17568,5910,0,0
diffuseflows,float64,17568,6093,0,0
target1,float64,17568,8645,0,0
target2,float64,17568,10894,0,0
target3,float64,17568,9052,0,0


In [175]:
train.describe()

Unnamed: 0,temperature,humidity,windspeed,generaldiffuseflows,diffuseflows,target1,target2,target3
count,17568.0,17568.0,17568.0,17568.0,17568.0,17568.0,17568.0,17568.0
mean,21.55763,67.454427,2.201019,252.164427,96.620113,33510.768175,20631.371047,21237.662735
std,5.027176,17.46473,2.39947,316.968276,138.072108,7138.352746,4966.213004,6880.889438
min,10.25,11.34,0.05,0.004,0.037,16814.98385,8560.081466,9840.97166
25%,17.8175,56.2,0.072,0.07,0.133,27638.154787,16742.933362,16294.73684
50%,21.09,70.4,0.083,61.15,42.41,33711.09921,20690.83503,19223.27273
75%,24.7,82.6,4.916,506.675,138.6,38501.72185,23895.17507,25995.674638
max,40.01,93.1,4.937,1163.0,909.0,51540.19934,37408.86076,47598.32636


In [None]:
f = plot_gridplot(
    data=train,
    features=train.columns,
    figsize=(10,6),
    ncols=3,
    kind='hist',
    plot_shape='rectangle',
    hscale=1.6,
    histplot_kwargs={'bins': 50})

### Variables lists

In [None]:
features = [
    'temperature', 'humidity', 'windspeed',
    'generaldiffuseflows', 'diffuseflows'
]

In [None]:
months_list = ['April', 'May', 'June', 'July']

### Datetime features

In [None]:
train['minute'] = train.index.minute

In [None]:
train['hour'] = train.index.hour

In [None]:
train['day'] = train.index.day

In [None]:
# 1-based running day counter, counted from the earliest day-of-year present in `train`
# (i.e. from the start of the training split, not of the full dataset)
train['day_number'] = train.index.day_of_year - train.index.day_of_year.min() + 1

In [None]:
train['day_of_year'] = train.index.day_of_year

In [None]:
train['weekday'] = train.index.weekday

In [None]:
train['week'] = train.index.isocalendar().week

In [None]:
train['month'] = train.index.month

In [None]:
train["year_month"] = train.index.to_period('M')

In [None]:
# Calendar-day period label for every row; 'D' is the canonical day alias
# (the lowercase 'd' spelling is deprecated in recent pandas releases).
train["year_day"] = train.index.to_period('D')

In [None]:
train['count_elmnts_by_day'] = train.groupby('day_of_year').cumcount() + 1

In [None]:
train.head()

### Measurements frequencies check

In [None]:
train.groupby('day_number').size().sort_values()

In [None]:
# Every day must contain the same number of measurements; fail loudly otherwise.
# The per-day counts are computed once (no sorting needed for min/max).
daily_counts = train.groupby('day_number').size()
assert daily_counts.min() == daily_counts.max(), (
    'daily measurement counts differ: '
    f'min={daily_counts.min()}, max={daily_counts.max()}'
)

### Target Variable

In [None]:
plt.hist(train['target1'], bins=100);

In [None]:
# One histogram of target1 per month, titled with the month name.
for month_number, month_name in zip(train['month'].unique(), months_list):
    month_rows = train[train['month'] == month_number]
    sns.histplot(data=month_rows, x='target1', bins=100, alpha=1)
    plt.title(month_name)
    plt.show()

In [None]:
plt.hist(train['target1'].diff(), bins=100);

## Naive Model

##### Predictions

In [None]:
y_pred_naive = train['target1'].shift(1).copy()

##### MAE

In [None]:
mean_absolute_error(train['target1'][1:], y_pred_naive[1:])

##### MAPE

In [None]:
mean_absolute_percentage_error(train['target1'][1:], y_pred_naive[1:]) * 100

##### RMSE

In [None]:
# RMSE of the naive (previous-value) forecast, in the units of target1.
# The first row is skipped because the shifted prediction is NaN there.
# (The previous call computed the root mean squared *log* error, which is
# RMSLE, not the RMSE this section reports.)
naive_errors = train['target1'][1:] - y_pred_naive[1:]
np.sqrt((naive_errors ** 2).mean())

## Target1: Discretization and Visualization

### Full Distance with Monthly mean

In [None]:
fig = plt.figure(figsize=(6, 2.5))

# Bar per month of target1 with a confidence-interval error bar.
sns.barplot(
    data=train,
    x='month',
    y='target1',
    errorbar='ci',
    width=0.5,
    color=palette[-3],
    err_kws=dict(color=palette[-4])
);
# replace the numeric month ticks by month names
plt.xticks(arange(4), months_list)
axis_rstyle(
    offset_bottom=15,
    x_lim=[-0.5, 4],
    y_lim=[27500, 37500], y_ticks=[27500, 37500, 2500]
)

plt.xlabel(None)
plt.ylabel(None)
plt.show()

### Full Distance with Weekly mean

In [None]:
# target1 aggregated per ISO week, one coloured line per month.
sns.lineplot(
    data=train, x='week', y='target1',
    hue='month', errorbar='ci', palette=palette[:4],
)

# integer ticks only; allow one more bin than there are distinct weeks
week_bins = train['week'].nunique() + 1
maxnloc = mpl.ticker.MaxNLocator(nbins=week_bins, integer=True)
ax_current().xaxis.set_major_locator(maxnloc)
axis_rstyle(x_slice=[1,-1], grid=False)

# month names in the legend instead of month numbers
plt.legend(**legend_inline(), **legend_create_handles(labels=months_list))
plt.xlabel(None)
plt.ylabel(None)
plt.show()

### Full Distance with Daily mean

In [None]:
fig = plt.figure()

# Every timestamp is mapped to the start of its calendar day, so seaborn
# aggregates all readings of a day into one point.
# 'D' is the canonical day alias (lowercase 'd' is deprecated in recent pandas).
sns.lineplot(
    data=train,
    x=train.index.to_period('D').to_timestamp(),
    y='target1',
    hue='month',
    palette=palette[:4],
)

# primary x axis: day of month, one tick per Sunday
ax = ax_current()
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%d'))
ax.xaxis.set_major_locator(mpl.dates.WeekdayLocator(byweekday=mpl.dates.SU, interval=1))

# secondary x axis below the first one: month names, spine and tick marks hidden
ax1 = ax.secondary_xaxis('bottom')
ax1.spines['bottom'].set_position(('outward', 20))
ax1.spines['bottom'].set_visible(False)
ax1.tick_params(bottom=False, labelcolor='#909090')
ax1.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%B'))
ax1.xaxis.set_major_locator(mpl.dates.MonthLocator([4, 5, 6, 7]))

axis_rstyle(x_slice=[1,-1], grid=False)

plt.legend(**legend_inline(), **legend_create_handles(labels=months_list))
plt.xlabel(None)
plt.ylabel(None)

plt.show()

##### Anomaly #1
2017-06-25 : 2017-07-02

The difference in 'diffuseflows' looks suspicious at first, but a detailed check shows that it is not the cause of the anomaly.

In [None]:
# Daily mean of target1 around the suspected anomaly.
# Only the plotted column is aggregated, and only once; the period index is
# converted to timestamps so matplotlib draws a date axis.
anomaly1_daily = (
    train.loc['2017-06-25': '2017-07-02']
    .groupby('year_day')['target1']
    .mean()
    .to_timestamp()
)
plt.plot(anomaly1_daily.index, anomaly1_daily);

In [None]:
plt.plot(
    train.loc['2017-06-24': '2017-06-30'].index,
    train.loc['2017-06-24': '2017-06-30']['target1']
);

In [None]:
train.loc['2017-06-25': '2017-07-02'].groupby('year_day').mean()

In [None]:
fig = plt.figure(figsize=(10, 5))

dateslice = slice('2017-06-20', '2017-06-30')
byhour = [6, 12]
fmttime = '%H'

# rows of the inspected window, selected once and shared by both panels
window = train.loc[dateslice]

# --- top panel: target1 over the window
plt.subplot(2, 1, 1)
plt.plot(window.index, window['target1']);

ax = ax_current()
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt=fmttime))
ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=byhour))

axis_rstyle(offset_left=15)

# --- bottom panel: step-to-step change of 'diffuseflows' over the same window
plt.subplot(2, 1, 2)
plt.plot(window['diffuseflows'].diff())

ax = ax_current()
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt=fmttime))
ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=byhour))

# secondary x axis below the hours: one "day month" label per day
ax1 = ax.secondary_xaxis('bottom')
ax1.spines['bottom'].set_position(('outward', 18))
ax1.spines['bottom'].set_visible(False)
ax1.xaxis.set_major_formatter(mpl.dates.DateFormatter('%d %b'))
ax1.xaxis.set_major_locator(mpl.dates.DayLocator())
ax1.tick_params(axis='x', bottom=False, labelcolor='#909090')

axis_rstyle(offset_left=15)

In [None]:
plt.plot(
    train.index,
    train['diffuseflows'].diff()
);

### By Months with Daily mean

#### April

In [None]:
fig, ax = plt.subplots(2,1, figsize=(10, 4), sharex=False)

# April rows, selected once and shared by both panels
april = train[train['month']==4]

# top panel: every reading of the month
ax[0].plot(april.index, april['target1'], color=palette[0]);
fmt = mpl.dates.DateFormatter('%m-%d')
ax[0].xaxis.set_major_formatter(fmt)

axis_rstyle(
    offset_bottom=15, offset_left=10,
    y_ticks=[20000, 50000, 10000], y_lim=[18000, 50000],
    grid=False, ax=ax[0]
)

# bottom panel: target1 aggregated per day of month
sns.lineplot(
    data=april,
    x='day',
    y='target1',
    color=palette[0],
    ax=ax[1]
);

# weekly ticks labelled as Sundays; a single call sets both positions and labels
ax[1].set_xticks(arange(2, 30, 7, True), labels=['Sunday']*5)

axis_rstyle(
    ax=ax[1], offset_bottom=15, offset_left=10, 
    y_ticks=[28000, 36000, 2000], y_lim=[28000, 36000],
    grid=False, x_spine_hide=True
)

plt.xlabel(None)
plt.ylabel(None)

plt.subplots_adjust(hspace=0.5)
plt.show()

#### May

In [None]:
fig, ax = plt.subplots(2,1, figsize=(10, 4), sharex=False)

# May rows, selected once and shared by both panels
may = train[train['month']==5]

# top panel: every reading of the month
ax[0].plot(may.index, may['target1'], color=palette[1]);
fmt = mpl.dates.DateFormatter('%m-%d')
ax[0].xaxis.set_major_formatter(fmt)

axis_rstyle(
    offset_bottom=15, offset_left=10,
    y_ticks=[20000, 50000, 10000], y_lim=[18000, 50000],
    grid=False, ax=ax[0]
)

# bottom panel: target1 aggregated per day of month
sns.lineplot(
    data=may,
    x='day',
    y='target1',
    color=palette[1],
    ax=ax[1]
);

# weekly ticks labelled as Sundays; set_xticks installs its own fixed locator,
# so no separate MultipleLocator is needed beforehand
ax[1].set_xticks(arange(7, 31, 7, True), labels=['Sunday']*4)

axis_rstyle(
    ax=ax[1], offset_bottom=15, offset_left=10, 
    y_ticks=[28000, 36000, 2000], y_lim=[28000, 36000],
    grid=False, x_spine_hide=True
)

plt.xlabel(None)
plt.ylabel(None)

plt.subplots_adjust(hspace=0.5)
plt.show()

#### June

In [None]:
fig, ax = plt.subplots(2,1, figsize=(10, 4), sharex=False)

# June rows, selected once and shared by both panels
june = train[train['month']==6]

# top panel: every reading of the month
ax[0].plot(june.index, june['target1'], color=palette[2]);
fmt = mpl.dates.DateFormatter('%m-%d')
ax[0].xaxis.set_major_formatter(fmt)

axis_rstyle(
    offset_bottom=15, offset_left=10,
    y_ticks=[20000, 50000, 10000], y_lim=[20000, 50000],
    grid=False, ax=ax[0]
)

# bottom panel: target1 aggregated per day of month
sns.lineplot(
    data=june,
    x='day',
    y='target1',
    color=palette[2],
    ax=ax[1]
);

# weekly ticks labelled as Sundays; set_xticks installs its own fixed locator,
# so no separate MultipleLocator is needed beforehand
ax[1].set_xticks(arange(4, 30, 7, True), labels=['Sunday']*4)

axis_rstyle(
    ax=ax[1], offset_bottom=15, offset_left=10, 
    y_ticks=[28000, 40000, 4000], y_lim=[28000, 40000],
    grid=False, x_spine_hide=True
)

plt.xlabel(None)
plt.ylabel(None)

plt.subplots_adjust(hspace=0.5)
plt.show()

#### July

In [None]:
fig, ax = plt.subplots(2,1, figsize=(10, 4), sharex=False)

# July rows, selected once and shared by both panels
july = train[train['month']==7]

# top panel: every reading of the month
ax[0].plot(july.index, july['target1'], color=palette[3]);
fmt = mpl.dates.DateFormatter('%m-%d')
ax[0].xaxis.set_major_formatter(fmt)

axis_rstyle(
    offset_bottom=15, offset_left=10,
    y_ticks=[20000, 50000, 10000], y_lim=[20000, 50000],
    grid=False, ax=ax[0]
)

# bottom panel: target1 aggregated per day of month
sns.lineplot(
    data=july,
    x='day',
    y='target1',
    color=palette[3],
    ax=ax[1]
);

# weekly ticks labelled as Sundays; set_xticks installs its own fixed locator,
# so no separate MultipleLocator is needed beforehand
ax[1].set_xticks(arange(2, 30, 7, True), labels=['Sunday']*5)

axis_rstyle(
    ax=ax[1], offset_bottom=15, offset_left=10, 
    y_ticks=[30000, 40000, 2500], y_lim=[30000, 40000],
    grid=False, x_spine_hide=True
)

plt.xlabel(None)
plt.ylabel(None)

plt.subplots_adjust(hspace=0.5)
plt.show()

### Weekdays mean

Create additional features: is_Friday and is_Sunday

In [None]:
sns.lineplot(
    x=train['weekday'],
    y=train['target1']
);

In [None]:
weekdays = np.sort(train['weekday'].unique())

In [None]:
ncols = 2
nrows = math.ceil(len(weekdays)/ncols)

In [None]:
fig = plt.figure()

# One line per weekday: mean target1 for each hour of the day.
# Only the plotted column is aggregated, and only once per weekday.
for v in weekdays:
    hourly_mean = train[train['weekday']==v].groupby('hour')['target1'].mean()
    plt.plot(
        hourly_mean.index,
        hourly_mean,
        color=palette[v], lw=1, label=weekday_names_dict[v]
    )

# thicken the legend handles so the colours stay readable
leg = plt.legend(**legend_inline())
for handle in leg.legend_handles:
    handle.set_linewidth(1.5)
axis_rstyle(x_ticks=[0, 24, 1])

#### Features 'is_Friday' and 'is_Sunday'

In [None]:
train['is_Friday'] = (train['weekday']==4).astype(int)

In [None]:
train['is_Sunday'] = (train['weekday']==6).astype(int)

### Days mean

- Create additional feature: is_31  
- Anomaly suspect near 2017-06-25

In [None]:
# target1 aggregated per day of month
sns.lineplot(
    x=train['day'],
    y=train['target1'],
    )
# one tick per day-of-month value present in the data (sorted), taken directly
# from the column instead of aggregating the whole frame just for its index
plt.xticks(np.sort(train['day'].unique()))
plt.xlabel(None)
plt.ylabel(None)
axis_rstyle()

plt.show()

In [None]:
# Bar per day of month of target1, annotated with the number of rows behind each bar.
sns.barplot(
    data=train,
    x='day',
    y='target1',
    color=palette[-3],
    err_kws={'color': palette[-4]}
)
ax = plt.gca()
ax.set_ylim(30000, 38000)
ax.set_xlabel(None)
ax.set_ylabel(None)

# number of rows per day of month; size() counts rows directly instead of the
# non-null entries of whichever column happens to come first
els_count = train.groupby('day').size()
for bar, count in zip(ax.patches, els_count):
    # vertical label near the bottom of each bar
    ax.text(
        bar.get_x()+bar.get_width()/1.75, 30250, count,
        size=8,  color='#FEFEFE', weight='bold',
        ha='center', va='bottom', rotation=90) 

axis_rstyle()

#### Feature 'is_31'

In [None]:
train['is_31'] = (train['day']==31).astype(int)

#### Daily

In [None]:
fig, ax = plt.subplots(2, 1, figsize=(10, 5))

# top panel: hourly mean +/- standard deviation of target1 for weekday == 3
# (Thursday in pandas' Monday=0 convention); the group is built once and reused
weekday3_hourly = train[train['weekday']==3].groupby('hour')['target1']
weekday3_mean = weekday3_hourly.mean()

ax[0].errorbar(
    range(len(weekday3_mean)),
    weekday3_mean,
    weekday3_hourly.std(),
    ls='none',
    marker='o',
    markersize=2.5,
    lw=1,
    capsize=1.5
);
axis_rstyle(
    ax=ax[0], x_ticks=[0, 23, 1], y_ticks=[10000, 60000, 10000],
    margin=True)
ax[0].set_ylim(10000, 60000)

# bottom panel: distribution of target1 per hour over the whole training split
sns.boxplot(
    y=train['target1'],
    x=train['hour'],
    width=0.25,
    fill=False,
    linewidth=1,
    color=palette[-3],
    flierprops=dict(markersize=1.5),
    ax=ax[1]
)
axis_rstyle(
    ax=ax[1], x_ticks=[0, 23, 1], y_ticks=[10000, 60000, 10000],
    margin=True)
ax[1].set_xlabel(None)
ax[1].set_ylabel(None)
ax[1].set_ylim(10000, 60000)

plt.show()

In [None]:
fig = plt.figure()
for i in arange(4,7, True):
    # rows of month `i`, selected once via partial-string indexing
    month_rows = train.loc[f'2017-{str(i).zfill(2)}']
    month_color = palette[i-4]

    # hourly profile of target1 for the month
    sns.lineplot(
        x=month_rows['hour'],
        y=month_rows['target1'],
        color=month_color,
        label=i
    )
    # dashed line at the month's overall mean, in the SAME colour as its curve
    # (it previously used palette[i-5], i.e. the colour of a different month)
    plt.axhline(
        month_rows['target1'].mean(),
        0.01, 0.99, ls='--', alpha=0.75, color=month_color)

plt.xticks(range(24))
plt.legend(**legend_create_handles(labels=months_list), **legend_inline())
axis_rstyle()

plt.xlabel(None)
plt.ylabel(None)

plt.show()

In [None]:
sns.histplot(
    x=train[train['hour']==2]['target1'],
    hue=train[train['hour']==2]['year_month'],
    bins=100
)
plt.legend(
    **legend_create_handles(
        4, kind='rect', alpha=0.5, labels=months_list),
    **legend_inline(4))
axis_rstyle()

plt.xlabel(None)
plt.ylabel(None)

plt.show()

## Stationarity

p < 0.05: there is enough evidence to reject the null hypothesis of non-stationarity, i.e. the process can be treated as stationary.

In [None]:
# NULL: non-stationarity
sm.tsa.stattools.adfuller(train['target1'])

## Trend and Seasonality of aggregated by Day data

In [None]:
# STL decomposition of the daily mean of target1.
# Only the needed column is resampled, and the canonical 'D' alias is used
# (lowercase 'd' is deprecated in recent pandas).
stl_day = statsmodels.tsa.seasonal.STL(train['target1'].resample('D').mean()).fit()

In [None]:
fig = plt.figure(figsize=(10, 8))

# rows 1-3: observed series, trend and seasonal component as lines
plt.subplot(4,1,1)
plt.plot(stl_day.observed)

plt.subplot(4,1,2)
plt.plot(stl_day.trend)

plt.subplot(4,1,3)
plt.plot(stl_day.seasonal)

# row 4: residuals as bars, positioned on the residual series' own index
# instead of re-resampling the whole frame just to obtain the dates
plt.subplot(4,1,4)
plt.bar(x=stl_day.resid.index, height=stl_day.resid, width=0.5)

plt.show()

#### Detrending

##### Differencing

In [None]:
train['target1_diff'] = train['target1'].diff().copy()

In [None]:
fig = plt.figure(figsize=(10, 5))

plt.subplot(2,1,1)
plt.plot(
    train.index,
    train['target1_diff']
);
plt.subplot(2,1,2)
plt.plot(
    train.index,
    train['target1']
);

##### Returns

In [None]:
train['target1_returns'] = train['target1'].pct_change(1).mul(100)

In [None]:
fig = plt.figure(figsize=(10, 5))

plt.subplot(2,1,1)
plt.plot(
    train.index,
    train['target1_returns']
);
plt.subplot(2,1,2)
plt.plot(
    train.index,
    train['target1']
);

##### Volatility

In [None]:
train['target1_volatility'] = (train['target1_returns']**2).copy()

In [None]:
fig = plt.figure(figsize=(10, 5))

plt.subplot(2,1,1)
plt.plot(
    train.index,
    train['target1_volatility']
);
plt.subplot(2,1,2)
plt.plot(
    train.index,
    train['target1']
);

##### Anomaly #2

index: 2017-04-20 12:10:00  
volatility: 2862.029840

In [None]:
train['target1_volatility'].sort_values()

In [None]:
train.loc['2017-04-20 11:40':'2017-04-20 13:0']

In [None]:
plt.plot(
    train.loc['2017-04-20 11:40:00':'2017-04-20 13:00:00'].index,
    train.loc['2017-04-20 11:40:00':'2017-04-20 13:00:00']['target1']
);

In [None]:
fig, ax = plt.subplots(3,1, figsize=(10, 7.5))

for i, v in enumerate(['target1', 'target2', 'target3']):
    ax[i].plot(
        train.loc['2017-04-17':'2017-04-23'].index,
        train.loc['2017-04-17':'2017-04-23'][v]
    );

In [None]:
train.loc['2017-04-20'].index.day_name()[0]

##### Anomaly #3

index: 2017-05-31 11:30:00  
volatility: 438.955206

In [None]:
train['target1_volatility'].sort_values()

In [None]:
train.loc['2017-05-31 11:15':'2017-05-31 11:55']

In [None]:
plt.plot(
    train.loc['2017-05-31 11:00':'2017-05-31 12:30'].index,
    train.loc['2017-05-31 11:00':'2017-05-31 12:30']['target1']
);

In [None]:
fig, ax = plt.subplots(3,1, figsize=(10, 7.5))

for i, v in enumerate(['target1', 'target2', 'target3']):
    ax[i].plot(
        train.loc['2017-05-30':'2017-06-02'].index,
        train.loc['2017-05-30':'2017-06-02'][v]
    );

## Autocorrelation

#### ACF - 40 period

In [None]:
acf40 = ts_acf_calculate(train['target1'], lags=40)

In [None]:
plot_acf(acf40)

In [None]:
ts_acf_last_significant_index(acf40[:, 1:])

#### ACF - 300 period

In [None]:
acf300 = ts_acf_calculate(train['target1'], lags=300)

In [None]:
plot_acf(acf300, transparency_lines=0.25)

#### PACF - 40 period

In [None]:
pacf40 = ts_acf_calculate(train['target1'], lags=40, partial=True)

In [None]:
plot_acf(pacf40, scatter=True)

In [None]:
ts_acf_last_significant_index(pacf40[:, 1:])

### Scatterplots Matrix

No non-linear relationships

In [None]:
ts_scatterplot_matrix(
    train['target1'],
    lags=12, ncols=3, s=0.5, figsize=(9, 7))

In [None]:
scipy.stats.pearsonr(
    train['target1_diff'][2:],
    train['target1_diff'].shift(1)[2:]
)

In [None]:
scipy.stats.pearsonr(
    train['target1_returns'][2:],
    train['target1_returns'].shift(1)[2:]
)

In [None]:
scipy.stats.pearsonr(
    train['target1_volatility'][2:],
    train['target1_volatility'].shift(1)[2:]
)

#### Predictors

### P

$x_t = (1 + p_t)\,x_{t-1}$

In [None]:
train['p'] = train['target1'] / train['target1'].shift(1) - 1

In [None]:
plt.hist(train['p'], bins=150);

In [None]:
plt.hist(train.loc[train['p']<-0.05, 'p'], bins=100);

In [None]:
plt.hist(train.loc[train['p']>0.05, 'p'], bins=100);

## Feature Engineering

In [None]:
train.head(3)

#### Cumulative average of Target1

In [None]:
# Running average of target1 within each day:
# (cumulative sum since the day's first row) / (number of rows seen so far that day).
running_total = train.groupby('day_of_year')['target1'].cumsum().values
rows_so_far = train['count_elmnts_by_day'].values
train['target1_cum_avg'] = running_total / rows_so_far

In [None]:
# First 10 full days of the within-day running average.
# A day holds 144 ten-minute readings (17568 rows / 122 days), so 144*10 rows
# cover exactly ten days; the previous 143*10 cut the tenth day short.
plt.plot(train['target1_cum_avg'][:144*10]);

In [None]:
scipy.stats.pearsonr(
    train['target1_cum_avg'].shift()[1:],
    train['target1'][1:]
)

In [None]:
scipy.stats.pearsonr(
    train['target1_cum_avg'].shift()[1:],
    train['target1_cum_avg'][1:]
)

### Save Session

### Execution time

In [None]:
print(f'Execution time: {stopwatch.stop(start)}')