In [1]:
# Start a timer for the whole notebook. `start` is handed back to
# `stopwatch.stop(start)` in the last cell to print the execution time.
from tools import Stopwatch
stopwatch = Stopwatch()
start = stopwatch.start()

### Load libraries, functions, palette, theme

In [None]:
%run _libraries.ipynb

In [None]:
# Plot `target1` for July 2017 from the training split.
# NOTE(review): `train` is first assigned in the "Data Split" section further
# down, and `%run _functions.ipynb` only runs in the next cell. On a fresh
# kernel (Restart & Run All) this cell will most likely raise NameError unless
# `_libraries.ipynb` or a restored session provides these names — confirm,
# then move this cell below the split or remove it.
plt.plot(
    train.loc['2017-07', 'target1']
)
axis_rstyle()

In [None]:
%run _functions.ipynb

In [None]:
# Name of this notebook stage; reused below as the session name.
dir_current = '01-explore'

In [None]:
# Directory passed to `saveit` for the train/valid/test files saved at the end.
dir_save_files = 'files/'

In [None]:
# Directory passed to `savefig` for every figure saved by this notebook.
dir_save_img = 'docs/img/'

In [None]:
# Identifier handed to `save_session` in the "Save Session" cell.
session_name = dir_current

# Section I. Research

## Load Saved Section if exists

## Load Data

In [None]:
# Read the raw CSV; path is relative to the notebook's working directory.
data_raw = pd.read_csv('data/powerconsumption.csv')

In [None]:
# Preview the first rows exactly as loaded (before any renaming).
data_raw.head()

#### Transform names

In [None]:
data_raw = data_raw.rename(columns={
    'PowerConsumption_Zone1': 'target1',
    'PowerConsumption_Zone2': 'target2',
    'PowerConsumption_Zone3': 'target3'
})

In [None]:
data_raw.columns = map(str.lower, data_raw.columns)

#### Date to index

In [None]:
# Parse the timestamp column into pandas datetimes.
data_raw['datetime'] = pd.to_datetime(data_raw['datetime'])

In [None]:
# Index by timestamp and conform to a regular 10-minute grid; `asfreq`
# inserts NaN rows for any timestamps missing from that grid.
data_raw = data_raw.set_index('datetime', drop=True).asfreq('10min')

In [None]:
# Remove the index label ('datetime') from the index.
data_raw.index.name = None

In [None]:
data_raw.head()

#### Check NaNs

In [None]:
# `is_nan` is a project helper defined outside this notebook section;
# presumably it reports missing values per column — TODO confirm its output.
is_nan(data_raw)

#### Check duplicates

In [None]:
# Show rows whose values repeat an earlier row. `duplicated()` compares the
# column values only; the datetime index is not part of the comparison.
data_raw[data_raw.duplicated()]

## Data Split

In [None]:
data_raw.shape

In [None]:
# Chronological split: January-July 2017 is the training period.
# Rows of `data_raw` outside 2017-01..2017-09 are not assigned to any split.
train = data_raw.loc['2017-01': '2017-07'].copy()

In [None]:
# Re-apply the 10-minute frequency to the slice.
train = train.asfreq('10min').copy()

In [None]:
# August 2017 is the validation period.
valid = data_raw.loc['2017-08'].copy()

In [None]:
valid = valid.asfreq('10min').copy()

In [None]:
# September 2017 is the test period.
test = data_raw.loc['2017-09'].copy()

In [None]:
test = test.asfreq('10min').copy()

In [None]:
# Row/column counts of the three splits.
train.shape

In [None]:
valid.shape

In [None]:
test.shape

## Explore

In [None]:
# Column dtypes and non-null counts of the training split.
train.info()

In [None]:
# `data_describe` is a project helper; shown alongside pandas' own
# `describe()` in the next cell.
data_describe(train)

In [None]:
train.describe()

In [None]:
# Histogram (50 bins) of every training column, laid out three per row.
# `plot_gridplot` is a project helper; the remaining arguments are passed
# through as given — see its definition for their exact meaning.
f = plot_gridplot(
    data=train,
    features=train.columns,
    figsize=(10,6),
    ncols=3,
    kind='hist',
    plot_shape='rectangle',
    hscale=1.6,
    histplot_kwargs={'bins': 50})

### Variables lists

In [None]:
# Predictor column names, in the lower-cased form produced by the rename step.
# NOTE(review): `features` is not referenced again in the visible part of this
# notebook — presumably it is used after the session is saved/restored.
features = [
    'temperature', 'humidity', 'windspeed',
    'generaldiffuseflows', 'diffuseflows'
]

### Datetime features

In [None]:
train['minute'] = train.index.minute
valid['minute'] = valid.index.minute

In [None]:
train['hour'] = train.index.hour
valid['hour'] = valid.index.hour

In [None]:
train['day'] = train.index.day
valid['day'] = valid.index.day

In [None]:
# day numbers since beginning of dataset
train['day_number'] = train.index.day_of_year - train.index.day_of_year.min() + 1
valid['day_number'] = valid.index.day_of_year - valid.index.day_of_year.min() + 1

In [None]:
train['day_of_year'] = train.index.day_of_year
valid['day_of_year'] = valid.index.day_of_year

In [None]:
train['weekday'] = train.index.weekday
valid['weekday'] = valid.index.weekday

In [None]:
train['week'] = train.index.isocalendar().week
train['week'] = train['week'].astype(int)
# train.loc['2017-01-01', 'week'] = 1
valid['week'] = valid.index.isocalendar().week
valid['week'] = valid['week'].astype(int)

In [None]:
train['month'] = train.index.month
valid['month'] = valid.index.month

In [None]:
train["year_month"] = train.index.to_period('M')
# train["year_month"] = train["year_month"].astype(str)
valid["year_month"] = valid.index.to_period('M')

In [None]:
train["year_day"] = train.index.to_period('d')
# train["year_day"] = train["year_day"].astype(str)
valid["year_day"] = valid.index.to_period('d')

In [None]:
train['count_elmnts_by_day'] = train.groupby('day_of_year').cumcount() + 1
valid['count_elmnts_by_day'] = valid.groupby('day_of_year').cumcount() + 1

In [None]:
train

In [None]:
# Keep only as many month labels as there are distinct months in `train`.
# `months_list` is defined outside this notebook section (presumably by one of
# the `%run` notebooks) and is assumed to start at January — TODO confirm.
months_list = months_list[:len(train['month'].unique())]

In [None]:
months_list

### Measurements frequencies check

In [None]:
# Every day of the training split must contain the same number of
# measurements. The per-day counts are computed once and compared directly
# (min/max need no prior sorting); the message reports the offending counts.
rows_per_day = train.groupby('day_number').size()
assert rows_per_day.min() == rows_per_day.max(), (
    f'uneven daily measurement counts: min={rows_per_day.min()}, '
    f'max={rows_per_day.max()}'
)

In [None]:
# Number of rows on the first day; `[1]` is a label lookup for day_number == 1.
train.groupby('day_number').size()[1]

In [None]:
# Rows per weekday (0 = Monday ... 6 = Sunday).
train.groupby('weekday').size()

In [None]:
# Scratch arithmetic: at 10-minute resolution one day is 24 * 6 = 144 rows,
# so a difference of 144 rows is exactly one extra day.
# NOTE(review): 4464 and 4320 are hard-coded, presumably copied from the
# weekday counts above — re-check them if the split changes.
4464 - 4320

In [None]:
# 4464 rows / 144 rows per day = 31 days.
4464/144

In [None]:
# 4320 rows / 144 rows per day = 30 days.
4320/144

In [None]:
# Rows per ISO week.
print(train.groupby('week').size())

In [None]:
train['week'].unique()

In [None]:
# Non-null `target1` values per month.
train.groupby('month').count()['target1']

### Target Variable

In [None]:
plt.hist(train['target1'], bins=100);

In [None]:
for i, m in zip(train['month'].unique(), months_list):
    sns.histplot(
        data=train[train['month']==i],
        x='target1',
        bins=100,
        alpha=1
    );
    plt.title(m)
    plt.show()

In [None]:
plt.hist(train['target1'].diff(1), bins=150);
axis_rstyle(xlim=[-10000, 10000])

In [None]:
train['target1'].diff(1).sort_values()

## Naive Model

##### Predictions

In [None]:
# Persistence forecast: each value is predicted by the previous row's value.
# The first row is dropped because `shift(1)` leaves it as NaN.
y_pred_naive = train['target1'].shift(1)[1:].copy()

In [None]:
# Actual values for the same rows (first row dropped to match).
y_true = train['target1'][1:]

##### R-squared

In [None]:
round(r2_score(y_true, y_pred_naive), 3)

##### MAE

In [None]:
# Rounded up to the next integer.
math.ceil(mean_absolute_error(y_true, y_pred_naive))

##### MAPE

In [None]:
# Multiplied by 100 to express the error in percent.
round(mean_absolute_percentage_error(y_true, y_pred_naive) * 100, 2)

##### RMSE

In [None]:
# Rounded up to the next integer.
math.ceil(root_mean_squared_error(y_true, y_pred_naive))

## Target1 Visualization

### Full Distance with Monthly mean

In [None]:
fig = plt.figure(figsize=(7, 2.5))

# Mean target1 per month with a confidence-interval error bar.
bar_style = dict(
    width=0.4,
    color=palette[-3],
    err_kws={'color': palette[-4]},
)
sns.barplot(data=train, x='month', y='target1', errorbar='ci', **bar_style);

# Month labels instead of month numbers on the x axis.
plt.xticks(arange(len(months_list)), months_list)
axis_rstyle(
    offset_bottom=10, offset_left=20,
    ylim=[27500, 37500], yticks=[27500, 37500, 2500]
)
axis_adjust_barplot(line_hidden=True)

# No axis titles.
plt.xlabel(None)
plt.ylabel(None)
plt.show()

### Full Distance with Weekly mean

In [None]:
# Weekly mean of target1, coloured by month. Rows with ISO week 52 are left
# out (2017-01-01 belongs to week 52 of the previous ISO year).
weekly_rows = train.loc[train['week'] != 52]
sns.lineplot(
    data=weekly_rows, x='week', y='target1',
    hue='month', errorbar='ci', palette=palette[:7]
);

# Integer week ticks, one bin more than the number of distinct weeks.
n_weeks = len(train['week'].unique())
maxnloc = mpl.ticker.MaxNLocator(nbins=n_weeks + 1, integer=True)
ax_current().xaxis.set_major_locator(maxnloc)
axis_rstyle(xslice=[1, -1], grid=False)

# Legend entries are rebuilt from the month labels.
plt.legend(**legend_inline(), **legend_create_handles(labels=months_list))

plt.xlabel(None)
plt.ylabel(None)
plt.show()

### Full Distance with Daily mean

In [None]:
fig = plt.figure()

# Rows from 1 March onwards; x is each timestamp collapsed to its calendar
# day, so seaborn aggregates the measurements of a day into one point.
from_march = train.loc['2017-03-01':]
sns.lineplot(
    data=from_march,
    x=from_march.index.to_period('d').to_timestamp(),
    y='target1',
    hue='month',
    palette=palette[:5]
)

axis_rstyle(yticks=[25000, 40000, 2500], grid=False)

# Primary x axis: day-of-month labels; major ticks on the 1st, minor ticks on
# the 10th and 20th of every month.
ax = ax_current()
xaxis = ax.xaxis
xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%d'))
xaxis.set_major_locator(mpl.dates.DayLocator(bymonthday=[1], interval=1))
xaxis.set_minor_formatter(mpl.dates.DateFormatter(fmt='%d'))
xaxis.set_minor_locator(mpl.dates.DayLocator(bymonthday=[10, 20], interval=1))
ax.tick_params(axis='x', which='major', direction='out', size=4.5, pad=3.5, labelsize=8)
ax.tick_params(axis='x', which='minor', direction='out', size=3, pad=5, labelsize=8)

# Secondary x axis, shifted below the first: month names placed on the 15th.
ax1 = ax.secondary_xaxis('bottom')
bottom_spine = ax1.spines['bottom']
bottom_spine.set_position(('outward', 23))
bottom_spine.set_visible(False)
ax1.tick_params(bottom=False, labelcolor='#6C7984')
ax1.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%B'))
ax1.xaxis.set_major_locator(mpl.dates.DayLocator(bymonthday=15))
ax1.tick_params(axis='x', labelsize=9)

# Legend with one point handle per plotted month (labels 2..6 of months_list).
plt.legend(
    **legend_inline(bbox_to_anchor=(0, 1)),
    **legend_create_handles(
        kind='point', labels=months_list[2:7]))
plt.xlabel(None)
plt.ylabel('target1')

plt.show()

savefig('full_data_lineplot', dir_save_img)

### By Months with Daily mean

#### March

In [None]:
# Number of distinct calendar days among the March rows.
train.loc[train['month'] == 3].index.floor('D').nunique()

In [None]:
fig, ax = plt.subplots(2,1, figsize=(10, 3), sharex=False)

# March rows of the training split, selected once and reused by both panels.
month_rows = train[train['month']==3]

# Top panel: the raw series with month-day tick labels.
ax[0].plot(month_rows.index, month_rows['target1'], color=palette[0]);
fmt = mpl.dates.DateFormatter('%m-%d')
ax[0].xaxis.set_major_formatter(fmt)

axis_rstyle(
    offset_bottom=15, offset_left=10,
    yticks=[20000, 50000, 10000], ylim=[18000, 50000],
    grid=False, ax=ax[0]
)

# Bottom panel: target1 aggregated by day of month (seaborn's default estimate).
sns.lineplot(
    data=month_rows,
    x='day',
    y='target1',
    color=palette[0],
    ax=ax[1]
);

# Mark the Sundays of March 2017 (5, 12, 19, 26). One set_xticks call fixes
# both positions and labels; the former second call without labels repeated
# the same positions and changed nothing.
ax[1].set_xticks(arange(5, 30, 7, True), labels=['Sunday']*4)

axis_rstyle(
    ax=ax[1], offset_bottom=15, offset_left=10,
    yticks=[28000, 36000, 2000], ylim=[28000, 36000],
    grid=False, x_axis_hide=True
)

plt.xlabel(None)
plt.ylabel(None)

plt.subplots_adjust(hspace=0.75)
plt.show()

savefig('march_data', dir_save_img)

#### April

In [None]:
fig, ax = plt.subplots(2,1, figsize=(8, 3), sharex=False)

# April rows of the training split, selected once and reused by both panels.
month_rows = train[train['month']==4]

# Top panel: the raw series with day-of-month tick labels.
ax[0].plot(month_rows.index, month_rows['target1'], color=palette[1])

fmt = mpl.dates.DateFormatter('%d')
ax[0].xaxis.set_major_formatter(fmt)

axis_rstyle(
    offset_bottom=15, offset_left=10,
    yticks=[20000, 50000, 10000], ylim=[18000, 50000],
    grid=False, ax=ax[0])

# Hide the label of the last major tick on the top panel.
ax[0].xaxis.get_major_ticks()[-1].label1.set_visible(False)

# Bottom panel: target1 aggregated by day of month (seaborn's default estimate).
sns.lineplot(
    data=month_rows,
    x='day',
    y='target1',
    color=palette[1],
    ax=ax[1])

# Mark the Sundays of April 2017 (2, 9, 16, 23, 30). One set_xticks call fixes
# both positions and labels; the former second call without labels repeated
# the same positions and changed nothing.
ax[1].set_xticks(arange(2, 30, 7, True), labels=['Sunday']*5)

axis_rstyle(
    ax=ax[1], offset_bottom=15, offset_left=10,
    yticks=[28000, 36000, 2000], ylim=[28000, 36000],
    grid=False, x_axis_hide=True
)

plt.xlabel(None)
plt.ylabel(None)
ax[0].set_title('April')

plt.subplots_adjust(hspace=0.75)
plt.show()

savefig('april_data', dir_save_img)

#### May

In [None]:
fig, ax = plt.subplots(2,1, figsize=(8, 3), sharex=False)

# May rows of the training split, selected once and reused by both panels.
month_rows = train[train['month']==5]

# Top panel: the raw series with day-of-month tick labels.
ax[0].plot(month_rows.index, month_rows['target1'], color=palette[2]);
fmt = mpl.dates.DateFormatter('%d')
ax[0].xaxis.set_major_formatter(fmt)

axis_rstyle(
    offset_bottom=15, offset_left=10,
    yticks=[20000, 50000, 10000], ylim=[18000, 50000],
    grid=False, ax=ax[0]
)
# Hide the label of the last major tick on the top panel.
ax[0].xaxis.get_major_ticks()[-1].label1.set_visible(False)

# Bottom panel: target1 aggregated by day of month (seaborn's default estimate).
sns.lineplot(
    data=month_rows,
    x='day',
    y='target1',
    color=palette[2],
    ax=ax[1])

# Mark the Sundays of May 2017 (7, 14, 21, 28). set_xticks installs fixed tick
# positions itself, so the MultipleLocator that used to be set just before it
# was overwritten immediately and has been dropped.
ax[1].set_xticks(arange(7, 31, 7, True), labels=['Sunday']*4)

axis_rstyle(
    ax=ax[1], offset_bottom=15, offset_left=10,
    yticks=[28000, 36000, 2000], ylim=[28000, 36000],
    grid=False, x_axis_hide=True
)

plt.xlabel(None)
plt.ylabel(None)
ax[0].set_title('May')

plt.subplots_adjust(hspace=0.75)
plt.show()

savefig('may_data', dir_save_img)

#### June

In [None]:
fig, ax = plt.subplots(2,1, figsize=(8, 3), sharex=False)

# June rows of the training split, selected once and reused by both panels.
month_rows = train[train['month']==6]

# Top panel: the raw series with day-of-month tick labels.
ax[0].plot(month_rows.index, month_rows['target1'], color=palette[3])

fmt = mpl.dates.DateFormatter('%d')
ax[0].xaxis.set_major_formatter(fmt)

axis_rstyle(
    offset_bottom=15, offset_left=10,
    yticks=[20000, 50000, 10000], ylim=[20000, 50000],
    grid=False, ax=ax[0])

# Hide the label of the last major tick on the top panel.
ax[0].xaxis.get_major_ticks()[-1].label1.set_visible(False)

# Bottom panel: target1 aggregated by day of month (seaborn's default estimate).
sns.lineplot(
    data=month_rows,
    x='day',
    y='target1',
    color=palette[3],
    ax=ax[1])

# Mark the Sundays of June 2017 (4, 11, 18, 25). set_xticks installs fixed
# tick positions itself, so the MultipleLocator that used to be set just
# before it was overwritten immediately and has been dropped.
ax[1].set_xticks(arange(4, 30, 7, True), labels=['Sunday']*4)

axis_rstyle(
    ax=ax[1], offset_bottom=15, offset_left=10,
    yticks=[28000, 40000, 4000], ylim=[28000, 40000],
    grid=False, x_axis_hide=True)

plt.xlabel(None)
plt.ylabel(None)
ax[0].set_title('June')

plt.subplots_adjust(hspace=0.75)
plt.show()

savefig('june_data', dir_save_img)

#### July

In [None]:
fig, ax = plt.subplots(2,1, figsize=(10, 3), sharex=False)

# July rows of the training split, selected once and reused by both panels.
month_rows = train[train['month']==7]

# Top panel: the raw series with month-day tick labels.
ax[0].plot(month_rows.index, month_rows['target1'], color=palette[4]);
fmt = mpl.dates.DateFormatter('%m-%d')
ax[0].xaxis.set_major_formatter(fmt)

axis_rstyle(
    offset_bottom=15, offset_left=10,
    yticks=[20000, 50000, 10000], ylim=[20000, 50000],
    grid=False, ax=ax[0]
)

# Bottom panel: target1 aggregated by day of month (seaborn's default estimate).
sns.lineplot(
    data=month_rows,
    x='day',
    y='target1',
    color=palette[4],
    ax=ax[1]
);

# Mark the Sundays of July 2017 (2, 9, 16, 23, 30). set_xticks installs fixed
# tick positions itself, so the MultipleLocator that used to be set just
# before it was overwritten immediately and has been dropped.
ax[1].set_xticks(arange(2, 30, 7, True), labels=['Sunday']*5)

axis_rstyle(
    ax=ax[1], offset_bottom=15, offset_left=10,
    yticks=[30000, 40000, 2500], ylim=[30000, 40000],
    grid=False, x_axis_hide=True
)

plt.xlabel(None)
plt.ylabel(None)

plt.subplots_adjust(hspace=0.75)
plt.show()

savefig('july_data', dir_save_img)

### Weekdays mean

Create additional features: is_Friday and is_Sunday

In [None]:
fig = plt.figure(figsize=(8, 1.5))

# Mean target1 per weekday number (0 = Monday ... 6 = Sunday).
sns.lineplot(
    x=train['weekday'],
    y=train['target1']
);
axis_rstyle(xslice=[1,-1], yticks=[30500, 33500], offset_bottom=15, offset_left=10)

# Pin the tick positions 0..6 together with their names. Setting only the
# labels relies on whatever ticks happen to be on the axis and can attach a
# name to the wrong weekday (matplotlib also warns about it).
ax_current().set_xticks(
    range(7),
    labels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.xlabel(None)
plt.show()

savefig('weekdays_data', dir_save_img)

In [None]:
# Sorted distinct weekday numbers present in the training split.
weekdays = np.sort(train['weekday'].unique())

In [None]:
# Grid dimensions for a two-column layout with one panel per weekday.
# NOTE(review): `ncols`/`nrows` are not used by the cells visible below.
ncols = 2
nrows = math.ceil(len(weekdays)/ncols)

In [None]:
fig = plt.figure()
# One line per weekday: mean target1 for each hour of the day. Only `target1`
# is averaged, and only once per weekday, instead of averaging every column
# of the frame twice and then picking one column.
for v in weekdays:
    hourly_mean = (
        train.loc[train['weekday']==v]
        .groupby('hour')['target1']
        .mean()
    )
    plt.plot(
        hourly_mean.index,
        hourly_mean,
        color=palette[v], lw=1, label=weekday_names_dict[v]
    )
leg = plt.legend(**legend_inline())
# Thicker legend handles than the plotted lines, for readability.
for l in leg.legend_handles:
    l.set_linewidth(1.5)
axis_rstyle(xticks=[0, 23, 1])

savefig('data_by_hours', dir_save_img)

### Days mean

- Create additional feature: is_31  
- Anomaly suspect near 2017-06-25

In [None]:
# Mean target1 by day of month.
sns.lineplot(
    x=train['day'],
    y=train['target1'],
    )
# One tick per day of month. The sorted distinct day values are the same
# positions the grouped-mean index provided, without averaging every column
# of the frame just to read its index.
plt.xticks(np.sort(train['day'].unique()))
plt.xlabel(None)
plt.ylabel(None)
axis_rstyle()

plt.show()

In [None]:
fig = plt.figure(figsize=(8, 2))

# Mean target1 for each day of the month.
sns.barplot(
    data=train, x='day', y='target1',
    color=palette[-3], err_kws={'color': palette[-4]}
)
ax = plt.gca()
ax.set_ylim(30000, 35000)
ax.set_xlabel(None)
ax.set_ylabel(None)

# Write the number of non-null values (first column of the per-day count)
# vertically inside each bar, just above the lower y limit.
els_count = train.groupby('day').count().iloc[:, 0]
label_style = dict(
    size=8, color='#FEFEFE', weight='bold',
    ha='center', va='bottom', rotation=90,
)
for bar, count in zip(ax.patches, els_count):
    x_text = bar.get_x()+bar.get_width()/1.75
    ax.text(x_text, 30250, count, **label_style)

axis_rstyle()
ax.tick_params(bottom=False, pad=5)
ax.set_ylabel('target1')

savefig('data_by_days', dir_save_img)

#### Daily

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 2))

# Distribution of target1 for every hour of the day as thin, unfilled boxes
# with small outlier markers.
box_style = dict(
    width=0.25,
    fill=False,
    linewidth=1,
    color=palette[-3],
    flierprops={'markersize': 1},
)
sns.boxplot(y=train['target1'], x=train['hour'], ax=ax, **box_style)
axis_rstyle(
    ax=ax, xticks=[0, 23, 1], yticks=[10000, 50000, 10000],
    margin=True)

# Clear both axis titles, widen the y range, then label the y axis.
ax.set_xlabel(None)
ax.set_ylabel(None)
ax.set_ylim(10000, 55000)
ax.set_ylabel('target1')

plt.subplots_adjust(hspace=0.35)
plt.show()

savefig('data_daily', dir_save_img)

In [None]:
fig = plt.figure()
# One hourly profile per month, plus a dashed horizontal line at that month's
# overall mean in the same colour.
for month_no in arange(1, 7, True):
    month_rows = train.loc[f'2017-{str(month_no).zfill(2)}']
    month_color = palette[month_no-1]
    sns.lineplot(
        x=month_rows['hour'],
        y=month_rows['target1'],
        color=month_color,
        label=month_no
    )
    plt.axhline(
        month_rows['target1'].mean(),
        0.01, 0.99, ls='--', alpha=0.75, color=month_color)

plt.xticks(range(24))
plt.legend(**legend_create_handles(labels=months_list, kind='square'), **legend_inline())
axis_rstyle()

plt.xlabel(None)
plt.ylabel(None)

plt.show()

In [None]:
fig = plt.figure(figsize=(6, 9))
# Stacked histograms of target1 at 03:00-03:50, one panel per month, on a
# common x range so the panels can be compared directly.
for panel in arange(6, True):
    plt.subplot(7, 1, panel+1)
    at_hour_3 = train['hour']==3
    in_month = train['year_month']==f'2017-0{panel+1}'
    sns.histplot(
        x=train.loc[at_hour_3 & in_month, 'target1'],
        color=palette[panel],
        bins=100,
        label=months_list[panel],
        alpha=1
    )

    plt.legend(
        **legend_mid(bbox_to_anchor=(1.03, 1)),
        **legend_create_handles(kind='square', colors=[palette[panel]]))
    plt.xlabel(None)
    plt.ylabel(None)
    plt.xlim(15000, 45000)
    axis_rstyle(offset_bottom=2)
    plt.subplots_adjust(hspace=0.75)

## Differencing

### 1-lag

In [None]:
# First difference of target1: change versus the previous row.
train['target1_diff'] = train['target1'].diff().copy()

In [None]:
# Differenced series (top) against the original series (bottom).
fig = plt.figure(figsize=(10, 5))

plt.subplot(2,1,1)
plt.plot(
    train.index,
    train['target1_diff']
);
plt.subplot(2,1,2)
plt.plot(
    train.index,
    train['target1']
);

In [None]:
plt.hist(train['target1_diff'], bins=100);

In [None]:
# Ten largest drops.
train['target1_diff'].sort_values().head(10)

In [None]:
# Last ten entries of the ascending sort (largest rises; NaN sorts last).
train['target1_diff'].sort_values().tail(10)

### Returns

In [None]:
# Relative change versus the previous row, expressed in percent.
train['target1_returns'] = train['target1'].pct_change(1).mul(100)

In [None]:
# Returns (top) against the original series (bottom).
fig = plt.figure(figsize=(10, 5))

plt.subplot(2,1,1)
plt.plot(
    train.index,
    train['target1_returns']
);
plt.subplot(2,1,2)
plt.plot(
    train.index,
    train['target1']
);

In [None]:
plt.hist(train['target1_returns'], bins=100);

In [None]:
# Ten most negative returns.
train['target1_returns'].sort_values().head(10)

In [None]:
# Last ten entries of the ascending sort (largest returns; NaN sorts last).
train['target1_returns'].sort_values().tail(10)

### Volatility

In [None]:
# Volatility proxy: squared relative change versus the previous row
# (a fraction squared, not percent — unlike `target1_returns`).
train['target1_volatility'] = (train['target1'].pct_change(1)**2).copy()

In [None]:
# Volatility (top) against the original series (bottom).
fig = plt.figure(figsize=(10, 5))

plt.subplot(2,1,1)
plt.plot(
    train.index,
    train['target1_volatility']
);
plt.subplot(2,1,2)
plt.plot(
    train.index,
    train['target1']
);

In [None]:
# Volatility from 1 March onwards; this is the figure saved as 'anomaly_plot'.
fig = plt.figure(figsize=(8, 2.5))

plt.plot(
    train.loc['2017-03-01':].index,
    train.loc['2017-03-01':, 'target1_volatility']
)

axis_rstyle(ylim=[0, 0.31])
plt.show()

savefig('anomaly_plot', dir_save_img)

## Anomalies

##### Anomaly #1
2017-06-25 : 2017-07-02

- The difference in 'diffuseflows' looks suspicious at first, but after a detailed check the conclusion is that it is not the cause of the anomaly.
- In general, it does not look like an accident: volatility and returns are not extreme, and the other targets demonstrate similar behavior over this time period.

__Cause__: protests  
https://www.thenation.com/article/archive/is-morocco-headed-toward-insurrection/

In [None]:
fig, ax = plt.subplots(4,1, figsize=(8, 9))

# Week under inspection, and the (y label, column) shown in each panel.
window = train.loc['2017-06-23':'2017-06-30']
panels = [
    ('Volatility', 'target1_volatility'),
    ('target1', 'target1'),
    ('target2', 'target2'),
    ('target3', 'target3'),
]

for row, (panel_ax, (panel_title, column)) in enumerate(zip(ax, panels)):
    panel_ax.plot(window.index, window[column])

    # Hour labels at 06/12/18 on the primary axis.
    panel_ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%H'))
    panel_ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[6, 12, 18]))
    panel_ax.tick_params(axis='x', which='major', labelsize=7)

    # Month-day labels at noon on the secondary axis.
    axis_second = axis_secondary(ax=panel_ax)
    axis_second.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%m-%d'))
    axis_second.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[12]))

    if row == 0:
        # Only the volatility panel gets a fixed y range and has its first
        # and last hour labels hidden.
        axis_rstyle(xslice=[1, -1], ylim=[0, 0.01], ax=panel_ax)
        major_ticks = panel_ax.xaxis.get_major_ticks()
        major_ticks[0].label1.set_visible(False)
        major_ticks[-1].label1.set_visible(False)
    else:
        axis_rstyle(xslice=[1, -1], ax=panel_ax)

    panel_ax.set_ylabel(panel_title)

plt.subplots_adjust(hspace=0.65)
plt.show()

savefig('anomaly1', dir_save_img)

##### Anomaly #2

index: 2017-04-20 12:10:00  
volatility: 2862.029840

In [None]:
# Weekday name of the first timestamp on 2017-04-20.
train.loc['2017-04-20'].index.day_name()[0]

In [None]:
fig, ax = plt.subplots(4,1, figsize=(8, 9))

# Week under inspection, and the (y label, column) shown in each panel.
window = train.loc['2017-04-17':'2017-04-23']
panels = [
    ('Volatility', 'target1_volatility'),
    ('target1', 'target1'),
    ('target2', 'target2'),
    ('target3', 'target3'),
]

for row, (panel_ax, (panel_title, column)) in enumerate(zip(ax, panels)):
    panel_ax.plot(window.index, window[column])

    # Hour labels at 06/12/18 on the primary axis.
    panel_ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%H'))
    panel_ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[6, 12, 18]))
    panel_ax.tick_params(axis='x', which='major', labelsize=7)

    # Month-day labels at noon on the secondary axis.
    axis_second = axis_secondary(ax=panel_ax)
    axis_second.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%m-%d'))
    axis_second.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[12]))

    if row == 0:
        # Only the volatility panel gets a fixed y range and has its first
        # and last hour labels hidden.
        axis_rstyle(xslice=[1, -1], ylim=[0, 0.31], ax=panel_ax)
        major_ticks = panel_ax.xaxis.get_major_ticks()
        major_ticks[0].label1.set_visible(False)
        major_ticks[-1].label1.set_visible(False)
    else:
        axis_rstyle(xslice=[1, -1], ax=panel_ax)

    panel_ax.set_ylabel(panel_title)

plt.subplots_adjust(hspace=0.65)
plt.show()

savefig('anomaly2', dir_save_img)

##### Anomaly #3

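window: 2017-03-25 : 2017-03-31 (the range plotted below)  
NOTE: the values previously recorded here (index: 2017-04-20 12:10:00, volatility: 2862.029840) are identical to those of Anomaly #2 and look like a copy-paste leftover — confirm the actual timestamp and volatility of this anomaly.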

In [None]:
fig, ax = plt.subplots(4,1, figsize=(8, 9))

# Week under inspection, and the (y label, column) shown in each panel.
window = train.loc['2017-03-25':'2017-03-31']
panels = [
    ('Volatility', 'target1_volatility'),
    ('target1', 'target1'),
    ('target2', 'target2'),
    ('target3', 'target3'),
]

for row, (panel_ax, (panel_title, column)) in enumerate(zip(ax, panels)):
    panel_ax.plot(window.index, window[column])

    # Hour labels at 06/12/18 on the primary axis.
    panel_ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%H'))
    panel_ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[6, 12, 18]))
    panel_ax.tick_params(axis='x', which='major', labelsize=7)

    # Month-day labels at noon on the secondary axis.
    axis_second = axis_secondary(ax=panel_ax)
    axis_second.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%m-%d'))
    axis_second.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[12]))

    if row == 0:
        # Only the volatility panel gets a fixed y range and has its first
        # and last hour labels hidden.
        axis_rstyle(xslice=[1, -1], ylim=[0, 0.025], ax=panel_ax)
        major_ticks = panel_ax.xaxis.get_major_ticks()
        major_ticks[0].label1.set_visible(False)
        major_ticks[-1].label1.set_visible(False)
    else:
        axis_rstyle(xslice=[1, -1], ax=panel_ax)

    panel_ax.set_ylabel(panel_title)

plt.subplots_adjust(hspace=0.65)
plt.show()

savefig('anomaly3', dir_save_img)

##### Anomaly #4

index: 2017-05-31 11:30:00  
volatility: 438.955206

In [None]:
fig, ax = plt.subplots(4,1, figsize=(8, 9))

# Week under inspection, and the (y label, column) shown in each panel.
window = train.loc['2017-05-28':'2017-06-03']
panels = [
    ('Volatility', 'target1_volatility'),
    ('target1', 'target1'),
    ('target2', 'target2'),
    ('target3', 'target3'),
]

for row, (panel_ax, (panel_title, column)) in enumerate(zip(ax, panels)):
    panel_ax.plot(window.index, window[column])

    # Hour labels at 06/12/18 on the primary axis.
    panel_ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%H'))
    panel_ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[6, 12, 18]))
    panel_ax.tick_params(axis='x', which='major', labelsize=7)

    # Month-day labels at noon on the secondary axis.
    axis_second = axis_secondary(ax=panel_ax)
    axis_second.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%m-%d'))
    axis_second.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[12]))

    if row == 0:
        # Only the volatility panel gets a fixed y range and has its first
        # and last hour labels hidden.
        axis_rstyle(xslice=[1, -1], ylim=[0, 0.05], ax=panel_ax)
        major_ticks = panel_ax.xaxis.get_major_ticks()
        major_ticks[0].label1.set_visible(False)
        major_ticks[-1].label1.set_visible(False)
    else:
        axis_rstyle(xslice=[1, -1], ax=panel_ax)

    panel_ax.set_ylabel(panel_title)

plt.subplots_adjust(hspace=0.65)
plt.show()

savefig('anomaly4', dir_save_img)

##### Anomaly #5

index: '2017-06-03':'2017-06-09'

In [None]:
fig, ax = plt.subplots(4,1, figsize=(8, 9))

# Week under inspection, and the (y label, column) shown in each panel.
window = train.loc['2017-06-03':'2017-06-09']
panels = [
    ('Volatility', 'target1_volatility'),
    ('target1', 'target1'),
    ('target2', 'target2'),
    ('target3', 'target3'),
]

for row, (panel_ax, (panel_title, column)) in enumerate(zip(ax, panels)):
    panel_ax.plot(window.index, window[column])

    # Hour labels at 06/12/18 on the primary axis.
    panel_ax.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%H'))
    panel_ax.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[6, 12, 18]))
    panel_ax.tick_params(axis='x', which='major', labelsize=7)

    # Month-day labels at noon on the secondary axis.
    axis_second = axis_secondary(ax=panel_ax)
    axis_second.xaxis.set_major_formatter(mpl.dates.DateFormatter(fmt='%m-%d'))
    axis_second.xaxis.set_major_locator(mpl.dates.HourLocator(byhour=[12]))

    if row == 0:
        # Only the volatility panel gets a fixed y range and has its first
        # and last hour labels hidden.
        axis_rstyle(xslice=[1, -1], ylim=[0, 0.03], ax=panel_ax)
        major_ticks = panel_ax.xaxis.get_major_ticks()
        major_ticks[0].label1.set_visible(False)
        major_ticks[-1].label1.set_visible(False)
    else:
        axis_rstyle(xslice=[1, -1], ax=panel_ax)

    panel_ax.set_ylabel(panel_title)

plt.subplots_adjust(hspace=0.65)
plt.show()

savefig('anomaly5', dir_save_img)

### Scatterplots Matrix

No non-linear relationships

#### Predictors

### P

$x_t = (1 + p_t)\,x_{t-1}$

In [None]:
# p_t = x_t / x_{t-1} - 1, the relative change versus the previous row.
# NOTE(review): this is the same quantity as `pct_change(1)` used for
# `target1_returns` (there scaled by 100) wherever no values are missing.
train['p'] = train['target1'] / train['target1'].shift(1) - 1

In [None]:
plt.hist(train['p'], bins=150);

In [None]:
# Left tail only: drops of more than 5 %.
plt.hist(train.loc[train['p']<-0.05, 'p'], bins=100);

In [None]:
# Right tail only: rises of more than 5 %.
plt.hist(train.loc[train['p']>0.05, 'p'], bins=100);

In [None]:
train['p'].mean()

## Save Data

In [None]:
# Persist the three splits via the project helper `saveit`.
# `train` and `valid` carry the datetime features added above; `train` also
# carries the derived target1_* and `p` columns. `test` is saved as split,
# without any added columns.
saveit(train, 'train_raw', dir_save_files)

In [None]:
saveit(valid, 'valid_raw', dir_save_files)

In [None]:
saveit(test, 'test_raw', dir_save_files)

### Save Session

In [None]:
# Project helper; called with the session name set at the top ('01-explore').
save_session(session_name)

### Execution time

In [None]:
# Elapsed time since the stopwatch was started in the first cell.
print(f'Execution time: {stopwatch.stop(start)}')