# ЛАБОРАТОРИЯ

# Часть III. Продвинутый подход к прогнозированию

### Оглавление

[Библиотеки и утилиты](#Библиотеки-и-утилиты)

[Загрузка данных](#Загрузка-данных)

[Взаимосвязь рядов](#Взаимосвязь-рядов)

[Формирование данных для обучения](#Формирование-данных-для-обучения)

[Обучающая, тестовая и валидационная выборки](#Обучающая,-тестовая-и-валидационная-выборки)

[Обучение модели](#Обучение-модели)

[Оценка результата](#Оценка-результата)

### Библиотеки и утилиты

In [None]:
import os
import json
import pickle
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from datetime import datetime
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras import backend as K
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM, GRU, TimeDistributed
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from hyperopt import hp, tpe, space_eval
from hyperopt.fmin import fmin
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
print('tensorflow version:', tf.__version__)

# Expose only CUDA device 0 to this process, then report the GPUs that
# TensorFlow can see (nothing is printed when the list is empty).
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    print('device available:', gpu_device)

In [None]:
# Directory where fitted scalers and model checkpoints are written.
MODEL_PATH = './models_adv'
# makedirs(exist_ok=True) replaces the check-then-create pair: it is safe to
# re-run, has no race between the check and the creation, and also creates
# missing parent directories if MODEL_PATH is ever changed to a nested path.
os.makedirs(MODEL_PATH, exist_ok=True)

def set_all_seeds(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and TensorFlow with ``seed`` and
    exports the same value as the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): ``PYTHONHASHSEED`` is read when the interpreter starts, so
    setting it here presumably only affects processes launched afterwards --
    confirm if hash determinism of the running kernel matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

# Seed all generators once, before any data shuffling or weight initialisation.
set_all_seeds(2020)

### Загрузка данных

In [None]:
# Read the tab-separated measurements; the first column becomes the index.
df = pd.read_csv(
    'cpt_power_data.csv',
    sep='\t',
    encoding='utf-8',
    index_col=0,
)
# Convert the timestamp column from text to datetime values.
df['timestamp_value'] = pd.to_datetime(df['timestamp_value'])
df.head()

In [None]:
# Overlay every measuring point on one figure: value over time, one line each.
plt.figure(figsize=(16, 6))
for ch_serial in df['measuringpoint_serial'].unique():
    # Select the rows of this measuring point once and reuse them for x and y.
    channel = df[df['measuringpoint_serial'] == ch_serial]
    plt.plot(channel.timestamp_value, channel.value_text, label=ch_serial)
plt.legend()
plt.show()

### Взаимосвязь рядов

In [None]:
# Reshape from long to wide: one row per timestamp, one column per measuring
# point. The arguments are passed by keyword because positional arguments to
# DataFrame.pivot are deprecated and rejected by pandas 2.x.
df_s = df.pivot(
    index='timestamp_value',
    columns='measuringpoint_serial',
    values='value_text',
).reset_index()
# Keep the timestamp column name and prefix every serial with the name of the
# pivoted column level, producing flat string column names.
df_s.columns = [df_s.columns[0]] + [df_s.columns.name + '_' + str(col) for col in df_s.columns[1:]]
df_s.head()

In [None]:
# One stacked subplot per series of the wide frame.
series_cols = df_s.columns[1:]
plt.figure(figsize=(16, 10 * len(series_cols)))
for i, col in enumerate(series_cols):
    # The grid height must come from df_s (the frame being plotted). The
    # original used df.columns, whose length is unrelated to the number of
    # series and makes plt.subplot fail or mis-size the grid.
    plt.subplot(len(series_cols), 1, i + 1)
    plt.plot(df_s.timestamp_value, df_s[col])
    plt.title(col)
# Explicit show, consistent with the other plotting cells.
plt.show()

### Формирование данных для обучения

In [None]:
# First and last timestamp of the data, cut to the first 10 characters of
# their string form.
dates = df_s['timestamp_value']
start_dt = str(df_s.timestamp_value.min())[:10]
end_dt = str(df_s.timestamp_value.max())[:10]

# Series fed to the model: columns 1, 2, 4 and 6 of the wide frame.
sequence = df_s.values[:, [1, 2, 4, 6]]

# History and forecast horizons in days, converted to a number of samples
# (24 * 2 samples per day -- presumably half-hourly readings; TODO confirm).
days_back = 14
days_fwd = 14
samples_per_day = 24 * 2
look_back = days_back * samples_per_day
look_fwd = days_fwd * samples_per_day

# Number of series columns; split_sequences_feed uses it to cut the targets
# out of the leading columns of each window.
start_index = sequence.shape[1]
print('start index:', start_index)

In [None]:
# Calendar features describe the timestamp `look_fwd` rows ahead of each row.
# The last `look_fwd` rows have no such timestamp and therefore get NaN
# (is_weekend becomes 0 there, because NaN >= 5 is False).
shift = -look_fwd
# Shift only the timestamp column, once, instead of shifting the whole frame
# for every feature.
shifted_ts = df_s['timestamp_value'].shift(periods=shift)
df_s['n_day'] = shifted_ts.dt.day
# Series.dt.week is deprecated (removed in pandas 2.0); isocalendar().week is
# its replacement. It returns a nullable integer column, so cast to float64 to
# get plain NaN for the missing tail, like the other features.
df_s['n_week'] = shifted_ts.dt.isocalendar().week.astype('float64')
df_s['n_month'] = shifted_ts.dt.month
df_s['w_day'] = shifted_ts.dt.weekday
# Weekday numbers 5 and 6 are flagged as weekend.
df_s['is_weekend'] = (df_s['w_day'] >= 5).astype(np.int8)
df_s.head()

In [None]:
def split_sequences_feed(sequence, look_back, look_fwd, start_index):
    """Cut a 2-D array into overlapping (input, target) windows.

    For every start row ``s`` for which both windows fit inside ``sequence``:

    * the input is rows ``s .. s + look_back`` with all columns;
    * the target is the next ``look_fwd`` rows, restricted to the first
      ``start_index`` columns.

    Returns a tuple ``(X, y)`` of stacked NumPy arrays. When not a single
    window fits, both are empty arrays.
    """
    length = sequence.shape[0]
    span = look_back + look_fwd
    # Number of start positions whose target window still ends inside the data
    # (never more than `length`, never negative).
    n_windows = min(length, max(length - span + 1, 0))

    X = np.array([sequence[s:s + look_back, :] for s in range(n_windows)])
    y = np.array([
        sequence[s + look_back:s + span, :start_index]
        for s in range(n_windows)
    ])
    return X, y

In [None]:
# Two independent min-max scalers: `scaler_pwr` for the series values and
# `scaler` for the calendar features. Both are pickled so the same scaling can
# be re-applied later.
# NOTE(review): both scalers are fit on all rows, before the data is split
# into training / validation / test windows -- confirm this is intended.
scaler_pwr = MinMaxScaler(feature_range=(0, 1))
scaler = MinMaxScaler(feature_range=(0, 1))

sequence_scaled = scaler_pwr.fit_transform(sequence)
with open(f'{MODEL_PATH}/scaler_pwr.pkl', 'wb') as file:
    pickle.dump(scaler_pwr, file)
print('total elements (pwr):', len(sequence_scaled))
print('one element of channel (pwr):', sequence_scaled[0])

# The calendar features are taken by position: the last five columns of df_s.
time_features_scaled = scaler.fit_transform(df_s.iloc[:, -5:])
with open(f'{MODEL_PATH}/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)
print('total elements (days features):', len(time_features_scaled))
print('one element of channel (days features):', time_features_scaled[0])

In [None]:
# Append the scaled calendar features as extra columns after the scaled
# series columns.
sequence_scaled_exog = np.concatenate(
    (sequence_scaled, time_features_scaled),
    axis=1,
)
print('shape:', sequence_scaled_exog.shape)
print('one row:', sequence_scaled_exog[0])

In [None]:
# Cut the combined matrix into (history, future) windows.
X, y = split_sequences_feed(
    sequence=sequence_scaled_exog,
    look_back=look_back,
    look_fwd=look_fwd,
    start_index=start_index,
)
print('X dataset shape:', X.shape)
print('y dataset shape:', y.shape)

### Обучающая, тестовая и валидационная выборки

In [None]:
def get_train_test(X, y, train_size=.7):
    """Split ``X`` and ``y`` at the same position, preserving order.

    The first ``train_size`` fraction of the samples (count truncated to an
    integer) forms the first part, the remainder the second part. Nothing is
    shuffled.

    Returns ``X_train, X_test, y_train, y_test``.
    """
    n_train = int(X.shape[0] * train_size)
    (X_train, X_test), (y_train, y_test) = (
        (arr[:n_train], arr[n_train:]) for arr in (X, y)
    )
    return X_train, X_test, y_train, y_test

In [None]:
# The last window is held out as a single test sample (a leading axis of
# length 1 is added so it keeps the batch dimension); all earlier windows are
# divided into training and validation parts.
X_test, y_test = X[-1][np.newaxis], y[-1][np.newaxis]
X_train, X_val, y_train, y_val = get_train_test(X[:-1], y[:-1])
print('train shapes:', X_train.shape, y_train.shape)
print('validation shapes:', X_val.shape, y_val.shape)
print('test shapes:', X_test.shape, y_test.shape)

### Обучение модели

In [None]:
# Size of the third axis of the inputs and of the targets: the number of
# features per time step going into and coming out of the model.
n_features_in, n_features_out = X_train.shape[2], y_train.shape[2]

In [None]:
def get_model(units, look_back, n_features_in,
              dropout, r_dropout, stack=False, loss='mse', n_out=None):
    """Build and compile an LSTM model that emits one output row per time step.

    Args:
        units: number of units in every LSTM layer.
        look_back: number of time steps in one input window.
        n_features_in: number of features per input time step.
        dropout: input dropout rate of the LSTM layers.
        r_dropout: recurrent dropout rate of the LSTM layers.
        stack: if True, two LSTM layers with ReLU activation are stacked;
            otherwise a single LSTM layer with the layer's default activation
            is used.
        loss: loss passed to ``model.compile``.
        n_out: number of output features per time step. When omitted, the
            module-level ``n_features_out`` is used, as before.

    Returns:
        The compiled ``Sequential`` model.

    Every LSTM layer returns full sequences, so the output has ``look_back``
    time steps; the targets must therefore have the same number of steps.
    """
    if n_out is None:
        # Backward-compatible default: value derived from y_train at module level.
        n_out = n_features_out

    recurrent_kwargs = dict(
        units=units,
        return_sequences=True,
        dropout=dropout,
        recurrent_dropout=r_dropout,
    )
    input_shape = (look_back, n_features_in)

    model = Sequential()
    if stack:
        # Only the first layer declares the input shape.
        model.add(LSTM(input_shape=input_shape, activation='relu', **recurrent_kwargs))
        model.add(LSTM(activation='relu', **recurrent_kwargs))
    else:
        model.add(LSTM(input_shape=input_shape, **recurrent_kwargs))
    # The same Dense projection is applied to every time step.
    model.add(TimeDistributed(Dense(n_out)))

    # `learning_rate` replaces the deprecated `lr` alias.
    # NOTE(review): clipvalue and clipnorm are both set, as in the original;
    # recent Keras releases appear to reject that combination -- keep only one
    # of them when upgrading.
    optimizer = optimizers.Adam(learning_rate=.001, clipvalue=.5, clipnorm=1)
    model.compile(loss=loss, optimizer=optimizer)
    return model

In [None]:
# Hyper-parameters of the model trained below: a single (non-stacked) LSTM
# layer, input dropout only, mean-squared-error loss.
model_params = dict(
    units=1024,
    look_back=look_back,
    n_features_in=n_features_in,
    dropout=.25,
    r_dropout=0,
    stack=False,
    loss='mse',
)
model = get_model(**model_params)
model.summary()

In [None]:
%%time
checkpoint_path = f'{MODEL_PATH}/model.hdf5'

# Stop when the validation loss has not improved for 40 epochs.
# restore_best_weights=True puts the best-epoch weights back into `model`;
# without it the model used for prediction afterwards would carry the weights
# of the last epoch, i.e. up to `patience` epochs past the best one.
earlystopper = EarlyStopping(
    monitor='val_loss',
    patience=40,
    verbose=1,
    mode='min',
    restore_best_weights=True
)
# Multiply the learning rate by 0.1 after 20 epochs without improvement,
# never going below 1e-6.
lrreducer = ReduceLROnPlateau(
    monitor='val_loss',
    factor=.1,
    patience=20,
    verbose=1,
    min_lr=1e-6,
    mode='min'
)
# Write the weights to disk every time the validation loss reaches a new minimum.
checkpointer = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='min'
)
callbacks = [earlystopper, checkpointer, lrreducer]
history = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=1000,
    verbose=1,
    validation_data=(X_val, y_val),
    callbacks=callbacks
)

In [None]:
# Learning curves: training loss against validation loss per epoch.
plt.figure(figsize=(8, 4))
plt.plot(history.history['loss'], label='train')
# val_loss is computed on (X_val, y_val), so it is labelled as validation;
# the separate single-window test set is not involved in training.
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()

### Оценка результата

In [None]:
def predict(model, X, batch_size=None, n_steps=30 * 24 * 2):
    """Roll ``model`` forward ``n_steps`` times, feeding each prediction back.

    At every step the model predicts from the current window, the first
    element of that prediction is recorded, the oldest time step of the
    window is dropped and the prediction is appended along axis 1.

    Returns the recorded predictions stacked into one array.

    NOTE(review): appending works only when ``model.predict`` returns a 2-D
    array whose last axis matches the window's; a model that returns a 3-D
    sequence (like the one built by ``get_model``) makes the concatenation
    fail on a dimension mismatch -- confirm which model this helper targets.
    """
    window = X
    outputs = []
    for _ in range(n_steps):
        step = model.predict(window, batch_size=batch_size)
        outputs.append(step[0])
        window = np.concatenate((window[:, 1:, :], step[None, :, :]), axis=1)
    return np.array(outputs)

In [None]:
# Forecast for the single held-out test window (values are still in the
# scaled 0..1 range produced by scaler_pwr).
y_pred = model.predict(X_test)

In [None]:
# Names of the forecast series. The targets were built from columns 1, 2, 4
# and 6 of df_s, so the titles must be looked up through the same positions;
# indexing df_s.columns[1:] directly mislabels the third and fourth subplots.
channel_cols = df_s.columns[[1, 2, 4, 6]]
n_channels = y_test[0].shape[1]

# One subplot per series: actual values against the forecast.
plt.figure(figsize=(16, 20))
for i in range(n_channels):
    plt.subplot(n_channels, 1, i + 1)
    plt.plot(y_test[0][:, i], label='fact')
    plt.plot(y_pred[0][:, i], label='prediction')
    plt.title(channel_cols[i], loc='right')
    plt.legend()
# Explicit show, consistent with the other plotting cells.
plt.show()