> Copyright 2022 University of Luxembourg
> 
> Licensed under the Apache License, Version 2.0 (the "License");  
> you may not use this file except in compliance with the License.  
> You may obtain a copy of the License at  
>
>    https://www.apache.org/licenses/LICENSE-2.0
>
> Unless required by applicable law or agreed to in writing, software  
> distributed under the License is distributed on an "AS IS" BASIS,  
> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
> See the License for the specific language governing permissions and  
> limitations under the License.  
>
***

Author: Andrzej Mizera (andrzej.mizera@uni.lu)

***

# Called to execute one iteration of the leave-one-out validation procedure performed to determine the Euclidean and Mahalanobis error threshold values.

In [None]:
### Three parallel lists, one entry per training dataset:
###   islc_norm_list           - output of getIterationsSinceLastChangeMicro
###   df_values_list           - the same output wrapped in a DataFrame
###   feature_norm_values_list - float32 values of the normal features
islc_norm_list, df_values_list, feature_norm_values_list = [], [], []

for ds in datasets:
    islc_norm_list.append(
        getIterationsSinceLastChangeMicro(ds.dataframe, normal_features)
    )
    # The DataFrame wraps the entry that was appended just above.
    df_values_list.append(pd.DataFrame(islc_norm_list[-1]))
    feature_norm_values_list.append(
        ds.dataframe[normal_features].astype('float32').values
    )

### The left-out normal dataset goes through the same steps; it is the one
### used for generating the error samples.
islc_valid = getIterationsSinceLastChangeMicro(validation_df, normal_features)
df_valid_values = pd.DataFrame(islc_valid)
feature_valid_values = validation_df[normal_features].astype('float32').values

---

### Preparation of the data for training and testing

In [None]:
# Remove any X / XX left over from a previous run so that stale arrays cannot
# leak into the data preparation performed in the following cell.
try:
    del X, XX
except NameError:
    # At least one of the names is not defined; nothing (more) to clean up.
    pass

In [None]:
### Build the windowed training arrays (X, XX) from all training datasets and
### the windowed leave-one-out arrays (X_valid, Y_valid).
###
### Raises ValueError for an unsupported model name or when there is no
### training dataset to build windows from.
if model_name not in ('CNN', 'LSTM', 'LSTM_2', 'AutoEncoder'):
    raise ValueError('Wrong model name!')

# The auto-encoder uses its own windowing helper; all other models share one.
use_autoencoder = (model_name == 'AutoEncoder')

# Collect the per-dataset windows and join them once at the end. This always
# rebuilds X / XX from scratch (no dependence on leftovers from an earlier
# run) and avoids re-copying the growing array on every iteration.
X_parts = []
XX_parts = []
for df_values, temps_normal_values in zip(df_values_list, feature_norm_values_list):
    # Column-wise join of the iterations-since-last-change values and the
    # feature values of one dataset.
    series = np.append(df_values, temps_normal_values, axis=1)

    if use_autoencoder:
        n_features_in, n_features_out, X_part, XX_part = create_subseq_AE(series, window_length)
    else:
        # The same series serves as both the input and the target sequence.
        # NOTE(review): the trailing 1 is presumably the prediction horizon - confirm.
        n_features_in, n_features_out, X_part, XX_part = create_subseq(series, series, window_length, 1)

    X_parts.append(X_part)
    XX_parts.append(XX_part)

if not X_parts:
    raise ValueError('No training data: df_values_list is empty.')

# X and XX are always NumPy arrays here, also when there is a single dataset.
X = np.concatenate(X_parts, axis=0)
XX = np.concatenate(XX_parts, axis=0)

# Same windowing for the leave-one-out dataset.
valid_series = np.append(df_valid_values, feature_valid_values, axis=1)
if use_autoencoder:
    _, _, X_valid, Y_valid = create_subseq_AE(valid_series, window_length)
else:
    _, _, X_valid, Y_valid = create_subseq(valid_series, valid_series, window_length, 1)

print('Model type:', model_name)
print('Number of input features:', n_features_in)
print('Number of output features:', n_features_out)

---
### Model construction and training

In [None]:
# Early-stopping callback: training stops once the monitored quantity has not
# improved by at least `min_delta` for `patience` consecutive epochs, and the
# weights of the best epoch are restored.
# NOTE(review): the monitored quantity is 'loss' (the training loss), not
# 'val_loss', although validation data is passed to fit() - confirm this is intended.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='loss', min_delta=1e-2, patience=10, verbose=0, mode='auto',
    baseline=None, restore_best_weights=True)

In [None]:
# Execute the auxiliary models notebook in this namespace. The cells below use
# a `model` object that is not created in this notebook - presumably it is
# constructed there from `model_name` (TODO confirm).
%run aux_notebooks/models.ipynb

#### Model training

In [None]:
### Optional Keras TensorBoard callback (currently disabled).
#logdir=os.path.join(output_folder,logs,fit,datetime.now().strftime("%Y%m%d-%H%M%S"))
#tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

### Loss function selection: plain MSE is active, the custom loss is the
### commented-out alternative.
model.compile(optimizer='adam', loss="mse")
#model.compile(loss=loss_fun,optimizer='adam')

### Fix the input shape (batch, window_length, n_features_in) and print the
### resulting architecture.
model.build(input_shape=(None, window_length, n_features_in))
model.summary()

### Callbacks used during training.
#cbs = [early_stop,tensorboard_callback]
cbs = [early_stop]

### The leave-one-out windows act as validation data; np.squeeze removes the
### length-one axes from the targets.
validation_inputs = np.array(X_valid)
validation_targets = np.squeeze(np.array(Y_valid))

history = model.fit(
    x=X,
    y=np.squeeze(XX),
    validation_data=(validation_inputs, validation_targets),
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True,
    callbacks=cbs,
)

In [None]:
# Learning curves: training loss and validation loss per epoch.
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Load the TensorBoard notebook extension.
#%load_ext tensorboard

#%tensorboard --logdir logs

### Computation of the prediction errors on the training data subset, i.e., train errors.

In [None]:
# Model output for every training window.
pred = model(X)

# When a single time point ahead is predicted, the model returns
# (num_points, n_features) whereas XX is (num_points, 1, n_features).
# Insert the missing middle axis so that the two can be subtracted.
pred_np = pred.numpy()
if pred_np.ndim == 2:
    pred_np = pred_np[:, np.newaxis, :]

# Signed prediction error per window, time step and feature.
te = pred_np - XX

# Flatten the last two axes: one error row vector per window.
n_windows, n_steps, n_outputs = te.shape
train_errors = te.reshape(n_windows, n_steps * n_outputs)

### Computation of the prediction errors on the leave-one-out dataset.

In [None]:
# Index of the leave-one-out dataframe; used below as the x-axis of the error plots.
cet_valid = validation_df.index

In [None]:
# Model output for every window of the leave-one-out dataset.
pred = model(np.array(X_valid))

# A two-dimensional prediction lacks the time-step axis; add it so that the
# prediction has the same rank as the targets.
pred_np = pred.numpy()
if pred_np.ndim == 2:
    pred_np = pred_np[:, np.newaxis, :]

# Signed prediction error per window, time step and feature.
te = pred_np - np.array(Y_valid)

# Flatten the last two axes: one error row vector per window.
n_windows, n_steps, n_outputs = te.shape
valid_errors = te.reshape(n_windows, n_steps * n_outputs)

In [None]:
# `window_length` zeros that are prepended to the per-window error series
# before plotting (see the plotting cell below).
padding = np.zeros(window_length)

def score(x):
    """Return the variance-weighted sum of squares of the error vector `x`.

    Every squared component of `x` is divided by the matching PCA variance,
    taken from component index `first_higher_order_PCA` onwards, and the
    quotients are summed along the first axis.

    Reads the globals `kPCA`, `pca` and `first_higher_order_PCA`.

    NOTE(review): `x` has to broadcast against the selected variances;
    presumably it holds the error expressed in the higher-order PCA
    components - confirm with the caller.
    """
    if not kPCA:
        pca_variances = pca.explained_variance_[first_higher_order_PCA:]
    else:
        # KernelPCA: `lambdas_` was renamed to `eigenvalues_` in scikit-learn
        # 1.0 and removed in 1.2. Prefer the new name and fall back to the
        # old one so that both old and new versions keep working.
        eigenvalues = getattr(pca, 'eigenvalues_', None)
        if eigenvalues is None:
            eigenvalues = pca.lambdas_
        pca_variances = eigenvalues[first_higher_order_PCA:]

    # np.sum along axis 0 reduces over the same axis as the built-in sum(),
    # without iterating over the array in the interpreter.
    return np.sum(np.divide(np.square(x), pca_variances), axis=0)

def vec_length(x):
    """Return the Euclidean length of the vector `x`.

    The squares are summed along the first axis, so a one-dimensional input
    yields a scalar and a two-dimensional input yields one length per column
    (the same reduction the built-in sum() performs on an array).
    """
    # np.sum reduces in native code instead of element by element in Python.
    return np.sqrt(np.sum(np.square(x), axis=0))

# Per-sample error measure: the PCA-weighted score when the higher-order PCA
# analysis is enabled, the plain Euclidean length otherwise.
error_fun = score if PCA_higher_order_analysis else vec_length

In [None]:
figure(figsize=(20, 7))
ax1 = plt.gca()

# One error value per validation window, preceded by the zero padding.
padded_errors = np.concatenate(
    (padding, np.apply_along_axis(error_fun, 1, valid_errors))
)

if not DeltaLastChangeTimes_analysis:
    # Error curve only, plotted against the sample number.
    plt.plot(padded_errors, color='r', label='Euclidean error')
else:
    # Error curve against the validation index, with the normal features
    # overlaid on a secondary y-axis.
    ax1.plot(cet_valid, padded_errors, color='r', label='Euclidean error')
    ax2 = ax1.twinx()
    ax2.plot(validation_df_full[normal_features])
    ax2.set_ylim([30, 45])

plt.title('Leave-one-out dataset: ' + validation_dataset_date)
plt.xlabel('Timestamp')
plt.ylabel('Euclidean error')

plt.show()

In [None]:
# One error value per row of valid_errors, computed with the selected error function.
validation_error_data = np.apply_along_axis(error_fun, 1, valid_errors)

### Estimation of the Gaussian distribution on the train errors for Mahalanobis distance computations.

In [None]:
# Estimate the mean vector and the covariance matrix of the train errors.
# The covariance is normalised by the number of samples (not by n - 1).
train_error_matrix = np.asarray(train_errors)
n_train_errors = len(train_error_matrix)
if n_train_errors == 0:
    raise ValueError('Cannot estimate the error distribution: train_errors is empty.')

mean = train_error_matrix.mean(axis=0)

# Sum of the outer products (e - mean)(e - mean)^T over all samples, written
# as one matrix product instead of a Python loop.
centered_errors = train_error_matrix - mean
cov = np.dot(centered_errors.T, centered_errors) / n_train_errors

In [None]:
# calculate Mahalanobis distance
def Mahala_distance(x,mean,cov):
    """Return the quadratic form (x - mean) cov^-1 (x - mean)^T.

    Note that no square root is taken, i.e. the result is the squared
    Mahalanobis distance of `x` from the distribution given by `mean` and `cov`.

    Parameters:
        x:    sample vector (or a matrix with one sample per row).
        mean: mean vector of the distribution.
        cov:  covariance matrix of the distribution; must be invertible.

    Raises:
        numpy.linalg.LinAlgError: if `cov` is singular.
    """
    deviation = np.asarray(x) - mean

    # Solve cov * z = deviation^T instead of forming the inverse explicitly:
    # same result, but cheaper and numerically more accurate.
    return np.dot(deviation, np.linalg.solve(cov, deviation.T))

### Compute the Mahalanobis distances for the prediction errors on the validation data subset.

In [None]:
# The inverse covariance is the same for every sample, so invert it once
# instead of once per validation error.
cov_inv = np.linalg.inv(cov)
valid_deviations = np.asarray(valid_errors) - mean

# Row-wise quadratic form (e - mean) cov^-1 (e - mean)^T, i.e. the value
# Mahala_distance returns for each row of valid_errors.
valid_m_dist = np.sum(np.dot(valid_deviations, cov_inv) * valid_deviations, axis=1)

# The series starts with `window_length` zeros, followed by one distance per
# row of valid_errors.
m_dist = [0]*window_length + list(valid_m_dist)

### Plot the Mahalanobis distance for the validation dataset predictions.

In [None]:
fig = figure(figsize=(20, 7))
ax1 = plt.gca()

if not DeltaLastChangeTimes_analysis:
    # Distance curve only, plotted against the sample number.
    plt.plot(m_dist, color='r', label='Mahalanobis Distance')
else:
    # Distance curve against the validation index, with the normal features
    # overlaid on a secondary y-axis.
    ax1.plot(cet_valid, m_dist, color='r', label='Mahalanobis Distance')
    ax2 = ax1.twinx()
    ax2.plot(validation_df_full[normal_features])
    ax2.set_ylim([30, 45])

plt.title('Leave-one-out dataset: ' + validation_dataset_date)
plt.xlabel('Timestamp')
plt.ylabel('Mahalanobis error')

plt.show()

In [None]:
# Drop the `window_length` leading zeros so that only the computed distances remain.
mahalanobis_validation_error_data = m_dist[window_length:]