# Multivariate time series anomaly detection MVP

In [10]:
from anomaly_detection.evaluate import (
    simple_plots_with_holoviews,
    original_data_plus_detected_anomalies_plot_with_holoviews,
    reconstructed_error_analysis_plot_with_holoviews,
)

from anomaly_detection.get_raw_data import (
    generate_stable_toy_data,
    generate_anomalous_toy_data,
)

from anomaly_detection.train import(
    Autoencoder,
    train_the_model_with_opt_parameters,
    calculate_mean_reconstruction_error_for_a_given_window_size,
)

from anomaly_detection.predict import (
    detect_anomalies_using_trained_model
)
import json

### Set up the parameters from the JSON file

In [11]:
# Parse the experiment settings from the JSON parameter file; the cells below
# read their settings from the resulting dictionary.
with open(
    'parameters_for_toy_data_experiments.json',
    mode='r',
    encoding='utf-8',
) as file:
    config_dict = json.load(file)

In [12]:
# --- Settings for the stable dataset (passed to generate_stable_toy_data) ---
seed_normal_dataset = config_dict['seed_normal_dataset']
start_date_normal_dataset = config_dict['start_date_normal_dataset']
number_of_rows_for_stable_toy_data = config_dict['number_of_rows_for_stable_toy_data']

# --- Settings for the dataset to assess (passed to generate_anomalous_toy_data) ---
seed_data_to_assess = config_dict['seed_data_to_assess']
start_date_data_to_assess = config_dict['start_date_data_to_assess']
# Note: the JSON key is 'rows_of_data_to_assess'; only the variable carries the
# 'number_of_' prefix.
number_of_rows_to_assess = config_dict['rows_of_data_to_assess']
anomaly_indices_spikes = config_dict['anomaly_indices_spikes']
anomaly_indices_drops = config_dict['anomaly_indices_drops']

# --- Settings for window-size search, training and detection ---
window_sizes = config_dict['window_sizes']
variable_of_interest = config_dict['variable_of_interest']
tolerance_for_anomaly = config_dict['tolerance_for_anomaly']

### Create stable toy data and toy data with anomalies

In [13]:
# Build the stable dataset; the cells below use it for the window-size search
# and for training.
df = generate_stable_toy_data(
    seed_for_random=seed_normal_dataset,
    start_date=start_date_normal_dataset,
    number_of_rows=number_of_rows_for_stable_toy_data,
)

In [14]:
# Plot the stable dataset. save_plot=True is passed to the helper, which
# presumably also writes the figure to disk -- TODO confirm.
plot = simple_plots_with_holoviews(
    save_plot=True,
    df=df,
)
plot

{'pH': :Curve   [Time]   (pH),
 'salinity': :Curve   [Time]   (salinity),
 'temperature': :Curve   [Time]   (temperature)}

In [15]:

# Build the dataset to assess, handing the configured spike and drop indices to
# the generator. It returns the data, a frame of the anomalies and their count.
(
    df_anomalous,
    anomalies_df,
    num_implemented_anomalies,
) = generate_anomalous_toy_data(
    seed_for_random=seed_data_to_assess,
    start_date=start_date_data_to_assess,
    number_of_rows=number_of_rows_to_assess,
    anomaly_indices_drops=anomaly_indices_drops,
    anomaly_indices_spikes=anomaly_indices_spikes,
)

In [16]:
# Show the generated data for the variable of interest together with the
# anomalies returned by the generator (ground truth, not model detections).
plot2 = original_data_plus_detected_anomalies_plot_with_holoviews(
    name_of_the_plot='created_anomalies',
    save_plot=True,
    variable_of_interest=variable_of_interest,
    anomalies_df=anomalies_df,
    df=df_anomalous,
)
plot2

### Evaluate candidate window sizes and pick the best one

In [17]:
# Score every candidate window size on the stable dataset, keeping the mean
# reconstruction error reported for each one.
results = {}
for window_size in window_sizes:
    results[window_size] = mean_error = (
        calculate_mean_reconstruction_error_for_a_given_window_size(df, window_size)
    )
    print(f'Window size: {window_size}, Mean Reconstruction Error: {mean_error}')

# The winner is the candidate with the smallest mean reconstruction error
# (the earliest candidate wins a tie).
best_window_size = min(results.items(), key=lambda scored: scored[1])[0]
print(f'Best window size: {best_window_size}')

input dimension:  15
Window size: 5, Mean Reconstruction Error: 0.48864665627479553
input dimension:  30
Window size: 10, Mean Reconstruction Error: 0.4899151623249054
input dimension:  60
Window size: 20, Mean Reconstruction Error: 0.4854147732257843
input dimension:  90
Window size: 30, Mean Reconstruction Error: 0.5139358043670654
input dimension:  105
Window size: 35, Mean Reconstruction Error: 0.5114824175834656
input dimension:  120
Window size: 40, Mean Reconstruction Error: 0.5391653180122375
input dimension:  135
Window size: 45, Mean Reconstruction Error: 0.4723319709300995
input dimension:  150
Window size: 50, Mean Reconstruction Error: 0.5256071090698242
Best window size: 45


### Use the best window size to train the model & plot the reconstruction errors

In [18]:
# Train on the stable dataset with the selected window size. The call returns
# the trained model, the reconstruction data for the training set, and the
# average / standard deviation / upper threshold of the reconstruction error.
(
    trained_model,
    reconstructed_data_training,
    avg_reconstruction_error_training,
    std_reconstruction_error_training,
    upper_threshold_training,
) = train_the_model_with_opt_parameters(
    df=df,
    variable_of_interest=variable_of_interest,
    window_size=best_window_size,
    tolerance_for_anomaly=tolerance_for_anomaly,
    learning_rate=0.001,
    num_epochs=200,
)


Epoch 1, Loss: 558.6651000976562
Epoch 11, Loss: 448.77850341796875
Epoch 21, Loss: 155.63632202148438
Epoch 31, Loss: 27.3554744720459
Epoch 41, Loss: 8.912715911865234
Epoch 51, Loss: 3.6110165119171143
Epoch 61, Loss: 1.6352757215499878
Epoch 71, Loss: 0.9833452701568604
Epoch 81, Loss: 0.6513205170631409
Epoch 91, Loss: 0.5734989047050476
Epoch 101, Loss: 0.5265347361564636
Epoch 111, Loss: 0.5121380090713501
Epoch 121, Loss: 0.5086606740951538
Epoch 131, Loss: 0.5068504214286804
Epoch 141, Loss: 0.5061262845993042
Epoch 151, Loss: 0.5059754848480225
Epoch 161, Loss: 0.5058771371841431
Epoch 171, Loss: 0.5058557987213135
Epoch 181, Loss: 0.5058409571647644
Epoch 191, Loss: 0.5058358907699585
Average Reconstruction Error for pH: 0.4965729277251496
Standard Deviation of Reconstruction Error for pH: 0.9538904368048806
upper threshold for pH: 6.219915548554433


In [19]:
# Plot the reconstruction errors of the training data for the variable of
# interest, passing the average error and upper threshold from training.
plot = reconstructed_error_analysis_plot_with_holoviews(
    name_of_the_plot='training_data',
    save_plot=True,
    upper_threshold=upper_threshold_training,
    avg_error=avg_reconstruction_error_training,
    variable_of_interest=variable_of_interest,
    error_df=reconstructed_data_training,
)
plot

In [20]:
# NOTE(review): disabled duplicate of the generate_anomalous_toy_data call made
# earlier in this notebook. This cell executes nothing and can be deleted.
# df_anomalous, anomalies_df, num_implemented_anomalies = generate_anomalous_toy_data(
#                                                                     number_of_rows=number_of_rows_to_assess,
#                                                                     start_date=start_date_data_to_assess,
#                                                                     anomaly_indices_spikes=anomaly_indices_spikes,
#                                                                     anomaly_indices_drops=anomaly_indices_drops,
#                                                                     seed_for_random=seed_data_to_assess)

In [21]:
# NOTE(review): disabled duplicate of the 'created_anomalies' plot produced
# earlier in this notebook. This cell executes nothing and can be deleted.
# plot2 = original_data_plus_detected_anomalies_plot_with_holoviews(df=df_anomalous,
#                        anomalies_df=anomalies_df,
#                        variable_of_interest=variable_of_interest,
#                        save_plot=True,
#                        name_of_the_plot='created_anomalies')
# plot2

### Use the trained model and detect the anomalies & plot reconstruction errors

In [22]:
# Echo the trained model and the training-derived average error and upper
# threshold, one per line, before running detection.
print(
    trained_model,
    avg_reconstruction_error_training,
    upper_threshold_training,
    sep='\n',
)

# Run the trained model over the dataset to assess, using the upper threshold
# obtained during training. The call returns the reconstruction data, a frame
# of detected anomalies and their count.
reconstructed_df_test, anomalies_df_test, num_detected_anomalies_test = (
    detect_anomalies_using_trained_model(
        df_anomalous,
        trained_model,
        best_window_size,
        variable_of_interest,
        upper_threshold_training,
    )
)

Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=135, out_features=67, bias=True)
    (1): ReLU()
    (2): Linear(in_features=67, out_features=33, bias=True)
    (3): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=33, out_features=67, bias=True)
    (1): ReLU()
    (2): Linear(in_features=67, out_features=135, bias=True)
  )
)
0.4965729277251496
6.219915548554433


In [23]:
# Plot the reconstruction errors of the assessed (test) data for the variable
# of interest. The average error and upper threshold come from the training
# run, so the test errors are shown against the training reference values.
plot = reconstructed_error_analysis_plot_with_holoviews(
    error_df=reconstructed_df_test,
    variable_of_interest=variable_of_interest,
    avg_error=avg_reconstruction_error_training,
    upper_threshold=upper_threshold_training,
    save_plot=True,
    # Was 'training_dataset', which mislabelled this test-data plot; the name
    # now parallels the 'training_data' plot of the training errors.
    name_of_the_plot='test_data',
)
plot

In [24]:
# Re-display the training-derived average error and upper threshold, one per line.
print(
    avg_reconstruction_error_training,
    upper_threshold_training,
    sep='\n',
)

0.4965729277251496
6.219915548554433


In [25]:
# Show the assessed data for the variable of interest together with the
# anomalies detected by the trained model.
plot2 = original_data_plus_detected_anomalies_plot_with_holoviews(
    df=df_anomalous,
    anomalies_df=anomalies_df_test,
    variable_of_interest=variable_of_interest,
    save_plot=True,
    # Fixed typo in the saved plot name (was 'test_dateset').
    name_of_the_plot='test_dataset',
)
plot2