# Applying the MVP code into real data

In [1]:
from pathlib import Path
from anomaly_detection.get_raw_data import get_raw_data
from anomaly_detection.helpers import (
    get_postgre_credentials_and_string_from_env_variables,
    save_dictionary_of_dataframes_to_parquet,
    load_and_process_params,
    adjusting_the_name_of_the_variable_of_interest,
    save_dataframes_to_parquet,
    save_model,
    get_month_year_string,
)
from anomaly_detection.process_data import (
    slice_dataframe_by_date,
    normalize_data,
    combine_ro_dataframes_into_single_dataframe,
    combine_and_slice_plost_to_support_final_analyses,
)
from anomaly_detection.evaluate import (
    overlay_plots_with_holoviews,
    reconstructed_error_analysis_plot_with_holoviews,
)

from anomaly_detection.train import (
    find_best_window_size,
    train_the_model_with_opt_parameters,
    calculate_mean_reconstruction_error_for_a_given_window_size,
    objective,
    train_the_model_with_optimized_hyperparameters,
)

from anomaly_detection.predict import detect_anomalies_using_trained_model
import pandas as pd
import datetime

from typing import (
    NewType,
)
import optuna
from optuna.trial import TrialState
from datetime import datetime, timezone
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

Col = NewType("Col", str)
Table = NewType("Table", str)
Param = NewType("Param", str)
ParamMap = NewType("ParamMap", dict[Param, tuple[Table, Col]])
Query = NewType("Query", str)
Train = str
Filename = str

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 100)
pd.set_option("display.min_rows", 100)
pd.set_option("display.expand_frame_repr", True)

## 1- read the parameters (json file)

In [2]:
# Example of how to call the function and use the results
file_path = "parameters_for_real_data_experiments_v2.json"  # Path to your JSON file
(
    train_param_map,
    common_param_map,
    train_format_args,
    variable_of_interest,
    tolerance_for_anomaly,
    train_name,
    start_timestamp,
    end_timestamp,
    start_timestamp_for_training,
    end_timestamp_for_training,
    path_to_save_the_outcomes,
    number_of_chunks,
    num_of_layers,
    window_sizes,
    num_of_epoches,
    lr,
) = load_and_process_params(file_path)

2024-07-30 18:57:09 - INFO - Train Parameter Map:
2024-07-30 18:57:09 - INFO - permeate_flow: Table -> raw_ro{X}_data, Column -> RO_3{Y}_FIT_6{X}1_outPV_Actual
2024-07-30 18:57:09 - INFO - silica_saturation_index: Table -> ai_sbro_optout, Column -> ROi_Train_{X}_sat_sio2_index
2024-07-30 18:57:09 - INFO - calcium_fluoride_saturation_index: Table -> ai_sbro_optout, Column -> ROi_Train_{X}_sat_caf2_index
2024-07-30 18:57:09 - INFO - feed_flow: Table -> raw_rofeed_data, Column -> PMP_6{X}1_FIT_{X}11_outPV_Actual
2024-07-30 18:57:09 - INFO - permeate_conductivity: Table -> raw_romain_data, Column -> RO_3{Y}_AIT_6{X}1_outPV_Actual
2024-07-30 18:57:09 - INFO - Common Parameter Map:
2024-07-30 18:57:09 - INFO - feed_ph_1: Table -> raw_romain_data, Column -> FLTR_601_AIT_01_outPV_Actual
2024-07-30 18:57:09 - INFO - feed_silica: Table -> raw_romain_data, Column -> FLTR_601_AIT_006_outPV_Actual
2024-07-30 18:57:09 - INFO - feed_calcium: Table -> raw_romain_data, Column -> FLTR_601_AIT_008_outPV_

### adjust the variable_of_interest name

In [3]:
variable_of_interest = adjusting_the_name_of_the_variable_of_interest(
    variable_of_interest=variable_of_interest,
    train_param_map=train_param_map,
    train_name=train_name,
)

### creating a time-stamp for this experiment

In [4]:
# Generate current timestamp
timestamp_for_this_experiment = datetime.now().strftime("%Y%m%d_%H%M%S")

## 2- download data to train and test the model

Remember to properly set the credentials to access Icarus on your env:

In [5]:
(
    credentials_dict,
    credentials_string,
) = get_postgre_credentials_and_string_from_env_variables()

In [6]:
data = get_raw_data(
    start_date=start_timestamp,
    end_date=end_timestamp,
    postgres_dsn_string=credentials_string,
    train_param_map=train_param_map,
    common_param_map=common_param_map,
    train_format_args=train_format_args,
)

2024-07-30 18:57:09 - INFO - Using the following datetime range to filter data within postgreSQL: from 2024-01-01 00:00:00+00:00 to 2024-01-31 23:59:59+00:00
2024-07-30 18:57:09 - INFO - Connecting to the database using the postgre string
2024-07-30 18:57:10 - INFO - Querying database table [1/6]
2024-07-30 18:57:20 - INFO - Finished querying database table [1/6]
2024-07-30 18:57:20 - INFO - Querying database table [2/6]
2024-07-30 18:57:20 - INFO - Finished querying database table [2/6]
2024-07-30 18:57:20 - INFO - Querying database table [3/6]
2024-07-30 18:57:30 - INFO - Finished querying database table [3/6]
2024-07-30 18:57:30 - INFO - Querying database table [4/6]
2024-07-30 18:57:44 - INFO - Finished querying database table [4/6]
2024-07-30 18:57:44 - INFO - Querying database table [5/6]
2024-07-30 18:57:54 - INFO - Finished querying database table [5/6]
2024-07-30 18:57:54 - INFO - Querying database table [6/6]
2024-07-30 18:58:04 - INFO - Finished querying database table [6/6]

### saving data for future reference

In [7]:
variable_names = list(data.keys())
base_path = f"{path_to_save_the_outcomes}/tmp/data/{variable_of_interest}/"
month_year_string = get_month_year_string(start_timestamp)

save_dictionary_of_dataframes_to_parquet(
    data=data,
    variable_names=variable_names,
    base_path=base_path,
    specific_suffix=month_year_string,
    time_stemp=timestamp_for_this_experiment,
)

2024-07-30 18:58:05 - INFO - Saved /workspace/synauta-ml-ds/projects/turing_ds_anomaly_detection_mvp/exploration/tmp/data/RO1_permeate_conductivity/df_ro1_january_2024_20240730_185709.parquet


# Step 3: train and test the model for train 1

### merging all the trains


In [8]:
# define the train parameters

list_of_train_paramenters = list(train_param_map.keys())

combined_df_with_all_trains_info = combine_ro_dataframes_into_single_dataframe(
    df_dict=data, list_of_train_parameters=list_of_train_paramenters
)

### select the data that is stable/normal

In [9]:
data_to_train = slice_dataframe_by_date(
    df=combined_df_with_all_trains_info,
    start_date=start_timestamp_for_training,
    end_date=end_timestamp_for_training,
)

In [10]:
base_path = f"{path_to_save_the_outcomes}/tmp/normal_data_to_train_the_model/{variable_of_interest}/{timestamp_for_this_experiment}/training_data/"
specific_suffix = f"{variable_of_interest}_jan_2024"

save_dataframes_to_parquet(
    data_frame=data_to_train,
    base_path=base_path,
    specific_suffix=specific_suffix,
)

2024-07-30 18:58:05 - INFO - Saved DataFrame to /workspace/synauta-ml-ds/projects/turing_ds_anomaly_detection_mvp/exploration/tmp/normal_data_to_train_the_model/RO1_permeate_conductivity/20240730_185709/training_data/df_RO1_permeate_conductivity_jan_2024.parquet


#### visualize the standard/normal snippet of data

In [11]:
plots_05 = overlay_plots_with_holoviews(
    df=data_to_train,
    save_plot=True,
    save_path=f"{path_to_save_the_outcomes}/tmp/normal_data_to_train_the_model/{variable_of_interest}/{timestamp_for_this_experiment}/overlayed_plots_original_data/",
)

normalized_data_for_plots_2, min_max_values_for_plots_2 = normalize_data(data_to_train)

plots_06 = overlay_plots_with_holoviews(
    df=normalized_data_for_plots_2,
    save_plot=True,
    save_path=f"{path_to_save_the_outcomes}/tmp/normal_data_to_train_the_model/{variable_of_interest}/{timestamp_for_this_experiment}/overlayed_plots_norm_data/",
)

## hyperparameters optimization

In [12]:
# Define Optuna study
study = optuna.create_study(direction="minimize")

# Optimize the objective function
study.optimize(lambda trial: objective(trial, df_to_train=data_to_train,
                                       num_of_layers= num_of_layers,
                                       window_sizes=window_sizes, num_of_epoches=num_of_epoches,
                                       lr=lr
                                       ), n_trials=200)

[I 2024-07-30 18:58:10,287] A new study created in memory with name: no-name-5f427efb-bbbb-4ba6-a063-c5ff1e1a09ea
2024-07-30 18:58:10 - INFO - Starting off new loop to (re)train the model:
2024-07-30 18:58:10 - INFO - Create samples with the specified window size:
2024-07-30 18:58:10 - INFO - Build the autoencoder model:
2024-07-30 18:58:11 - INFO - Train the model:
2024-07-30 18:58:24 - INFO - valuate on validation set:
2024-07-30 18:58:24 - INFO - Calculate reconstruction error:
2024-07-30 18:58:24 - INFO - Calculate the mean reconstruction error:
[I 2024-07-30 18:58:24,629] Trial 0 finished with value: 10.06355094909668 and parameters: {'window_sizes': 95, 'num_of_epochs': 553, 'lr': 0.0010746089481287128, 'num_of_layers': 1}. Best is trial 0 with value: 10.06355094909668.
2024-07-30 18:58:24 - INFO - Starting off new loop to (re)train the model:
2024-07-30 18:58:24 - INFO - Create samples with the specified window size:
2024-07-30 18:58:24 - INFO - Build the autoencoder model:
2024

### checking the outcomes

In [13]:
# evauation
pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Study statistics: 
  Number of finished trials:  200
  Number of pruned trials:  33
  Number of complete trials:  167
Best trial:
  Value:  0.568994402885437
  Params: 
    window_sizes: 3
    num_of_epochs: 861
    lr: 0.03777256269175889
    num_of_layers: 1


In [14]:
best_window_size = trial.params['window_sizes']
best_num_of_epochs = trial.params['num_of_epochs']
best_learning_rate = trial.params['lr']
best_num_of_layers = trial.params["num_of_layers"]

### train the algorithm with the best window

In [23]:
(
    trained_model,
    reconstructed_data_training,
    avg_reconstruction_error_training,
    std_reconstruction_error_training,
    upper_threshold_training,
) = train_the_model_with_optimized_hyperparameters(
    df=data_to_train,
    num_of_layers=best_num_of_layers,
    window_size=best_window_size,
    variable_of_interest=variable_of_interest,
    num_epochs=best_num_of_epochs,
    learning_rate=best_learning_rate,
    tolerance_for_anomaly=tolerance_for_anomaly,
)

2024-07-30 19:23:05 - INFO - Inicializing the training process with the best selected windows size: 3 
2024-07-30 19:23:05 - INFO - building the autoencoder models
2024-07-30 19:23:05 - INFO - training the model: 
2024-07-30 19:23:05 - INFO - Epoch 1, Loss: 4457.53857421875
2024-07-30 19:23:06 - INFO - Epoch 11, Loss: 687.0089111328125
2024-07-30 19:23:06 - INFO - Epoch 21, Loss: 231.0187530517578
2024-07-30 19:23:06 - INFO - Epoch 31, Loss: 74.55015563964844
2024-07-30 19:23:06 - INFO - Epoch 41, Loss: 45.93355941772461
2024-07-30 19:23:06 - INFO - Epoch 51, Loss: 32.90602493286133
2024-07-30 19:23:06 - INFO - Epoch 61, Loss: 28.202293395996094
2024-07-30 19:23:06 - INFO - Epoch 71, Loss: 26.188955307006836
2024-07-30 19:23:06 - INFO - Epoch 81, Loss: 24.795394897460938
2024-07-30 19:23:06 - INFO - Epoch 91, Loss: 23.860828399658203
2024-07-30 19:23:06 - INFO - Epoch 101, Loss: 22.850914001464844
2024-07-30 19:23:06 - INFO - Epoch 111, Loss: 21.722814559936523
2024-07-30 19:23:06 - IN

### saving the model

In [24]:
base_path = f"{path_to_save_the_outcomes}/tmp/trained_model/{variable_of_interest}/{timestamp_for_this_experiment}"

additional_info = {
    "model_type": "Autoencoder",
    "target_sensor": f"{variable_of_interest}",
    "data_used_to_train_the_model": f"from {start_timestamp_for_training} to {end_timestamp_for_training}",
    "path_where_it_is_saved": f"tmp/normal_data_to_train_the_model/{variable_of_interest}/{timestamp_for_this_experiment}/training_data/",
    "average_reconstruction_error_for_this_model_in_training": trial.value,
    "best_window_size": trial.params['window_sizes'],
    "best_number_of_epoches": trial.params['num_of_epochs'],
    "best_learinng_rate": trial.params['lr'],
    "threshold_for_the_reconstruction_error": upper_threshold_training,
}
save_model(trained_model, base_path, additional_info)

### test the algorithm in real data

In [26]:
(
    reconstructed_df_test,
    anomalies_df_test,
    num_detected_anomalies_test,
) = detect_anomalies_using_trained_model(
    combined_df_with_all_trains_info,
    trained_model,
    best_window_size,
    variable_of_interest,
    upper_threshold_training,
)

2024-07-30 19:23:11 - INFO - inicializing the anomaly detection process with the trained model
2024-07-30 19:23:25 - INFO - calculating the reconstruction error
2024-07-30 19:23:25 - INFO - total points of data to be analyzed for the variable of interest: 892800
2024-07-30 19:23:25 - INFO - total of anomalyes found: 1779
2024-07-30 19:23:25 - INFO - percentage of anomalies in this sensor: 0.19926075268817206 % 


### total of anomalies

In [27]:
anomalies_df_test.head()

Unnamed: 0,RO1_permeate_flow,RO1_silica_saturation_index,RO1_calcium_fluoride_saturation_index,RO1_feed_flow,RO1_permeate_conductivity,feed_ph_1,feed_silica,feed_calcium,feed_temperature,feed_orp_1,RO1_permeate_conductivity_error
2024-01-02 09:39:27+00:00,2.530549,-2.026058,-13.376057,-2.041688,81.164505,6.354158,7.843384,0.275684,29.51561,162.629257,1490.583996
2024-01-02 09:39:30+00:00,2.599676,-1.728413,-13.534751,-1.808773,81.113274,6.474095,7.808065,0.160919,29.275845,162.61496,1074.809253
2024-01-02 09:39:33+00:00,2.934308,-1.83459,-13.342348,-1.618228,80.545891,6.754933,8.367941,-0.089654,29.186785,162.852966,5663.406715
2024-01-02 17:03:09+00:00,20.678276,-2.507674,-8.263836,24.669945,146.22876,5.950719,6.598751,0.737161,34.503563,185.09404,725.560258
2024-01-02 17:03:15+00:00,21.859472,-1.462742,-8.307275,24.41905,142.714447,8.184243,9.679292,-1.517392,32.317673,186.395874,2187.209624


# Step 4: evaluate the outcomes - plotting the results

### visualize the reconstruction errors and the real plot with the anomalies

In [28]:
# Assuming you have the DataFrame 'reconstructed_data_training' and the error statistics calculated
plot = reconstructed_error_analysis_plot_with_holoviews(
    error_df=reconstructed_df_test,
    variable_of_interest=variable_of_interest,
    avg_error=avg_reconstruction_error_training,
    upper_threshold=upper_threshold_training,
    save_plot=True,
    name_of_the_plot="ro1_real_data",
    save_path=f"{path_to_save_the_outcomes}/tmp/outcomes_from_the_model/{variable_of_interest}/{timestamp_for_this_experiment}/reconstruction_error_analysis/",
)

### combine all data and the sppoted anomalies - plots for final analyses

In [29]:
combine_and_slice_plost_to_support_final_analyses(
    combined_df_with_all_trains_info=combined_df_with_all_trains_info,
    anomalies_df_test=anomalies_df_test,
    variable_of_interest=variable_of_interest,
    path_to_save_the_outcomes=path_to_save_the_outcomes,
    timestamp_for_this_experiment=timestamp_for_this_experiment,
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
    number_of_chunks=number_of_chunks,
)