In [1]:
import numpy as np
import pandas as pd
from typing import Optional
from datetime import datetime, timezone
import logging

import argparse
import os
import warnings
from typing import Optional, Literal, NewType
import json


# Module-level logger, named after this module via __name__
logger = logging.getLogger(__name__)


In [2]:
from anomaly_detection_training_module_v1 import timestamp_for_this_experiment # get global variable from __init__.py


2024-08-10 20:28:34 - INFO - Logging is set up correctly.


In [3]:
def get_config_path():
    """Return the path to the configuration file.

    The ``PATH_TO_THE_CONFIGURATION_FILE`` environment variable takes
    precedence. If it is unset or empty, the ``--config`` command-line
    argument is used instead.

    Returns:
        str: The configuration file path, exactly as it was supplied.

    Raises:
        ValueError: If neither the environment variable nor ``--config``
            provides a path.
    """
    # The environment variable wins over the command line.
    env_path = os.getenv('PATH_TO_THE_CONFIGURATION_FILE')
    if env_path:
        return env_path

    # Fall back to the command-line arguments.
    parser = argparse.ArgumentParser(description='Provide the path to the configuration file.')
    parser.add_argument('--config', type=str, help='Path to the configuration file')
    # parse_known_args() ignores arguments this parser does not declare, so
    # extra entries in sys.argv (e.g. the ones a notebook kernel is usually
    # launched with) no longer abort the process with SystemExit.
    args, _unrecognized = parser.parse_known_args()

    if args.config:
        return args.config

    # Build the message once; each fragment ends with a space so the adjacent
    # string literals do not run together when concatenated.
    message = (
        "Configuration file path must be provided "
        "either as an environment variable 'PATH_TO_THE_CONFIGURATION_FILE' "
        "or as a command-line argument '--config'."
    )
    logging.error(message)
    raise ValueError(message)

In [4]:
# Resolve the path to the JSON configuration file: get_config_path() checks the
# PATH_TO_THE_CONFIGURATION_FILE environment variable first and falls back to
# the --config command-line argument.

path_for_the_json_file = get_config_path()

In [5]:
# Display the resolved configuration path as the cell output
path_for_the_json_file

'/home/aldo/Repositories/general_projects/anomaly_detection_training_module_v1/notebooks/parameters_for_toy_data_experiments.json'

In [6]:
def load_and_process_params(file_path: str) -> tuple:
    """Load the toy-data parameters from a JSON configuration file.

    The file must contain a top-level ``"parameters_to_create_toy_data"``
    object with the keys ``"start_date_for_the_toy_dataset"`` (an ISO-8601
    string), ``"seed_for_the_stable_dataset"`` and
    ``"number_of_rows_for_stable_toy_data"``. Each value is logged at INFO
    level as it is read.

    Args:
        file_path: Path to the JSON configuration file.

    Returns:
        tuple: ``(start_date_for_the_toy_dataset,
        number_of_rows_for_stable_toy_data, seed_for_the_stable_dataset)``.
        The start date is a ``datetime``; the other two values are returned
        exactly as stored in the JSON file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a required key is missing.
        ValueError: If the start date is not a valid ISO-8601 string.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(file_path, "r", encoding="utf-8") as file:
        params = json.load(file)

    # All values live under the same nested section; look it up once.
    toy_data_params = params["parameters_to_create_toy_data"]

    # A trailing "Z" is rewritten as an explicit UTC offset because
    # datetime.fromisoformat only accepts "Z" on newer Python versions.
    start_date_for_the_toy_dataset = datetime.fromisoformat(
        toy_data_params["start_date_for_the_toy_dataset"].replace("Z", "+00:00")
    )
    # The label matches the parameter name (it previously said "normal_dataset").
    logging.info("start_date_for_the_toy_dataset:")
    logging.info(start_date_for_the_toy_dataset)

    seed_for_the_stable_dataset = toy_data_params["seed_for_the_stable_dataset"]
    logging.info("seed_for_the_stable_dataset:")
    logging.info(seed_for_the_stable_dataset)

    number_of_rows_for_stable_toy_data = toy_data_params["number_of_rows_for_stable_toy_data"]
    logging.info("number_of_rows_for_stable_toy_data:")
    logging.info(number_of_rows_for_stable_toy_data)

    return (
        start_date_for_the_toy_dataset,
        number_of_rows_for_stable_toy_data,
        seed_for_the_stable_dataset
    )


In [9]:
# Unpack the loaded parameters; the order of the names must match the tuple
# returned by load_and_process_params (start date, row count, seed).
(
    start_date_for_the_toy_dataset,
    number_of_rows_for_stable_toy_data,
    seed_for_the_stable_dataset
    ) = load_and_process_params(path_for_the_json_file)

2024-08-10 21:15:52 - INFO - start_date_for_the_normal_dataset:
2024-08-10 21:15:52 - INFO - 2024-08-10 15:00:00
2024-08-10 21:15:52 - INFO - seed_for_the_stable_dataset:
2024-08-10 21:15:52 - INFO - 300
2024-08-10 21:15:52 - INFO - number_of_rows_for_stable_toy_data:
2024-08-10 21:15:52 - INFO - 10000


In [19]:
import numpy as np
import pandas as pd

def generate_stable_toy_data(number_of_rows: int, start_date: "str | datetime | pd.Timestamp", seed_for_random: Optional[int] = None) -> pd.DataFrame:
    """Generate a synthetic sensor dataset sampled every 5 minutes.

    Args:
        number_of_rows: Number of 5-minute samples to generate.
        start_date: Timestamp of the first sample; anything ``pd.Timestamp``
            accepts. A value without timezone information is interpreted as
            UTC; a timezone-aware value is converted to UTC.
        seed_for_random: Seed for reproducible output. When given, a private
            ``RandomState`` is used, so the process-wide NumPy random state
            is left untouched. When ``None``, values are drawn from the
            global NumPy random state.

    Returns:
        pd.DataFrame: Indexed by a UTC ``Timestamp`` index, with the columns
        ``Temperature_C``, ``Pressure_MPa``, ``Vibration_mm_s``,
        ``Flow_Rate_l_min`` and ``Humidity_%``.
    """
    # RandomState(seed) yields the same stream as np.random.seed(seed)
    # followed by the np.random.* functions, but without reseeding the global
    # generator as a hidden side effect.
    if seed_for_random is None:
        draw_normal = np.random.normal
    else:
        draw_normal = np.random.RandomState(seed_for_random).normal

    # Normalise the start to UTC first: passing a start with a non-UTC offset
    # together with tz='UTC' makes pd.date_range raise.
    start_timestamp = pd.Timestamp(start_date)
    if start_timestamp.tzinfo is None:
        start_timestamp = start_timestamp.tz_localize('UTC')
    else:
        start_timestamp = start_timestamp.tz_convert('UTC')
    date_range = pd.date_range(start=start_timestamp, periods=number_of_rows, freq='5min', tz='UTC')

    # NOTE: the order of the random draws below determines the generated
    # values for a given seed; do not reorder them.

    # Temperature: normally distributed around 75 with unit standard deviation
    temperature = draw_normal(loc=75, scale=1, size=number_of_rows)

    # Pressure: decreases slightly as temperature rises, plus noise
    pressure = 3 - 0.01 * (temperature - 75) + draw_normal(loc=0, scale=0.05, size=number_of_rows)

    # Flow rate: increases slightly as pressure drops, plus noise
    flow_rate = 300 + 10 * (3 - pressure) + draw_normal(loc=0, scale=5, size=number_of_rows)

    # Vibration: non-linear (square-root) function of flow rate and pressure, plus noise
    vibration = 0.1 * np.sqrt(flow_rate * pressure) + draw_normal(loc=0, scale=0.05, size=number_of_rows)

    # Humidity: drawn independently of the other variables
    humidity = draw_normal(loc=40, scale=5, size=number_of_rows)

    # Assemble the frame and use the timestamps as its index
    return pd.DataFrame({
        'Timestamp': date_range,
        'Temperature_C': temperature,
        'Pressure_MPa': pressure,
        'Vibration_mm_s': vibration,
        'Flow_Rate_l_min': flow_rate,
        'Humidity_%': humidity
    }).set_index('Timestamp')


In [20]:
# Example usage: build the stable toy dataset from the configured row count and start date.
# NOTE(review): the seed is hard-coded to 42 here, so seed_for_the_stable_dataset
# loaded from the configuration file is not used — confirm whether that is intentional.
df_stable = generate_stable_toy_data(number_of_rows=number_of_rows_for_stable_toy_data, start_date=start_date_for_the_toy_dataset, seed_for_random=42)

df_stable.head()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-08-10 15:00:00+00:00,75.496714,2.961108,2.892026,302.13035,37.640712
2024-08-10 15:05:00+00:00,74.861736,2.986108,2.948046,301.555541,45.063512
2024-08-10 15:10:00+00:00,75.647689,2.963654,2.930878,295.68086,39.009066
2024-08-10 15:15:00+00:00,76.52303,2.990291,3.017539,302.995015,40.452846
2024-08-10 15:20:00+00:00,74.765847,3.0622,3.041092,291.927582,43.586953


In [21]:
# Show the last rows to check where the generated time range ends
df_stable.tail()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-14 07:55:00+00:00,76.301102,3.00638,3.044368,297.049757,40.283995
2024-09-14 08:00:00+00:00,73.001655,3.088852,3.089324,306.334502,39.875386
2024-09-14 08:05:00+00:00,74.294683,3.025971,3.029858,297.233455,42.500424
2024-09-14 08:10:00+00:00,75.495766,3.080719,3.028339,289.731028,41.326077
2024-09-14 08:15:00+00:00,75.644388,2.91256,2.972425,299.729978,47.579055


### Create the two types of anomaly used for evaluation

Visualization of this data

Problem 1: Bearing Wear
Description: Over time, the bearings in the pump might wear out, causing an increase in vibration levels.


Problem 5: Broken Temperature Sensor
Description: The temperature sensor might malfunction or break, leading to inaccurate or stuck readings.