In [1]:
import numpy as np
import pandas as pd
from typing import Optional
from datetime import datetime, timezone
import logging

import argparse
import os
import warnings
from typing import Optional, Literal, NewType
import json


# Get the logger for this module
logger = logging.getLogger(__name__)


In [2]:
from anomaly_detection_training_module_v1 import timestamp_for_this_experiment # get global variable from __init__.py


2024-08-10 20:28:34 - INFO - Logging is set up correctly.


In [3]:
def get_config_path():
    """Return the path to the configuration file.

    The path is looked up in two places, in this order:

    1. the ``PATH_TO_THE_CONFIGURATION_FILE`` environment variable
       (an empty value is treated as "not set");
    2. the ``--config`` command-line argument.

    Returns:
        str: The configuration file path exactly as it was supplied.

    Raises:
        ValueError: If neither source provides a non-empty path. The same
            message is logged at ERROR level before the exception is raised.
    """
    env_path = os.getenv('PATH_TO_THE_CONFIGURATION_FILE')
    if env_path:
        return env_path

    # Fall back to the command line.
    parser = argparse.ArgumentParser(description='Provide the path to the configuration file.')
    parser.add_argument('--config', type=str, help='Path to the configuration file')
    # parse_known_args() instead of parse_args(): the hosting process may pass
    # arguments of its own (a Jupyter kernel is started with
    # "-f <connection file>"), and parse_args() would terminate the process
    # with SystemExit on those instead of reaching the ValueError below.
    args, _unrecognised = parser.parse_known_args()

    if args.config:
        return args.config

    # Build the message once so the log entry and the exception cannot drift
    # apart; note the explicit spaces between the concatenated fragments.
    message = (
        "Configuration file path must be provided "
        "either as an environment variable 'PATH_TO_THE_CONFIGURATION_FILE' "
        "or as a command-line argument '--config'."
    )
    logging.error(message)
    raise ValueError(message)

In [4]:
# Resolve the location of the .json configuration file. get_config_path()
# checks the PATH_TO_THE_CONFIGURATION_FILE environment variable first and
# falls back to the --config command-line argument.

path_for_the_json_file = get_config_path()

In [5]:
# Echo the resolved path so the cell output records which configuration file is in use.
path_for_the_json_file

'/home/aldo/Repositories/general_projects/anomaly_detection_training_module_v1/notebooks/parameters_for_toy_data_experiments.json'

In [6]:
def load_and_process_params(file_path: str) -> tuple:
    """Load the toy-data parameters from a JSON configuration file.

    The file must contain a top-level ``"parameters_to_create_toy_data"``
    object with the keys ``"start_date_for_the_toy_dataset"`` (an ISO-8601
    string; a trailing ``"Z"`` is accepted and read as ``+00:00``),
    ``"seed_for_the_stable_dataset"`` and
    ``"number_of_rows_for_stable_toy_data"``.

    Each parameter is logged at INFO level as a label line followed by a
    value line.

    Args:
        file_path: Path to the JSON configuration file.

    Returns:
        A 3-tuple, in this order:
        ``(start_date_for_the_toy_dataset, number_of_rows_for_stable_toy_data,
        seed_for_the_stable_dataset)``. The start date is a
        ``datetime``; the other two values are returned as stored in the JSON.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a required key is missing.
        ValueError: If the start date is not a valid ISO-8601 string.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, "r", encoding="utf-8") as file:
        params = json.load(file)

    toy_data_params = params["parameters_to_create_toy_data"]

    # datetime.fromisoformat() rejects the "Z" suffix on older Python
    # versions, so normalise it to an explicit UTC offset first.
    start_date_for_the_toy_dataset = datetime.fromisoformat(
        toy_data_params["start_date_for_the_toy_dataset"].replace("Z", "+00:00")
    )
    seed_for_the_stable_dataset = toy_data_params["seed_for_the_stable_dataset"]
    number_of_rows_for_stable_toy_data = toy_data_params["number_of_rows_for_stable_toy_data"]

    # Log every parameter under its own name (label line, then value line).
    for label, value in (
        ("start_date_for_the_toy_dataset", start_date_for_the_toy_dataset),
        ("seed_for_the_stable_dataset", seed_for_the_stable_dataset),
        ("number_of_rows_for_stable_toy_data", number_of_rows_for_stable_toy_data),
    ):
        logging.info("%s:", label)
        logging.info(value)

    return (
        start_date_for_the_toy_dataset,
        number_of_rows_for_stable_toy_data,
        seed_for_the_stable_dataset
    )


In [7]:
# Unpack in the order returned by load_and_process_params():
# (start date, number of rows, seed).
(
    start_date_for_the_toy_dataset,
    number_of_rows_for_stable_toy_data,
    seed_for_the_stable_dataset,
) = load_and_process_params(path_for_the_json_file)

# The row count used to be bound to a misspelled name; keep that name as an
# alias so any code still referring to it keeps working.
numbnumber_of_rows_for_stable_toy_data = number_of_rows_for_stable_toy_data

2024-08-10 20:28:37 - INFO - start_date_for_the_normal_dataset:
2024-08-10 20:28:37 - INFO - 2024-08-10 15:00:00
2024-08-10 20:28:37 - INFO - seed_for_the_stable_dataset:
2024-08-10 20:28:37 - INFO - 300
2024-08-10 20:28:37 - INFO - number_of_rows_for_stable_toy_data:
2024-08-10 20:28:37 - INFO - 10000


In [None]:
import numpy as np
import pandas as pd

def generate_stable_toy_data(number_of_rows: int, start_date: "str | datetime", seed_for_random: Optional[int] = None) -> pd.DataFrame:
    """Generate an hourly toy dataset of synthetic sensor readings.

    All noise terms are drawn from normal distributions:

    * ``Temperature_C``   ~ N(75, 1)
    * ``Pressure_MPa``    = 3 - 0.01 * (temperature - 75) + N(0, 0.05)
    * ``Vibration_mm_s``  ~ N(0.3, 0.05)
    * ``Flow_Rate_l_min`` = 300 + 10 * (3 - pressure) + N(0, 5)
    * ``Humidity_%``      ~ N(40, 5), independent of the other columns

    Args:
        number_of_rows: Number of rows (hourly timestamps) to generate.
        start_date: First timestamp; passed straight to ``pd.date_range``,
            so a date string or a ``datetime`` both work.
        seed_for_random: Seed for reproducible output. With a seed, the values
            are identical to what ``np.random.seed(seed)`` followed by the
            same draws would produce, but the global NumPy RNG is left
            untouched. With ``None``, the draws come from the global
            ``np.random`` state.

    Returns:
        A DataFrame with ``number_of_rows`` rows and the columns
        ``Timestamp``, ``Temperature_C``, ``Pressure_MPa``, ``Vibration_mm_s``,
        ``Flow_Rate_l_min`` and ``Humidity_%``.
    """
    # Use a private legacy RandomState rather than reseeding the process-wide
    # generator: same number stream for a given seed, no side effect on other
    # code that relies on np.random.
    rng = np.random if seed_for_random is None else np.random.RandomState(seed_for_random)

    # One timestamp per hour. A Timedelta is used instead of the 'H' alias,
    # which newer pandas versions deprecate.
    date_range = pd.date_range(start=start_date, periods=number_of_rows, freq=pd.Timedelta(hours=1))

    # NOTE: the order of the draws below determines the generated values for
    # a given seed; do not reorder.

    # Temperature: normally distributed around 75°C with small fluctuations
    temperature = rng.normal(loc=75, scale=1, size=number_of_rows)

    # Pressure: correlated with temperature, slightly decreasing with higher temperatures
    pressure = 3 - 0.01 * (temperature - 75) + rng.normal(loc=0, scale=0.05, size=number_of_rows)

    # Vibration: low vibration under normal conditions, with slight random noise
    vibration = rng.normal(loc=0.3, scale=0.05, size=number_of_rows)

    # Flow rate: generally stable, slightly increasing with lower pressure (inverse correlation)
    flow_rate = 300 + 10 * (3 - pressure) + rng.normal(loc=0, scale=5, size=number_of_rows)

    # Humidity: independent of the other variables, normal fluctuations
    humidity = rng.normal(loc=40, scale=5, size=number_of_rows)

    df = pd.DataFrame({
        'Timestamp': date_range,
        'Temperature_C': temperature,
        'Pressure_MPa': pressure,
        'Vibration_mm_s': vibration,
        'Flow_Rate_l_min': flow_rate,
        'Humidity_%': humidity
    })

    return df

# Demo: build a small, reproducible sample and preview its first five rows
df_stable = generate_stable_toy_data(
    number_of_rows=1000,
    start_date='2024-01-01',
    seed_for_random=42,
)
print(df_stable.head(n=5))
