In [1]:
import numpy as np
import pandas as pd
from typing import Optional
from datetime import datetime, timezone
import logging

import argparse
import os
import warnings
from typing import Optional, Literal, NewType
import json


# Get the logger for this module
logger = logging.getLogger(__name__)


In [2]:
from anomaly_detection_training_module_v1 import timestamp_for_this_experiment # get global variable from __init__.py


2024-08-24 09:57:10 - INFO - Logging is set up correctly.


# Accessing and reading the config file

In [6]:
def get_config_path() -> str:
    """
    Resolve the path to the JSON configuration file.

    The environment variable 'PATH_TO_THE_CONFIGURATION_FILE' takes precedence;
    if it is unset or empty, the '--config' command-line argument is used.

    Returns:
    str: The configuration file path.

    Raises:
    ValueError: If neither the environment variable nor '--config' provides a path.
    """
    # The environment variable wins over the command line
    env_path = os.getenv('PATH_TO_THE_CONFIGURATION_FILE')
    if env_path:
        return env_path

    # If not, parse the command-line arguments
    parser = argparse.ArgumentParser(description='Provide the path to the configuration file.')
    parser.add_argument('--config', type=str, help='Path to the configuration file')
    # parse_known_args ignores arguments that are not ours; inside a Jupyter
    # kernel sys.argv carries e.g. "-f <connection-file>", which would make
    # parse_args() abort the process with SystemExit.
    args, _unrecognized = parser.parse_known_args()

    if args.config:
        return args.config

    # Single message shared by the log record and the exception; note the
    # trailing spaces so the adjacent literals do not run together.
    message = ("Configuration file path must be provided "
               "either as an environment variable 'PATH_TO_THE_CONFIGURATION_FILE' "
               "or as a command-line argument '--config'.")
    logging.error(message)
    raise ValueError(message)

In [17]:
def load_and_process_params(file_path: str) -> tuple:
    """
    Load the toy-data parameters from a JSON file.

    The file must contain a 'parameters_to_create_toy_data' object with the keys
    'start_date_for_the_toy_dataset', 'seed_for_the_stable_dataset' and
    'number_of_rows_for_stable_toy_data'. Each value is logged at INFO level.

    Parameters:
    file_path (str): Path to the JSON parameter file.

    Returns:
    tuple: (start_date_for_the_toy_dataset, number_of_rows_for_stable_toy_data,
            seed_for_the_stable_dataset), where the start date is a datetime.

    Raises:
    ValueError: If the start date is not a valid ISO 8601 timestamp.
    KeyError: If one of the expected keys is missing.
    """
    # Load parameters from JSON file (explicit encoding: do not depend on the locale)
    with open(file_path, "r", encoding="utf-8") as file:
        params = json.load(file)

    # Parse the timestamps with error handling
    def parse_timestamp(timestamp_str: str) -> datetime:
        normalized = timestamp_str
        # A trailing '-00' offset is rewritten to '+00:00' so fromisoformat accepts it.
        # Only the suffix is touched; str.replace would rewrite every '-00' in the string.
        if normalized.endswith('-00'):
            normalized = normalized[:-len('-00')] + '+00:00'
        try:
            return datetime.fromisoformat(normalized)
        except ValueError as e:
            logging.error("Error parsing timestamp: %s. %s. "
                          "Datetime should be in ISO 8601 format.", timestamp_str, e)
            raise

    # All values live under the 'parameters_to_create_toy_data' key
    toy_data_params = params["parameters_to_create_toy_data"]

    start_date_for_the_toy_dataset = parse_timestamp(toy_data_params["start_date_for_the_toy_dataset"])
    logging.info(f'{start_date_for_the_toy_dataset = }')

    seed_for_the_stable_dataset = toy_data_params["seed_for_the_stable_dataset"]
    logging.info(f'{seed_for_the_stable_dataset = }')

    number_of_rows_for_stable_toy_data = toy_data_params["number_of_rows_for_stable_toy_data"]
    logging.info(f'{number_of_rows_for_stable_toy_data = }')

    return (
        start_date_for_the_toy_dataset,
        number_of_rows_for_stable_toy_data,
        seed_for_the_stable_dataset
    )


In [18]:
# Resolve the path to the JSON parameter file (environment variable first,
# then the '--config' command-line argument) and display it.

path_for_the_json_file = get_config_path()
path_for_the_json_file


'/home/aldo/Repositories/general_projects/database_generator/parameters_for_toy_data_experiments.json'

In [19]:
# Unpack in the order load_and_process_params returns its values:
# (start date, number of rows, seed).
(
    start_date_for_the_toy_dataset,
    number_of_rows_for_stable_toy_data,
    seed_for_the_stable_dataset
    ) = load_and_process_params(path_for_the_json_file)

2024-08-24 10:38:46 - INFO - start_date_for_the_toy_dataset = datetime.datetime(2024, 8, 23, 10, 0, tzinfo=datetime.timezone.utc)
2024-08-24 10:38:46 - INFO - seed_for_the_stable_dataset = 300
2024-08-24 10:38:46 - INFO - number_of_rows_for_stable_toy_data = 10000


In [20]:
import numpy as np
import pandas as pd

def generate_stable_toy_data(number_of_rows: int, start_date: str, seed_for_random: Optional[int] = None) -> pd.DataFrame:
    """
    Generate a synthetic, anomaly-free sensor dataset sampled every 5 minutes.

    Parameters:
    number_of_rows (int): Number of rows (timestamps) to generate.
    start_date: First timestamp of the series. Anything pd.Timestamp accepts
        (ISO string, datetime, Timestamp). Timezone-naive values are treated as
        UTC; timezone-aware values are converted to UTC.
    seed_for_random (int, optional): Seed for reproducible data. When given, the
        values match what np.random.seed(seed) followed by the same draws would
        produce, but the global NumPy random state is left untouched. When None,
        the global NumPy random state is used.

    Returns:
    pd.DataFrame: Frame indexed by a UTC 'Timestamp' index with the columns
        'Temperature_C', 'Pressure_MPa', 'Vibration_mm_s', 'Flow_Rate_l_min'
        and 'Humidity_%'.
    """
    # A private RandomState reproduces the legacy np.random.seed() stream
    # without reseeding the process-wide generator as a side effect.
    rng = np.random.RandomState(seed_for_random) if seed_for_random is not None else np.random

    # Normalize the start to UTC first: date_range rejects a tz-aware start whose
    # timezone differs from the requested tz.
    start_timestamp = pd.Timestamp(start_date)
    if start_timestamp.tzinfo is None:
        start_timestamp = start_timestamp.tz_localize('UTC')
    else:
        start_timestamp = start_timestamp.tz_convert('UTC')

    # Generate a date range
    date_range = pd.date_range(start=start_timestamp, periods=number_of_rows, freq='5min', tz='UTC')

    # NOTE: the order of the draws below determines the generated values for a
    # given seed; do not reorder them.

    # Temperature: Normally distributed around 75°C with small fluctuations
    temperature = rng.normal(loc=75, scale=1, size=number_of_rows)

    # Pressure: Correlated with temperature, slightly decreasing with higher temperatures
    pressure = 3 - 0.01 * (temperature - 75) + rng.normal(loc=0, scale=0.05, size=number_of_rows)

    # Flow Rate: Generally stable, slightly increasing with lower pressure (inverse correlation)
    flow_rate = 300 + 10 * (3 - pressure) + rng.normal(loc=0, scale=5, size=number_of_rows)

    # Vibration: Non-linear increase with flow_rate and pressure
    vibration = 0.1 * np.sqrt(flow_rate * pressure) + rng.normal(loc=0, scale=0.05, size=number_of_rows)

    # Humidity: Independent of the other variables, normal fluctuations
    humidity = rng.normal(loc=40, scale=5, size=number_of_rows)

    # Create a DataFrame
    df = pd.DataFrame({
        'Timestamp': date_range,
        'Temperature_C': temperature,
        'Pressure_MPa': pressure,
        'Vibration_mm_s': vibration,
        'Flow_Rate_l_min': flow_rate,
        'Humidity_%': humidity
    })

    # Set Timestamp as the index (returns a new frame; no in-place mutation)
    return df.set_index('Timestamp')


In [21]:
# Example usage: build the stable dataset from the configured row count and start date.
# NOTE(review): the seed is hard-coded to 42 here although seed_for_the_stable_dataset
# was loaded from the configuration file — confirm which one is intended.
df_stable = generate_stable_toy_data(number_of_rows=number_of_rows_for_stable_toy_data, start_date=start_date_for_the_toy_dataset, seed_for_random=42)

df_stable.head()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-08-23 10:00:00+00:00,75.496714,2.961108,2.892026,302.13035,37.640712
2024-08-23 10:05:00+00:00,74.861736,2.986108,2.948046,301.555541,45.063512
2024-08-23 10:10:00+00:00,75.647689,2.963654,2.930878,295.68086,39.009066
2024-08-23 10:15:00+00:00,76.52303,2.990291,3.017539,302.995015,40.452846
2024-08-23 10:20:00+00:00,74.765847,3.0622,3.041092,291.927582,43.586953


In [23]:
# Show the last rows to check where the generated time range ends.
df_stable.tail()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-27 02:55:00+00:00,76.301102,3.00638,3.044368,297.049757,40.283995
2024-09-27 03:00:00+00:00,73.001655,3.088852,3.089324,306.334502,39.875386
2024-09-27 03:05:00+00:00,74.294683,3.025971,3.029858,297.233455,42.500424
2024-09-27 03:10:00+00:00,75.495766,3.080719,3.028339,289.731028,41.326077
2024-09-27 03:15:00+00:00,75.644388,2.91256,2.972425,299.729978,47.579055


### Create two types of anomalies to evaluate the detection

Visualization of this data

Problem 1: Bearing Wear
Description: Over time, the bearings in the pump might wear out, causing an increase in vibration levels.


Problem 5: Broken Temperature Sensor
Description: The temperature sensor might malfunction or break, leading to inaccurate or stuck readings.

In [79]:
# Summarize index range, column dtypes and null counts of the stable dataset.
# NOTE(review): the recorded output of this cell (execution count 79, index starting
# 2024-08-10) does not match the frame displayed earlier (starting 2024-08-23);
# it looks stale — re-run the notebook top to bottom.
df_stable.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10000 entries, 2024-08-10 15:00:00+00:00 to 2024-09-14 08:15:00+00:00
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Temperature_C    10000 non-null  float64
 1   Pressure_MPa     10000 non-null  float64
 2   Vibration_mm_s   10000 non-null  float64
 3   Flow_Rate_l_min  10000 non-null  float64
 4   Humidity_%       10000 non-null  float64
dtypes: float64(5)
memory usage: 468.8 KB


In [24]:
import plotly.graph_objs as go

def overlaid_plots_with_plotly(df: pd.DataFrame, scatter_variable: Optional[str] = None, variable_of_interest: Optional[str] = None, save_plot: bool = True, save_path: str = 'tmp/', filename: str = 'overlayed_overview.html') -> go.Figure:
    """
    Creates an interactive Plotly plot overlaying all columns of a DataFrame, with one column as a scatter plot and an optional secondary y-axis.

    The DataFrame index is used as the x-axis for every trace.

    Parameters:
    df (pd.DataFrame): DataFrame containing time series data.
    scatter_variable (str, optional): Column name to be displayed as a scatter plot (red markers).
    variable_of_interest (str, optional): Column name to be displayed on the secondary y-axis (dashed blue line).
    save_plot (bool): Whether to save the plot as an HTML file. Defaults to True.
    save_path (str): Directory path where the plot will be saved. Defaults to 'tmp/'.
        An empty string saves into the current working directory.
    filename (str): The filename for the saved plot. Defaults to 'overlayed_overview.html'.

    Returns:
    go.Figure: A Plotly Figure object containing the interactive plot with all data series.

    Raises:
    KeyError: If scatter_variable or variable_of_interest is not a column of df.
    """
    # Initialize the Plotly figure
    fig = go.Figure()

    # Add scatter plot if a scatter variable is specified
    if scatter_variable:
        fig.add_trace(go.Scatter(
            x=df.index,
            y=df[scatter_variable],
            mode='markers',
            marker=dict(color='red', size=8),
            name=scatter_variable
        ))

    # Add line plots for the remaining variables; the scatter variable and the
    # variable_of_interest get their own dedicated traces
    dedicated_columns = (scatter_variable, variable_of_interest)
    for column in df.columns:
        if column in dedicated_columns:
            continue
        fig.add_trace(go.Scatter(
            x=df.index,
            y=df[column],
            mode='lines',
            name=column,
            yaxis='y1'
        ))

    # Add the variable_of_interest on the secondary y-axis if specified
    if variable_of_interest:
        fig.add_trace(go.Scatter(
            x=df.index,
            y=df[variable_of_interest],
            mode='lines',
            name=variable_of_interest,
            yaxis='y2',
            line=dict(color='blue', dash='dash')  # Customize the line for distinction
        ))

        # Update the layout to include a secondary y-axis
        fig.update_layout(
            yaxis2=dict(
                title=variable_of_interest,
                overlaying='y',
                side='right'
            )
        )

    # Update layout for better visualization
    fig.update_layout(
        title="Overlayed Overview",
        xaxis_title="Time",
        yaxis_title="Primary Axis",
        legend_title="Variables",
        width=1000,
        height=500,
        hovermode="x unified"
    )

    # Save the plot if required
    if save_plot:
        # exist_ok avoids the check-then-create race; an empty save_path means
        # "current directory", for which os.makedirs('') would raise.
        if save_path:
            os.makedirs(save_path, exist_ok=True)
        fig.write_html(os.path.join(save_path, filename))

    return fig

In [25]:
# Overview of the stable dataset: flow rate as markers, temperature on the secondary
# axis, remaining columns as lines; save_plot=False so no HTML file is written.
overlaid_plots_with_plotly(df=df_stable, scatter_variable='Flow_Rate_l_min', variable_of_interest='Temperature_C', save_plot=False)
