In [6]:
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, timezone
from dataclasses import dataclass, field
import logging


In [7]:
# Scratch check: build a timezone-aware datetime in UTC using the `timezone` name
# imported from the datetime module in the imports cell.
datetime(2023, 1, 2, 0, 0, tzinfo=timezone.utc)

datetime.datetime(2023, 1, 2, 0, 0, tzinfo=datetime.timezone.utc)

In [3]:
from dotenv import load_dotenv
import os

# The .env file is optional: when it is missing, whatever is already present in
# the process environment is used as-is.
dotenv_path = '/workspace/general_projects/database_generator/.env'
if not os.path.exists(dotenv_path):
    print(f"No .env file found at {dotenv_path}, relying on system environment variables.")
else:
    load_dotenv(dotenv_path)
    print(f"Loaded environment variables from {dotenv_path}")

# Read the log directory from the environment. No default is supplied, so
# path_to_logs is None when PATH_TO_SAVE_THE_LOGS is not set.
path_to_logs = os.getenv('PATH_TO_SAVE_THE_LOGS')
print(f"Logs will be saved to: {path_to_logs}")

Loaded environment variables from /workspace/general_projects/database_generator/.env
Logs will be saved to: /workspace/general_projects/database_generator/notebooks/tmp


In [4]:
# Display the value that was read from PATH_TO_SAVE_THE_LOGS.
path_to_logs

'/workspace/general_projects/database_generator/notebooks/tmp'

In [9]:
from dbtoolkit.logging_configuration import setup_logging_for_this_script
# Project helper; presumably installs the logging handlers/formatters for this
# notebook — TODO confirm against dbtoolkit.logging_configuration.
setup_logging_for_this_script()
# Module-level logger used by the validation helpers defined in later cells.
# Relies on `logging` having been imported by the imports cell.
logger = logging.getLogger(__name__)

In [2]:
import os
# Print the variable as seen by this kernel process; os.getenv returns None when
# the variable is not set.
print(os.getenv("PATH_TO_SAVE_THE_LOGS"))

None


In [3]:
# NOTE(review): this prints every environment variable of the kernel process, so
# any secrets or tokens held there end up in the saved notebook output. Consider
# printing only the keys of interest and clearing this output before sharing.
print(os.environ)

environ({'HOSTNAME': 'c38c0673b4b7', 'HOME': '/root', 'PYTHONUNBUFFERED': '1', 'GPG_KEY': 'A035C8C19219BA821ECEA86B64E628F8D684696D', 'PYTHON_SHA256': '07a4356e912900e61a15cb0949a06c4a05012e213ecd6b4e84d0f67aabbee372', 'PATH': '/workspace/general_projects/DBToolkit/.venv/bin:/vscode/vscode-server/bin/linux-x64/e8653663e8840adaf45af01eab5c627a5af81807/bin/remote-cli:/root/.pyenv/shims:/root/.pyenv/bin:/root/.local/bin:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'LANG': 'C.UTF-8', 'SHELL': '/bin/bash', 'PYTHON_VERSION': '3.11.10', 'PWD': '/vscode/vscode-server/bin/linux-x64/e8653663e8840adaf45af01eab5c627a5af81807', 'PYENV_ROOT': '/root/.pyenv', 'VSCODE_CWD': '/vscode/vscode-server/bin/linux-x64/e8653663e8840adaf45af01eab5c627a5af81807', 'VSCODE_NLS_CONFIG': '{"userLocale":"en","osLocale":"en","resolvedLanguage":"en","defaultMessagesFile":"/vscode/vscode-server/bin/linux-x64/e8653663e8840adaf45af01eab5c627a5af81807/out/nls.messages.json","locale":"en","a

In [None]:
def validate_and_convert_datetime(dt: datetime) -> datetime:
    """
    Return ``dt`` as a timezone-aware datetime expressed in UTC.

    Parameters:
    -----------
    dt : datetime
        A ``pd.Timestamp`` or ``datetime.datetime``. Any other value (for example
        a date string) is handed to ``pd.to_datetime(dt, utc=True)``.

    Returns:
    --------
    datetime
        A timezone-aware value in UTC. ``pd.Timestamp`` inputs stay
        ``pd.Timestamp`` and plain ``datetime`` inputs stay ``datetime``; other
        inputs come back as whatever ``pd.to_datetime`` produces. An input that
        is already in UTC is returned unchanged.

    Notes:
    ------
    A naive input is assumed to already hold UTC wall-clock time: the UTC
    timezone is attached to it without shifting the value.
    """
    # pd.Timestamp is a subclass of datetime, so it has to be tested first.
    if isinstance(dt, pd.Timestamp):
        if dt.tz is None:
            logger.warning(f"Datetime '{dt}' is not timezone-aware. Converting to UTC.")
            dt = dt.tz_localize("UTC")
        elif dt.tzname() != "UTC":
            # Aware but in another zone: shift the wall-clock value to UTC.
            dt = dt.tz_convert("UTC")
        else:
            # Correct format and timezone; no need to convert
            logger.info(f"Datetime '{dt}' is already in the correct format.")
            return dt
    elif isinstance(dt, datetime):
        # `datetime` is the class imported from the datetime module, so the UTC
        # singleton is reached through the imported `timezone` name, not through
        # `datetime.timezone` (which does not exist on the class).
        if dt.utcoffset() is None:
            logger.warning(f"Datetime '{dt}' is not timezone-aware. Converting to UTC.")
            dt = dt.replace(tzinfo=timezone.utc)
        elif dt.tzinfo != timezone.utc:
            logger.warning(f"Datetime '{dt}' is not in UTC. Converting to UTC.")
            dt = dt.astimezone(timezone.utc)
        else:
            # Correct format and timezone; no need to convert
            logger.info(f"Datetime '{dt}' is already in the correct format.")
            return dt
    else:
        logger.warning(
            f"Input '{dt}' is not a recognized datetime format. Converting to UTC."
        )
        dt = pd.to_datetime(dt, utc=True)

    return dt


def validate_datetime_order(start_datetime: datetime, end_datetime: datetime):
    """
    Ensure the time series ends strictly later than it starts.

    Parameters:
    -----------
    start_datetime : datetime
        Beginning of the time series.
    end_datetime : datetime
        End of the time series.

    Raises:
    -------
    ValueError
        When end_datetime is equal to or earlier than start_datetime. The same
        message is written to the error log before raising.
    """
    ends_too_early = end_datetime <= start_datetime
    if not ends_too_early:
        return

    message = f"end_datetime ({end_datetime}) must be after start_datetime ({start_datetime})."
    logger.error(message)
    raise ValueError(message)


def validate_time_range(start_datetime: pd.Timestamp, end_datetime: pd.Timestamp, frequency: str):
    """
    Ensure the span from start_datetime to end_datetime covers at least one
    step of the given frequency.

    Parameters:
    -----------
    start_datetime : pd.Timestamp
        Beginning of the time range.
    end_datetime : pd.Timestamp
        End of the time range.
    frequency : str
        Sampling interval, in a form accepted by ``pd.to_timedelta``
        (e.g., '30s', '1min').

    Raises:
    -------
    ValueError
        When the span is shorter than the frequency. The same message is
        written to the error log before raising.
    """
    span = end_datetime - start_datetime
    step = pd.to_timedelta(frequency)

    # A span equal to one step is still acceptable; only a shorter one fails.
    if not span < step:
        return

    error_message = (
        f"Time range is too short: {span} is smaller than the frequency {step}. "
        "Start datetime must be before end datetime by at least the frequency."
    )
    logger.error(error_message)
    raise ValueError(error_message)


In [None]:


class RawDataGenerator(ABC):
    """
    Abstract interface for raw-data generators.

    A concrete subclass must implement ``generate_stable_data``; the class
    cannot be instantiated until it does.
    """

    @abstractmethod
    def generate_stable_data(self) -> pd.DataFrame:
        """
        Generate the stable dataset.

        Returns:
        --------
        pd.DataFrame
            The generated data; its contents are defined by the subclass.
        """
        ...

class AnomalyInjection(ABC):
    """
    Abstract interface for components that insert anomalies into data.

    A concrete subclass must implement ``insert_anomalies``; the class cannot
    be instantiated until it does.
    """

    @abstractmethod
    def insert_anomalies(self) -> pd.DataFrame:
        """
        Insert anomalies and return the result.

        Returns:
        --------
        pd.DataFrame
            The data with anomalies inserted; the details are defined by the
            subclass.
        """
        ...


@dataclass
class IndustrialPumpRawData(RawDataGenerator):
    """
    Generator of a stable toy time series for an industrial pump system.

    Attributes:
    -----------
    start_datetime : datetime
        First timestamp of the series. It is normalised to timezone-aware UTC
        and written back to the instance when ``generate_stable_data`` runs.
    end_datetime : datetime
        Upper bound of the series, normalised the same way. It must be later
        than ``start_datetime`` by at least one ``frequency`` step.
    frequency : str
        Sampling interval passed to ``pd.date_range``. Defaults to '30s'.
    seed_for_random : int, optional
        Seed for reproducible output. Defaults to None.
    """
    start_datetime: datetime
    end_datetime: datetime
    frequency: str = '30s'
    seed_for_random: int | None = None

    def generate_stable_data(self) -> pd.DataFrame:
        """
        Generate a stable toy dataset for an industrial pump system with correlated variables.

        The method takes no arguments; it reads ``start_datetime``,
        ``end_datetime``, ``frequency`` and ``seed_for_random`` from the
        instance. The two datetimes are replaced on the instance by their
        UTC-normalised values.

        When ``seed_for_random`` is set, the numbers are drawn from a private
        ``np.random.RandomState`` seeded with it. This yields the same values as
        seeding the global NumPy generator with that seed, but leaves the global
        generator's state untouched. When it is None, the global ``np.random``
        functions are used.

        Returns:
        --------
        pd.DataFrame
            One row per timestamp, indexed by a UTC 'Timestamp' index, with the
            columns 'Temperature_C', 'Pressure_MPa', 'Vibration_mm_s',
            'Flow_Rate_l_min' and 'Humidity_%'.

        Raises:
        -------
        ValueError
            If end_datetime is not after start_datetime, or the range between
            them is shorter than the frequency.
        """
        # Validate and convert datetime inputs
        self.start_datetime = validate_and_convert_datetime(self.start_datetime)
        self.end_datetime = validate_and_convert_datetime(self.end_datetime)

        # Validate that end_datetime is after start_datetime
        validate_datetime_order(self.start_datetime, self.end_datetime)

        # Check that the time range is larger than or equal to the frequency
        validate_time_range(self.start_datetime, self.end_datetime, self.frequency)

        # Use a private generator when seeded so the process-wide NumPy RNG is
        # not reseeded as a side effect; fall back to the global functions
        # otherwise. Both expose the same ``normal`` API.
        if self.seed_for_random is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.seed_for_random)

        # Generate a date range
        date_range = pd.date_range(
            start=self.start_datetime, end=self.end_datetime, freq=self.frequency, tz="UTC"
        )
        number_of_rows = len(date_range)

        # Generate base data with correlations. The draw order below determines
        # the values produced for a given seed, so it must not be reordered.
        temperature = rng.normal(loc=75, scale=1, size=number_of_rows)
        # Pressure decreases slightly as temperature rises above 75.
        pressure = (
            3
            - 0.01 * (temperature - 75)
            + rng.normal(loc=0, scale=0.05, size=number_of_rows)
        )
        # Flow rate increases as pressure drops below 3.
        flow_rate = (
            300
            + 10 * (3 - pressure)
            + rng.normal(loc=0, scale=5, size=number_of_rows)
        )
        # Vibration follows the square root of flow rate times pressure.
        vibration = 0.1 * np.sqrt(flow_rate * pressure) + rng.normal(
            loc=0, scale=0.05, size=number_of_rows
        )
        # Humidity is drawn independently of the other variables.
        humidity = rng.normal(loc=40, scale=5, size=number_of_rows)

        # Assemble the frame and use Timestamp as the index
        return pd.DataFrame(
            {
                "Timestamp": date_range,
                "Temperature_C": temperature,
                "Pressure_MPa": pressure,
                "Vibration_mm_s": vibration,
                "Flow_Rate_l_min": flow_rate,
                "Humidity_%": humidity,
            }
        ).set_index("Timestamp")




