In [1]:
import numpy as np
import pandas as pd
from typing import Optional

import logging
import warnings
from datetime import datetime, timezone



In [None]:
def generate_stable_toy_data(
    number_of_rows: int,
    start_date: str,
    seed_for_random: Optional[int] = None
) -> pd.DataFrame:
    """
    Generates a DataFrame with stable time series data for pH, salinity, and temperature.
    These variables are just simple examples of variables that we might find in real datasets.

    Each series is a constant baseline (pH 7, salinity 35, temperature 20) plus
    standard-normal noise whose amplitude is modulated by a slow sine wave.

    Parameters:
    number_of_rows (int): Number of rows of data to generate.
    start_date (str): The start date for the time series data. e.g '2024-01-01'
    seed_for_random (Optional[int]): Seed for random number generation to ensure reproducibility.
                                     Defaults to None, in which case NumPy's global random
                                     state is left untouched.

    Returns:
    pd.DataFrame: A DataFrame with columns 'pH', 'salinity', 'temperature', and a
                  UTC date range index with one row every 5 seconds.
    """
    # Set the seed for reproducibility. An identity check is used (rather than
    # `== None`) so that array-like seeds are not compared elementwise.
    # NOTE: this reseeds NumPy's *global* random state.
    if seed_for_random is not None:
        np.random.seed(seed_for_random)

    # Generate time series data. The draw order (pH, salinity, temperature)
    # determines which random numbers each series receives for a given seed.
    t = np.arange(number_of_rows)
    pH = 7 + np.random.randn(number_of_rows) * np.sin(0.010 * t)
    salinity = 35 + np.random.randn(number_of_rows) * np.sin(0.018 * t)
    temperature = 20 + np.random.randn(number_of_rows) * np.sin(0.015 * t)

    # Create the DataFrame
    df = pd.DataFrame({
        'pH': pH,
        'salinity': salinity,
        'temperature': temperature
    })

    # Generate a date range and set as index
    df.index = pd.date_range(start=start_date, periods=number_of_rows, freq='5s', tz='UTC')

    return df

def generate_anomalous_toy_data(
    number_of_rows: int,
    start_date: str,
    anomaly_indices_spikes: list[int],
    anomaly_indices_drops: list[int],
    seed_for_random: Optional[int] = None
) -> tuple[pd.DataFrame, pd.DataFrame, int]:
    """
    Generates a DataFrame with anomalous time series data for pH, salinity, and temperature,
    including specified spikes and drops.

    The baseline signal is a constant (pH 7, salinity 35, temperature 20) plus
    sine-modulated standard-normal noise. Spikes add +15 to pH and +30 to salinity
    and temperature; drops subtract the same amounts.

    Parameters:
    number_of_rows (int): Number of rows of data to generate.
    start_date (str): The start date for the time series data.
    anomaly_indices_spikes (List[int]): Row positions at which to introduce spikes.
                                        Any sequence of ints (list, tuple, NumPy array) is accepted.
    anomaly_indices_drops (List[int]): Row positions at which to introduce drops.
                                       Any sequence of ints (list, tuple, NumPy array) is accepted.
    seed_for_random (Optional[int]): Seed for random number generation to ensure reproducibility. Defaults to None.

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame, int]: A tuple containing the DataFrame with anomalies,
                                             a DataFrame with only the anomalous rows
                                             (spikes first, then drops),
                                             and the total number of anomalies.

    Notes:
    - A position listed in both spikes and drops receives both offsets, which cancel
      out; it is still reported and counted as an anomaly.
    - A position repeated within one list is shifted only once, but is reported and
      counted once per occurrence.
    """
    # Set the seed for reproducibility (reseeds NumPy's global random state)
    if seed_for_random is not None:
        np.random.seed(seed_for_random)

    # Normalize the index inputs to plain lists. Without this, `a + b` below would
    # add NumPy arrays elementwise instead of concatenating them, and a tuple would
    # be interpreted by NumPy as a multi-dimensional index.
    spike_positions = list(anomaly_indices_spikes)
    drop_positions = list(anomaly_indices_drops)

    # Generate time series data. The draw order (pH, salinity, temperature)
    # determines which random numbers each series receives for a given seed.
    t = np.arange(number_of_rows)
    pH = 7 + np.random.randn(number_of_rows) * np.sin(0.010 * t)
    salinity = 35 + np.random.randn(number_of_rows) * np.sin(0.018 * t)
    temperature = 20 + np.random.randn(number_of_rows) * np.sin(0.015 * t)

    # Introduce spikes
    pH[spike_positions] += 15
    salinity[spike_positions] += 30
    temperature[spike_positions] += 30

    # Introduce drops
    pH[drop_positions] -= 15
    salinity[drop_positions] -= 30
    temperature[drop_positions] -= 30

    # Create the DataFrame
    df_anomalous = pd.DataFrame({
        'pH': pH,
        'salinity': salinity,
        'temperature': temperature
    })

    # Generate a date range and set as index
    df_anomalous.index = pd.date_range(start=start_date, periods=number_of_rows, freq='5s', tz='UTC')

    # Prepare anomalies DataFrame: translate row positions to timestamps, then select
    anomaly_indices = spike_positions + drop_positions
    anomalies_df = df_anomalous.loc[df_anomalous.index[anomaly_indices]]

    # Calculate total number of anomalies
    num_anomalies = len(anomaly_indices)

    return df_anomalous, anomalies_df, num_anomalies


