# DEMO version

### Set up the environmental variables
It's recommended to create a ".env" file to setup the logger configuration

You must set the 'PATH_TO_SAVE_THE_OUTCOMES' as an env variable
Example:
###### PATH_TO_SAVE_THE_OUTCOMES=/workspace/general_projects/database_toolkit/notebooks/tmp


In [1]:
from dotenv import load_dotenv
import os

# Load the .env file only if it exists
dotenv_path = '/workspace/general_projects/database_generator/.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path)
    print(f"Loaded environment variables from {dotenv_path}")
else:
    print(f"No .env file found at {dotenv_path}, relying on system environment variables.")

# # Access the environment variable, with a fallback
# path_to_logs = os.getenv('PATH_TO_SAVE_THE_LOGS')
# print(f"Logs will be saved to: {path_to_logs}")

# Access the environment variable, with a fallback
path_to_logs = os.getenv('PATH_TO_SAVE_THE_OUTCOMES')
print(f"Logs will be saved to: {path_to_logs}")

Loaded environment variables from /workspace/general_projects/database_generator/.env
Logs will be saved to: /workspace/general_projects/database_generator/notebooks/tmp


In [2]:
print(os.environ)

environ({'HOSTNAME': 'c38c0673b4b7', 'HOME': '/root', 'PYTHONUNBUFFERED': '1', 'GPG_KEY': 'A035C8C19219BA821ECEA86B64E628F8D684696D', 'PYTHON_SHA256': '07a4356e912900e61a15cb0949a06c4a05012e213ecd6b4e84d0f67aabbee372', 'PATH': '/workspace/general_projects/database_toolkit/.venv/bin:/vscode/vscode-server/bin/linux-x64/e8653663e8840adaf45af01eab5c627a5af81807/bin/remote-cli:/root/.pyenv/shims:/root/.pyenv/bin:/root/.local/bin:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'LANG': 'C.UTF-8', 'SHELL': '/usr/bin/zsh', 'PYTHON_VERSION': '3.11.10', 'PWD': '/vscode/vscode-server/bin/linux-x64/e8653663e8840adaf45af01eab5c627a5af81807', 'PYENV_ROOT': '/root/.pyenv', 'VSCODE_CWD': '/vscode/vscode-server/bin/linux-x64/e8653663e8840adaf45af01eab5c627a5af81807', 'VSCODE_NLS_CONFIG': '{"userLocale":"en","osLocale":"en","resolvedLanguage":"en","defaultMessagesFile":"/vscode/vscode-server/bin/linux-x64/e8653663e8840adaf45af01eab5c627a5af81807/out/nls.messages.json","local

1. Generating Standard Data
Create a dataset representing industrial pump operations.

In [3]:
from database.toolkit.data_generator import IndustrialPumpData
from datetime import datetime, timezone

start_datetime = datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc)
end_datetime = datetime(2025, 1, 4, 0, 0, tzinfo=timezone.utc)
frequency = '1h'
seed_for_random = 42
data_generator = IndustrialPumpData(
    start_datetime=start_datetime,
    end_datetime=end_datetime,
    frequency=frequency,
    seed_for_random=seed_for_random
)


In [4]:
standard_pump_data = data_generator.generate_standard_data()

[database.toolkit.data_generator] 2025-02-10 23:20:23 - INFO: A new standard dataset was created


In [5]:
standard_pump_data.columns

Index(['temperature_c', 'pressure_mpa', 'vibration_mm_s', 'flow_rate_l_min',
       'humidity_%', 'flag_normal_data'],
      dtype='object')

2. Introducing Anomalies
Apply an exponential anomaly to simulate increasing deviations.

In [6]:
from database.toolkit.data_generator import ExponentialAnomaly

exponential_anomaly_in_pressure = ExponentialAnomaly(
    start_datetime= datetime(2025, 1, 1, 0, 0, tzinfo=timezone.utc),
    end_datetime= datetime(2025, 1, 1, 23, 0, tzinfo=timezone.utc),
    variable_to_insert_anomalies='pressure_mpa',
    standard_data=standard_pump_data
)

In [7]:
anomalous_data = exponential_anomaly_in_pressure.introduce_anomaly()

[database.toolkit.data_generator] 2025-02-10 23:20:23 - DEBUG: Number of rows affected by the anomaly: 24
[database.toolkit.data_generator] 2025-02-10 23:20:23 - INFO:  A new dataset with 24 anomalies was created


In [8]:
anomalous_data.tail()

Unnamed: 0_level_0,temperature_c,pressure_mpa,vibration_mm_s,flow_rate_l_min,humidity_%,flag_normal_data
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-01 19:00:00+00:00,73.587696,3.198301,2.96771,302.273701,45.493884,False
2025-01-01 20:00:00+00:00,76.465649,3.20103,3.04897,304.420697,44.127082,False
2025-01-01 21:00:00+00:00,74.774224,3.227646,2.999454,309.657441,44.067548,False
2025-01-01 22:00:00+00:00,75.067528,3.184202,2.954695,299.51157,46.527394,False
2025-01-01 23:00:00+00:00,73.575252,3.300303,3.019278,295.940784,40.105019,False


In [9]:
anomalous_data.head()

Unnamed: 0_level_0,temperature_c,pressure_mpa,vibration_mm_s,flow_rate_l_min,humidity_%,flag_normal_data
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-01 00:00:00+00:00,75.496714,3.073265,3.005294,293.082596,37.053176,False
2025-01-01 01:00:00+00:00,74.861736,2.880885,3.024148,294.693763,44.24801,False
2025-01-01 02:00:00+00:00,75.647689,3.055707,2.935258,302.263525,41.785077,False
2025-01-01 03:00:00+00:00,76.52303,3.020922,3.036813,301.593703,36.535452,False
2025-01-01 04:00:00+00:00,74.765847,3.030014,2.919924,301.378553,44.497999,False


Apply an intermittent spike anomaly to simulate sudden outliers.

In [10]:
# Create an IntermittentSpikeAnomaly instance
from database.toolkit.data_generator import IntermittentSpikeAnomaly

spike_anomaly = IntermittentSpikeAnomaly(
    start_datetime=datetime(2025, 1, 2, 0, 0, tzinfo=timezone.utc),
    end_datetime=datetime(2025, 1, 2, 23, 59, tzinfo=timezone.utc),
    variable_to_insert_anomalies="pressure_mpa",
    standard_data=standard_pump_data,
    spike_fraction=0.30,
    spike_multiplier=100.0,
    seed_for_random=42
)

# Introduce anomalies
spike_data = spike_anomaly.introduce_anomaly()

[database.toolkit.data_generator] 2025-02-10 23:20:23 - DEBUG: Number of rows in the specified time range: 24
[database.toolkit.data_generator] 2025-02-10 23:20:23 - INFO: Introduced 7 spike anomalies in 'pressure_mpa'.


In [11]:
spike_data

Unnamed: 0_level_0,temperature_c,pressure_mpa,vibration_mm_s,flow_rate_l_min,humidity_%,flag_normal_data
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-02 08:00:00+00:00,74.986503,7.564503,3.048376,298.473341,34.969913,False
2025-01-02 16:00:00+00:00,75.738467,7.564503,3.102224,301.208414,40.379023,False
2025-01-02 00:00:00+00:00,74.455617,7.564503,3.029197,295.367462,43.409765,False
2025-01-02 18:00:00+00:00,74.884352,7.564503,2.983007,295.603693,44.875599,False
2025-01-02 11:00:00+00:00,73.779156,7.564503,2.913684,295.463353,43.958313,False
2025-01-02 09:00:00+00:00,73.942289,7.564503,3.221294,312.551982,33.929057,False
2025-01-02 13:00:00+00:00,73.04033,7.564503,3.040563,303.175781,43.141728,False


3. Creating a Combined Database
Merge multiple datasets while prioritizing anomalous records.

In [12]:
from database.toolkit.data_generator import SimpleDatabaseFactory
list_of_dfs = [standard_pump_data,
               anomalous_data,
               spike_data]

factory = SimpleDatabaseFactory(list_of_df=list_of_dfs, flag_column='flag_normal_data')
final_database = factory.create_database()

In [13]:
final_database.head()

Unnamed: 0_level_0,temperature_c,pressure_mpa,vibration_mm_s,flow_rate_l_min,humidity_%,flag_normal_data
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-02 13:00:00+00:00,73.04033,7.564503,3.040563,303.175781,43.141728,False
2025-01-01 00:00:00+00:00,75.496714,3.073265,3.005294,293.082596,37.053176,False
2025-01-01 01:00:00+00:00,74.861736,2.880885,3.024148,294.693763,44.24801,False
2025-01-01 02:00:00+00:00,75.647689,3.055707,2.935258,302.263525,41.785077,False
2025-01-01 03:00:00+00:00,76.52303,3.020922,3.036813,301.593703,36.535452,False


In [14]:
final_database.tail()

Unnamed: 0_level_0,temperature_c,pressure_mpa,vibration_mm_s,flow_rate_l_min,humidity_%,flag_normal_data
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-01-02 06:00:00+00:00,74.398293,2.965903,3.00517,300.405977,35.908897,True
2025-01-02 05:00:00+00:00,74.708306,2.985781,3.102419,304.278104,42.975785,True
2025-01-02 04:00:00+00:00,74.399361,2.984974,2.967451,301.533713,40.48498,True
2025-01-02 03:00:00+00:00,75.375698,2.925474,2.950833,302.451015,39.349285,True
2025-01-03 03:00:00+00:00,74.614918,3.033194,3.09081,300.436689,37.465284,True
