In [1]:
import numpy as np
import pandas as pd
from typing import Optional
from datetime import datetime, timezone, timedelta
import sys
import argparse
import os
import warnings
from typing import Optional, Literal, NewType
import json

import logging
from database_generator.logging_configuration import setup_logging_for_this_script
# Configure logging here, before the remaining project imports that follow.
# NOTE(review): presumably so loggers created in those modules use this configuration — confirm.
setup_logging_for_this_script()
# Get the logger for this module
logger = logging.getLogger(__name__)

from database_generator.helpers import (
    get_config_path,
    load_and_process_params,
)

from database_generator.get_data import (
    generate_stable_toy_data,
    introduce_exponential_anomalies,
    simulate_broken_sensor,
)

from database_generator.evaluate import (
    overlaid_plots_with_plotly,
)

from database_generator.evaluate import (
    overlaid_plots_with_plotly,
)

from database_generator.db_operations import(
    create_sql_alchemy_engine,
    get_last_timestamp,
    query_data_by_datetime,
    store_pandas_dataframe_into_postegre,
)

In [2]:
# import sys
# print(sys.executable)

# Accessing and reading the config file

In [3]:
# Get the path to the .json parameters file (per the original note, taken from the environment — TODO confirm in get_config_path).

path_for_the_json_file = get_config_path()
# Bare last expression: echo the resolved path as the cell output.
# NOTE(review): the saved output exposes an absolute local path; clear it before sharing the notebook.
path_for_the_json_file


'/home/aldo/Repositories/general_projects/database_generator/parameters_for_toy_data_experiments.json'

In [4]:
# Load the experiment parameters from the JSON file whose path was resolved above.
config_dict = load_and_process_params(path_for_the_json_file)

# NOTE(review): start_date_for_the_toy_dataset is read here, but the data-generation cell
# builds its window from pd.Timestamp.now(); confirm whether the configured date should be used.
start_date_for_the_toy_dataset = config_dict['start_date_for_the_toy_dataset']
# The row count is currently not read (left commented out).
# number_of_rows_for_stable_toy_data = config_dict['number_of_rows_for_stable_toy_data']
seed_for_the_stable_dataset = config_dict['seed_for_the_stable_dataset']


[database_generator.helpers] 2024-09-14 11:53:21 - INFO: number_of_rows_for_stable_toy_data = 10000
[database_generator.helpers] 2024-09-14 11:53:21 - INFO: seed_for_the_stable_dataset = 300
[database_generator.helpers] 2024-09-14 11:53:21 - INFO: start_date_for_the_toy_dataset = 2024-09-08 10:00:00+00:00
[database_generator.helpers] 2024-09-14 11:53:21 - INFO: variable_of_interest = pH
[database_generator.helpers] 2024-09-14 11:53:21 - INFO: path_to_save_the_outcomes = /home/aldo/Repositories/general_projects/database_generator/experiments
[database_generator.helpers] 2024-09-14 11:53:21 - INFO: table_name_to_be_created_on_postgresql = raw_data_version_1


# Create the stable data

In [5]:
# Build the baseline ("stable") dataset over the 24 hours that end at the
# current UTC instant, seeded from the configuration for repeatable values.
main_datetime_in_utc = pd.Timestamp.now(tz='UTC')
start_datetime_in_utc = main_datetime_in_utc - timedelta(days=1)

df_stable = generate_stable_toy_data(
    start_date=start_datetime_in_utc,
    end_date=main_datetime_in_utc,
    seed_for_random=seed_for_the_stable_dataset,
)

# Preview the first rows as the cell output.
df_stable.head()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-13 18:53:22.730405+00:00,73.51483,3.014574,3.077514,305.986272,37.970487
2024-09-13 18:53:52.730405+00:00,74.739795,3.0176,3.061136,303.005372,44.830887
2024-09-13 18:54:22.730405+00:00,73.575435,3.016737,3.084498,300.608989,28.452018
2024-09-13 18:54:52.730405+00:00,74.108601,2.952131,2.909786,305.601524,44.180012
2024-09-13 18:55:22.730405+00:00,75.761163,2.988104,3.121707,299.776045,43.127562


In [6]:
# Summarize the index range, column dtypes and non-null counts of the stable frame.
df_stable.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2881 entries, 2024-09-13 18:53:22.730405+00:00 to 2024-09-14 18:53:22.730405+00:00
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Temperature_C    2881 non-null   float64
 1   Pressure_MPa     2881 non-null   float64
 2   Vibration_mm_s   2881 non-null   float64
 3   Flow_Rate_l_min  2881 non-null   float64
 4   Humidity_%       2881 non-null   float64
dtypes: float64(5)
memory usage: 135.0 KB


In [7]:
# Inspect the final rows; the last timestamp is expected to match main_datetime_in_utc (the end_date passed to the generator).
df_stable.tail()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-14 18:51:22.730405+00:00,74.551775,3.025189,3.072515,296.300406,40.543644
2024-09-14 18:51:52.730405+00:00,76.116908,3.046058,2.992902,301.269924,44.875686
2024-09-14 18:52:22.730405+00:00,76.93627,2.990198,3.001342,297.840568,40.337615
2024-09-14 18:52:52.730405+00:00,73.391449,3.101057,2.99458,295.921709,39.376991
2024-09-14 18:53:22.730405+00:00,73.539493,2.978541,2.962288,302.059977,44.125455


### Visualize the generated data

In [8]:
# Overlay the stable signals in one Plotly figure without writing it to disk.
# scatter_variables and variable_of_interest are deliberately not passed here
# (e.g. ['Vibration_mm_s', 'Flow_Rate_l_min'] / 'Temperature_C'), so the
# plotting helper falls back to its own defaults.
fig_stable = overlaid_plots_with_plotly(
    df=df_stable,
    save_plot=False,
)

In [9]:
# Render the figure of the stable data built in the previous cell.
fig_stable.show()

# Create the two types of anomalies to evaluate

### Problem 1: Bearing Wear
Description: Over time, the bearings in the pump might wear out, causing an increase in vibration levels.


### Problem 5: Broken Temperature Sensor
Description: The temperature sensor might malfunction or break, leading to inaccurate or stuck readings.

- Stuck Readings: The sensor gets "stuck" at a constant value, providing the same reading for a period of time.

- Sudden Jumps: The sensor might suddenly jump to an unusually high or low value, remaining there for some time.

- Intermittent Spikes: The sensor occasionally produces spikes of incorrect readings, either very high or very low.

- Dropouts: The sensor might stop reporting data altogether, which could be simulated as missing values (NaN).

In [16]:
# Bearing wear: apply the exponential anomaly to the vibration channel over
# the four hours that start where the stable window ends.
start_time_anomaly_exponential = main_datetime_in_utc
end_time_anomaly_exponential = start_time_anomaly_exponential + timedelta(hours=4)

df_with_anomaly_exponential = introduce_exponential_anomalies(
    variable='Vibration_mm_s',
    start_date=start_time_anomaly_exponential,
    end_date=end_time_anomaly_exponential,
    increase_rate=0.01,
)


[database_generator.get_data] 2024-09-14 11:54:53 - INFO: Number of rows affected by the anomaly: 481
[database_generator.get_data] 2024-09-14 11:54:53 - DEBUG: Masked DataFrame rows:
                                  Temperature_C  Pressure_MPa  Vibration_mm_s  \
Timestamp                                                                       
2024-09-14 18:53:22.730405+00:00      73.774585      3.009777        3.074307   
2024-09-14 18:53:52.730405+00:00      76.402277      2.967995        2.940000   
2024-09-14 18:54:22.730405+00:00      77.171488      2.983974        3.029532   
2024-09-14 18:54:52.730405+00:00      73.431526      3.006599        2.919340   
2024-09-14 18:55:22.730405+00:00      73.679918      3.008551        2.898663   

                                  Flow_Rate_l_min  Humidity_%  
Timestamp                                                      
2024-09-14 18:53:22.730405+00:00       300.793532   39.367034  
2024-09-14 18:53:52.730405+00:00       302.052203   36.6

In [17]:
# Last rows of the bearing-wear frame, to check how far Vibration_mm_s has grown by the end of the anomaly window.
df_with_anomaly_exponential.tail()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-14 22:51:22.730405+00:00,74.328133,2.980727,119.94715,299.012928,39.342022
2024-09-14 22:51:52.730405+00:00,75.205748,3.053513,121.121264,295.698692,45.925745
2024-09-14 22:52:22.730405+00:00,73.474488,2.993726,122.292653,295.089002,42.498895
2024-09-14 22:52:52.730405+00:00,75.169848,2.945165,123.499912,310.761671,41.393374
2024-09-14 22:53:22.730405+00:00,73.225989,2.97364,124.820231,306.453885,44.54807


In [18]:
# Last rows of the stable frame, for comparison with the anomaly frame's tail shown in the previous cell.
df_stable.tail()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-14 18:51:22.730405+00:00,74.551775,3.025189,3.072515,296.300406,40.543644
2024-09-14 18:51:52.730405+00:00,76.116908,3.046058,2.992902,301.269924,44.875686
2024-09-14 18:52:22.730405+00:00,76.93627,2.990198,3.001342,297.840568,40.337615
2024-09-14 18:52:52.730405+00:00,73.391449,3.101057,2.99458,295.921709,39.376991
2024-09-14 18:53:22.730405+00:00,73.539493,2.978541,2.962288,302.059977,44.125455


In [19]:
# Visualize the bearing-wear frame; the figure is kept in memory only.
# scatter_variables (e.g. ['Vibration_mm_s']) and variable_of_interest
# (e.g. 'Temperature_C') are not passed, so the helper's defaults apply.
fig_anomaly_exponential = overlaid_plots_with_plotly(
    df_with_anomaly_exponential,
    save_plot=False,
)



In [20]:
# Render the bearing-wear figure built in the previous cell.
fig_anomaly_exponential.show()

In [18]:
# Fault window for the stuck temperature sensor: the first three hours of df_stable.
# The previous window started at end_time_anomaly_exponential (four hours after the
# last stable sample) and ended at start_datetime_in_utc + 3h, so it ended before it
# began and could not select any row of df_stable. The end of the window is unchanged;
# the start is now anchored to the beginning of the stable data.
start_datetime_broken_sensor = start_datetime_in_utc
end_datetime_broken_sensor = start_datetime_broken_sensor + timedelta(hours=3)

# Fail fast if the window does not overlap the frame that is about to be modified.
rows_in_stuck_window = ((df_stable.index >= start_datetime_broken_sensor)
                        & (df_stable.index <= end_datetime_broken_sensor)).sum()
assert rows_in_stuck_window > 0, "Broken-sensor window does not overlap df_stable"

# Simulate a sensor stuck at a constant value
df_with_sensor_issue_stuck = simulate_broken_sensor(df=df_stable,
                                                    variable='Temperature_C',
                                                    start_time=start_datetime_broken_sensor,
                                                    end_time=end_datetime_broken_sensor,
                                                    mode='stuck'
                                                    )

# Plot the data to see the effect of the anomaly
fig_anomaly_stuck = overlaid_plots_with_plotly(df_with_sensor_issue_stuck, variable_of_interest='Temperature_C', save_plot=False)

In [19]:
# Render the stuck-sensor figure built in the previous cell.
fig_anomaly_stuck.show()

In [20]:
# Last rows of the stuck-sensor frame, for comparison with df_stable.tail() in the next cell.
df_with_sensor_issue_stuck.tail()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-14 18:07:22.992756+00:00,74.551775,3.025189,3.072515,296.300406,40.543644
2024-09-14 18:07:52.992756+00:00,76.116908,3.046058,2.992902,301.269924,44.875686
2024-09-14 18:08:22.992756+00:00,76.93627,2.990198,3.001342,297.840568,40.337615
2024-09-14 18:08:52.992756+00:00,73.391449,3.101057,2.99458,295.921709,39.376991
2024-09-14 18:09:22.992756+00:00,73.539493,2.978541,2.962288,302.059977,44.125455


In [21]:
# Last rows of the source frame that was passed to simulate_broken_sensor.
# NOTE(review): presumably shown to check that df_stable itself was not modified — confirm.
df_stable.tail()

Unnamed: 0_level_0,Temperature_C,Pressure_MPa,Vibration_mm_s,Flow_Rate_l_min,Humidity_%
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-14 18:07:22.992756+00:00,74.551775,3.025189,3.072515,296.300406,40.543644
2024-09-14 18:07:52.992756+00:00,76.116908,3.046058,2.992902,301.269924,44.875686
2024-09-14 18:08:22.992756+00:00,76.93627,2.990198,3.001342,297.840568,40.337615
2024-09-14 18:08:52.992756+00:00,73.391449,3.101057,2.99458,295.921709,39.376991
2024-09-14 18:09:22.992756+00:00,73.539493,2.978541,2.962288,302.059977,44.125455


In [22]:
# Simulate a sudden jump: the temperature sensor moves to an unusual level and
# stays there for the duration of the window.
# The window is defined relative to the generated data (hours 12-15 of df_stable).
# The previous hard-coded dates (2024-09-20 .. 2024-09-27) fall outside a frame
# that only covers the 24 hours before "now", so they selected no rows.
start_datetime_jump = start_datetime_in_utc + timedelta(hours=12)
end_datetime_jump = start_datetime_jump + timedelta(hours=3)

# Fail fast if the window does not overlap the frame that is about to be modified.
rows_in_jump_window = ((df_stable.index >= start_datetime_jump)
                       & (df_stable.index <= end_datetime_jump)).sum()
assert rows_in_jump_window > 0, "Jump window does not overlap df_stable"

df_with_sensor_issue_jump = simulate_broken_sensor(df=df_stable,
                                                   variable='Temperature_C',
                                                   start_time=start_datetime_jump,
                                                   end_time=end_datetime_jump,
                                                   mode='jump'
                                                   )

# Plot the data to see the effect of the anomaly
fig_anomaly_jump = overlaid_plots_with_plotly(df_with_sensor_issue_jump, variable_of_interest='Temperature_C', save_plot=False)

In [23]:
# Simulate intermittent spikes: the temperature sensor occasionally reports
# incorrect very high or very low readings inside the window.
# The window is defined relative to the generated data (hours 12-15 of df_stable).
# The previous hard-coded dates (2024-09-20 .. 2024-09-27) fall outside a frame
# that only covers the 24 hours before "now", so they selected no rows.
start_datetime_spike = start_datetime_in_utc + timedelta(hours=12)
end_datetime_spike = start_datetime_spike + timedelta(hours=3)

# Fail fast if the window does not overlap the frame that is about to be modified.
rows_in_spike_window = ((df_stable.index >= start_datetime_spike)
                        & (df_stable.index <= end_datetime_spike)).sum()
assert rows_in_spike_window > 0, "Spike window does not overlap df_stable"

df_with_sensor_issue_spike = simulate_broken_sensor(df=df_stable,
                                                    variable='Temperature_C',
                                                    start_time=start_datetime_spike,
                                                    end_time=end_datetime_spike,
                                                    mode='spike'
                                                    )

# Plot the data to see the effect of the anomaly
fig_anomaly_spike = overlaid_plots_with_plotly(df_with_sensor_issue_spike, variable_of_interest='Temperature_C', save_plot=False)

In [24]:
# Simulate dropouts: the temperature sensor stops reporting inside the window
# (described in the notes above as missing values, NaN).
# The window is defined relative to the generated data (hours 12-15 of df_stable).
# The previous hard-coded dates (2024-09-20 .. 2024-09-27) fall outside a frame
# that only covers the 24 hours before "now", so they selected no rows.
start_datetime_dropout = start_datetime_in_utc + timedelta(hours=12)
end_datetime_dropout = start_datetime_dropout + timedelta(hours=3)

# Fail fast if the window does not overlap the frame that is about to be modified.
rows_in_dropout_window = ((df_stable.index >= start_datetime_dropout)
                          & (df_stable.index <= end_datetime_dropout)).sum()
assert rows_in_dropout_window > 0, "Dropout window does not overlap df_stable"

df_with_sensor_issue_dropout = simulate_broken_sensor(df=df_stable,
                                                      variable='Temperature_C',
                                                      start_time=start_datetime_dropout,
                                                      end_time=end_datetime_dropout,
                                                      mode='dropout'
                                                      )

# Plot the data to see the effect of the anomaly
fig_anomaly_dropout = overlaid_plots_with_plotly(df_with_sensor_issue_dropout, variable_of_interest='Temperature_C', save_plot=False)

# Service Architecture Overview


## A. Components

- **Data Simulation Module**
  - **Purpose:** Generate synthetic datasets that represent normal operational conditions.
  - **Functions:**
    - `generate_stable_toy_data`: Generates the baseline dataset.
    - `introduce_exponential_anomalies`: Simulates anomalies like bearing wear.
    - `simulate_broken_sensor`: Introduces faults such as sensor failures.

- **Data Ingestion Service**
  - **Purpose:** Writes the generated/simulated data into a PostgreSQL database.
  - **Components:** A script or service (e.g., using Python and psycopg2 or SQLAlchemy) that connects to the PostgreSQL database and inserts the generated data.

- **PostgreSQL Database**
  - **Purpose:** Stores the simulated data. This data can be queried by other services/modules for anomaly detection and analysis.
  - **Schema Design:**
    - **Table Structure:** Design tables to hold time-series data with columns like `timestamp`, `vibration_level`, `pressure`, `temperature`, `anomaly_flag`, etc.
    - **Indexing:** Ensure the `timestamp` field is indexed for efficient querying.

- **Anomaly Detection Module (External Service)**
  - **Purpose:** Consumes data from the PostgreSQL database, applies anomaly detection algorithms, and flags potential issues.
  - **Data Flow:** Queries the database periodically or in real time and applies models such as autoencoders, statistical methods, or other machine learning algorithms.

- **Monitoring and Logging**
  - **Purpose:** Monitors the service performance, logs errors, and ensures data integrity.
  - **Components:** Tools like Prometheus for monitoring and Grafana for visualization. Logs can be stored locally or in a logging service.

## B. Workflow
1. **Data Generation and Simulation:** The service runs the `generate_stable_toy_data` function, periodically or on demand, to create a stable dataset. Anomalies are introduced using the `introduce_exponential_anomalies` and `simulate_broken_sensor` functions.

2. **Data Ingestion:** The simulated data is sent to the Data Ingestion Service, which connects to the PostgreSQL database and inserts the data into the appropriate tables.

3. **Database Storage:** The PostgreSQL database stores the time-series data along with any anomaly flags or metadata that might be useful for downstream analysis.

4. **Anomaly Detection:** The Anomaly Detection Module queries the database, retrieves the data, and applies algorithms to detect anomalies. Detected anomalies are flagged and stored back in the database or sent to an alerting system.

5. **Monitoring:** The entire process is monitored for performance and reliability. Logs are reviewed to ensure data integrity, and alerts are triggered for any unexpected behavior.


## Implementation Steps

- **A. Setting Up the PostgreSQL Database**
  - **Create the Database:** Install PostgreSQL and create a new database for the service.
  - **Design the Schema:** Define tables for storing the time-series data, ensuring that they are normalized and indexed appropriately.

- **B. Implement the Data Simulation Module**
  - **Refactor Existing Functions:** Refactor `generate_stable_toy_data`, `introduce_exponential_anomalies`, and `simulate_broken_sensor` to be callable by the service.
  - **Integrate with the Data Ingestion Service:** Implement a Python script or service that runs these functions and writes the results to the PostgreSQL database.

- **C. Implement the Data Ingestion Service**
  - **Database Connection:** Use libraries like psycopg2 or SQLAlchemy to connect to the PostgreSQL database.
  - **Data Insertion Logic:** Implement the logic to insert the generated data into the database, ensuring proper handling of timestamps and other relevant metadata.

- **D. Anomaly Detection Integration**
  - **Develop or Integrate Anomaly Detection Algorithms:** Implement or integrate existing anomaly detection algorithms that will consume the data from the PostgreSQL database.
  - **Store Results:** Store the results of the anomaly detection in the same database or send them to a monitoring/alerting system.

- **E. Monitoring and Logging**
  - **Set Up Monitoring Tools:** Use Prometheus to monitor service metrics and Grafana to visualize them.
  - **Implement Logging:** Ensure that all critical operations are logged, and set up error-handling mechanisms.

## Tools and Technologies
- **Python:** Core language for scripting, data simulation, and ingestion.
- **PostgreSQL:** Database for storing and querying time-series data.
- **SQLAlchemy/psycopg2:** Libraries for database interaction.
- **Prometheus/Grafana:** Monitoring and visualization.
- **Docker (optional):** For containerizing the service to ensure consistency across environments.

## Future Considerations
- **Scalability:** Ensure the system can handle increasing volumes of data as the service expands.
- **Real-Time Processing:** Consider integrating real-time data processing pipelines if needed.
- **Security:** Implement proper security measures for database access and data handling.

# Identify the Services to Containerize
Based on this architecture, the following components can be containerized:

- Data Simulation and Ingestion Service
- PostgreSQL Database
- Anomaly Detection Service
- Monitoring and Logging Tools (Prometheus and Grafana)


### Putting It All Together with Docker Compose
Use Docker Compose to orchestrate all the services.

### Suggested SQLAlchemy Methods to Build:

- Setup: Connecting to the Database
- Create a Table for Pump Data
- Insert Data into the Table
- Query Data from the Table
- Update Data in the Table
- Delete Data from the Table
- Use SQLAlchemy ORM to Define Models and Perform CRUD Operations