# Generate + Filter data in the same File

In [None]:
import os
import time

# Wall-clock reference taken before any setup work; the setup-time report
# later in the notebook subtracts this value.
start = time.time()

# Spark distribution to use. The shell cells read these through the
# environment (SPARK_RELEASE / HADOOP_VERSION).
spark_release = 'spark-3.4.2'
hadoop_version = 'hadoop3'

# Export everything in one go: release identifiers plus the Java and Spark
# install locations.
os.environ.update(
    {
        'SPARK_RELEASE': spark_release,
        'HADOOP_VERSION': hadoop_version,
        'JAVA_HOME': "/usr/lib/jvm/java-8-openjdk-amd64",
        'SPARK_HOME': "/content/" + spark_release + "-bin-" + hadoop_version,
    }
)

In [None]:
!pip install faker pysqlite3
!pip install mysql.connector
!pip install pyspark



## Spark

In [None]:
# Run below commands in google colab
# These shell commands install Java 8, download and unpack the Spark
# distribution named by the SPARK_RELEASE / HADOOP_VERSION environment
# variables, and install findspark.
# NOTE(review): the trailing "spark-3.3.X" remark on the wget line is stale —
# the release actually downloaded is whatever SPARK_RELEASE is set to.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install Java8
!wget -q http://apache.osuosl.org/spark/${SPARK_RELEASE}/${SPARK_RELEASE}-bin-${HADOOP_VERSION}.tgz # download spark-3.3.X
!tar xf ${SPARK_RELEASE}-bin-${HADOOP_VERSION}.tgz # unzip it
!pip install -q findspark # install findspark

In [None]:
import multiprocessing
import multiprocessing
import pyspark
import socket
import uuid
import findspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql.streaming import DataStreamReader

import sqlite3
from faker import Faker
import random
import datetime
import json
# Shared Faker instance; the generator functions use it for random
# timestamps and city names.
fake=Faker()

In [None]:
# findspark locates the Spark distribution and sets the necessary environment variables
findspark.init()

In [None]:
# Check the pyspark version
# NOTE(review): the recorded output is 3.5.0 while SPARK_RELEASE is
# spark-3.4.2 — confirm the two are meant to differ.
print(pyspark.__version__)

3.5.0


In [None]:
# Look up this machine's hostname.
# (Original note: a DStream could connect to hostname:port, like localhost:9999;
# if doing this over a network, firewalls may block the connection!)
hostname=socket.gethostname()

# Bare expression: shows the value as the cell output.
hostname

'05a220fe72d3'

In [None]:
# Unique identifier for this run; used as 'spark.app.name' in the Spark configuration.
app_id=str(uuid.uuid1())

# Bare expression: shows the value as the cell output.
app_id

'2173ace4-a9bd-11ee-9fbc-0242ac1c000c'

In [None]:
# Spark settings sized for a small single-machine (Colab) runtime.
# NOTE(review): the SparkSession builders in the cells visible here do not pass
# this object (e.g. via .config(conf=conf)) — confirm where it is applied.
conf = SparkConf()

conf.setAll([
     ('spark.app.name', app_id),
     ('spark.shuffle.useOldFetchProtocol', 'true'),
     ('spark.testing', 'true'), # Avoid minimum 450M executor/driver memory https://www.waitingforcode.com/apache-spark/troubleshooting-system-memory-must-be-at-least-error/read / https://programmerclick.com/article/72821685476/
     ('spark.driver.allowMultipleContexts','true'), # https://stackoverflow.com/a/41591258 This option is used only for Spark internal tests and is not to be used in production.
     ('spark.executor.memory', '100M'), # executor heap size; below the usual minimum, which 'spark.testing' above permits
     # ('spark.driver.memory ', '200M'),
     # ('spark.executor.instances',1), # This property is no longer used in Spark 2+
     # number of executors is determined as: floor(spark.cores.max / spark.executor.cores)
     ("spark.executor.cores",1), # cores per executor. https://stackoverflow.com/questions/39399205/spark-standalone-number-executors-cores-control/39400195#39400195
     ("spark.cores.max", 2), # the maximum amount of CPU cores to request for the application from across the cluster (not from each machine)
     ('spark.submit.deployMode', 'client'), # client, cluster
     ('spark.ui.showConsoleProgress', 'true'),
     ("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem") ])

<pyspark.conf.SparkConf at 0x7eca804c7490>

In [None]:
# Stop the timer that was started (as `start`) in the first cell.
end=time.time()


# Bare expression: shows the elapsed setup time, truncated to whole seconds.
f'Spark setup time: {int(end-start)} seconds'

'Spark setup time: 115 seconds'

In [None]:
#!unzip mysql-connector-j-8.2.0.zip

In [None]:
#Set up directories for later usage
!mkdir glucose_readings_dir device_readings_dir alerts_dir

mkdir: cannot create directory ‘glucose_readings_dir’: File exists
mkdir: cannot create directory ‘device_readings_dir’: File exists
mkdir: cannot create directory ‘alerts_dir’: File exists


## Generate Data
1. Take patient data stored in MySQL (e.g. ID)
2. Generate Glucose Readings Data — use the TimeSynth library to give the health data a more realistic time dependency
3. Generate Device Readings Data — use Faker

In [None]:
import mysql.connector

# MySQL database credentials.
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD and DB_NAME are placeholders that
# must be defined before this cell runs.
host = DB_HOST
port = DB_PORT
username = DB_USERNAME
password = DB_PASSWORD
database = DB_NAME


# Establish a connection to the MySQL database. The configured port is passed
# explicitly so that a non-default DB_PORT is actually honoured.
cnx = mysql.connector.connect(user=username, password=password,
                              host=host, port=port, database=database)

# The connection and cursor are deliberately left open; they stay available as
# `cnx` / `cursor` for any later cell that needs them.
cursor = cnx.cursor()

# Sanity check: number of rows in the Patient table.
sql_str = 'SELECT COUNT(*) FROM Patient;'
cursor.execute(sql_str)
rs = cursor.fetchall()
print(rs)

# Fetch existing device IDs
cursor.execute("SELECT device_id FROM DeviceStaticInfo")
device_ids = [row[0] for row in cursor.fetchall()]
print(device_ids)

# Fetch existing patient IDs
cursor.execute("SELECT patient_id FROM Patient")
patient_ids = [row[0] for row in cursor.fetchall()]
print(patient_ids)

[(20,)]
[1, 2, 5, 3, 4]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [None]:
def generate_device_readings(iterations, device_ids):
    """Write synthetic device telemetry to JSON files, one file per iteration.

    Each iteration produces exactly one record per entry in ``device_ids`` and
    dumps that batch to a new timestamp-named file inside
    ``./device_readings_dir/`` (the directory must already exist).

    Args:
        iterations: Number of files (batches) to generate.
        device_ids: Device identifiers; one record is generated for each.
    """
    connectivity_states = ['Connected', 'Disconnected', 'Poor Connection']
    error_pool = ['Err1', 'Err2', 'Err3', '']

    def make_firmware_version():
        # "major.minor.patch" with major in 1..3 and minor/patch in 0..9.
        parts = [random.randint(1, 3), random.randint(0, 9), random.randint(0, 9)]
        return ".".join(str(part) for part in parts)

    def make_error_code():
        # Coin flip first; a code (possibly the empty one) is drawn only on True.
        return random.choice(error_pool) if random.choice([True, False]) else ''

    def make_record(device_id):
        # Random moment within the last 30 days.
        stamp = fake.date_time_between(start_date="-30d", end_date="now")
        return {
            'device_id': device_id,
            'timestamp': stamp.strftime('%Y-%m-%d %H:%M:%S'),
            'battery_level': random.randint(0, 100),  # percent
            'firmware_version': make_firmware_version(),
            'connectivity_status': random.choice(connectivity_states),
            'error_codes': make_error_code(),
        }

    for _ in range(iterations):
        # Every iteration starts from a fresh batch, so files never accumulate
        # records from earlier iterations.
        batch = [make_record(device_id) for device_id in device_ids]

        out_path = f'./device_readings_dir/device_operational_data_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")}.json'
        with open(out_path, 'w') as file:
            json.dump(batch, file, indent=4)
            print(f"Generated {len(batch)} device operational data records and saved them to '{file.name}'")

In [None]:
# Install TimeSynth from a fresh clone of its GitHub repository.
!git clone https://github.com/TimeSynth/TimeSynth.git
%cd TimeSynth
!pip install .
# NOTE(review): this second install by package name follows the local
# `pip install .` above — confirm both are needed.
!pip install timesynth
%cd ..
#Then restart runtime

fatal: destination path 'TimeSynth' already exists and is not an empty directory.
/content/TimeSynth
Processing /content/TimeSynth
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: timesynth
  Building wheel for timesynth (setup.py) ... [?25l[?25hdone
  Created wheel for timesynth: filename=timesynth-0.2.4-py3-none-any.whl size=15422 sha256=12b55bf7ef3132bee314233acee5e48afa559bb510b5033f6b8ff89ca777cb8d
  Stored in directory: /tmp/pip-ephem-wheel-cache-dfk3s13e/wheels/81/84/c6/880288cb791ed65eb7343a49a4918038d6e3bf7622d3d187e0
Successfully built timesynth
Installing collected packages: timesynth
  Attempting uninstall: timesynth
    Found existing installation: timesynth 0.2.4
    Uninstalling timesynth-0.2.4:
      Successfully uninstalled timesynth-0.2.4
Successfully installed timesynth-0.2.4
/content


In [None]:
import timesynth as ts
import numpy as np
import random
from faker import Faker


def generate_glucose_readings(iterations, patient_ids, device_ids):
    """Write synthetic glucose readings to JSON files, one file per iteration.

    Every patient is assigned a random profile once per call. In each iteration
    a noisy sinusoid sample is drawn per patient, scaled and shifted according
    to that profile, and the resulting readings are dumped to a new
    timestamp-named file inside ``./glucose_readings_dir/`` (the directory must
    already exist).

    Args:
        iterations: Number of files (batches) to generate.
        patient_ids: Patient identifiers; each gets readings in every file.
        device_ids: Device identifiers; one is picked at random per reading.
    """
    # Define user profiles
    user_profiles = ['diabetic', 'athlete', 'party-goer', 'low-glucose', 'elderly']

    # Map patient IDs to user profiles (fixed for the duration of this call)
    patient_profile_map = {patient_id: random.choice(user_profiles) for patient_id in patient_ids}

    # TimeSynth setup: sinusoidal signal plus Gaussian noise, sampled at a
    # single regular time point per draw.
    time_sampler = ts.TimeSampler(stop_time=1)
    regular_time_samples = time_sampler.sample_regular_time(num_points=1)
    sinusoid = ts.signals.Sinusoidal(frequency=0.25)
    white_noise = ts.noise.GaussianNoise(std=0.3)
    timeseries = ts.TimeSeries(sinusoid, noise_generator=white_noise)

    # (scale, offset) applied to the raw sample for each profile.
    profile_transforms = {
        'diabetic': (40, 130),     # Higher readings
        'athlete': (20, 90),       # Lower post-exercise readings
        'party-goer': (50, 130),   # Variable readings
        'low-glucose': (15, 55),   # Dangerously low readings
        'elderly': (35, 145),      # Steady/higher readings
    }
    default_transform = (40, 110)  # Normal range for any other profile

    # Glucose reading generator based on user profile
    def generate_glucose_reading(profile):
        samples, _, _ = timeseries.sample(regular_time_samples)
        scale, offset = profile_transforms.get(profile, default_transform)
        return samples * scale + offset

    for _ in range(iterations):
        # Fresh list per iteration so each file only holds its own readings.
        readings = []

        for patient_id in patient_ids:
            glucose_levels = generate_glucose_reading(patient_profile_map[patient_id])

            for glucose_level in glucose_levels:
                readings.append({
                    'patient_id': patient_id,
                    'device_id': random.choice(device_ids),
                    'glucose_level': round(glucose_level, 2),
                    'timestamp': fake.date_time_between(start_date="-30d", end_date="now").strftime('%Y-%m-%d %H:%M:%S'),
                    'location': fake.city()
                })

        # Write the generated readings to a JSON file
        with open(f'./glucose_readings_dir/glucose_readings_corrected_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")}.json', 'w') as file:
            json.dump(readings, file, indent=4)
            # Report what this file actually contains; `iterations` is the
            # number of files, not the number of readings per patient.
            print(f"Generated {len(readings)} glucose reading(s) for {len(patient_ids)} patient(s) and saved them to '{file.name}'")


In [None]:
# Generate device data: 1 reading per device — a single JSON file with all readings.
# By default we have 5 devices.
generate_device_readings(1, device_ids)

# Generate glucose readings: 15 iterations, each writing its own JSON file with
# one reading per patient (15 files in total).
# By default we have 20 patients.
generate_glucose_readings(15, patient_ids, device_ids )

Generated 5 device operational data records and saved them to './device_readings_dir/device_operational_data_2024_01_02_22_16_23_182121.json'
Generated 15 glucose reading(s) for each patient and saved them to './glucose_readings_dir/glucose_readings_corrected_2024_01_02_22_16_23_202632.json'
Generated 15 glucose reading(s) for each patient and saved them to './glucose_readings_dir/glucose_readings_corrected_2024_01_02_22_16_23_212910.json'
Generated 15 glucose reading(s) for each patient and saved them to './glucose_readings_dir/glucose_readings_corrected_2024_01_02_22_16_23_221518.json'
Generated 15 glucose reading(s) for each patient and saved them to './glucose_readings_dir/glucose_readings_corrected_2024_01_02_22_16_23_230911.json'
Generated 15 glucose reading(s) for each patient and saved them to './glucose_readings_dir/glucose_readings_corrected_2024_01_02_22_16_23_240336.json'
Generated 15 glucose reading(s) for each patient and saved them to './glucose_readings_dir/glucose_read

If you want to generate data in the background:

In [None]:
# Create one background process per generator so the notebook stays responsive
# while data is being written (a single iteration each).
device_readings_process = multiprocessing.Process(target=generate_device_readings, args=(1, device_ids))
glucose_readings_process = multiprocessing.Process(target=generate_glucose_readings, args=(1, patient_ids, device_ids))

# Start both processes. They are not joined in this cell, so the cell returns
# immediately.
device_readings_process.start()
glucose_readings_process.start()

Show Generated Data

In [None]:
import json

# Load one generated glucose file and print every reading it contains.
# NOTE: the file name embeds its generation timestamp, so this path must be
# changed to a file that actually exists in glucose_readings_dir.
# The context manager closes the file even if parsing fails.
with open('glucose_readings_dir/glucose_readings_corrected_2024_01_02_11_42_06_426773.json') as f:
    # The file's top-level JSON value is a list of reading dictionaries.
    data = json.load(f)

# Print one reading per line
for i in data:
    print(i)

{'patient_id': 1, 'device_id': 3, 'glucose_level': 107.58, 'timestamp': '2023-12-23 04:04:09', 'location': 'South John'}
{'patient_id': 2, 'device_id': 2, 'glucose_level': 119.24, 'timestamp': '2023-12-22 16:22:07', 'location': 'Reneeland'}
{'patient_id': 3, 'device_id': 2, 'glucose_level': 128.09, 'timestamp': '2023-12-08 11:26:49', 'location': 'Morgantown'}
{'patient_id': 4, 'device_id': 4, 'glucose_level': 147.17, 'timestamp': '2023-12-12 23:09:57', 'location': 'North Martinstad'}
{'patient_id': 5, 'device_id': 4, 'glucose_level': 135.7, 'timestamp': '2023-12-26 21:58:35', 'location': 'North Vincentmouth'}
{'patient_id': 6, 'device_id': 4, 'glucose_level': 153.98, 'timestamp': '2023-12-10 08:22:53', 'location': 'Christophershire'}
{'patient_id': 7, 'device_id': 1, 'glucose_level': 134.78, 'timestamp': '2023-12-19 12:37:24', 'location': 'West Sarah'}
{'patient_id': 8, 'device_id': 5, 'glucose_level': 126.55, 'timestamp': '2023-12-11 20:08:11', 'location': 'Shannonland'}
{'patient_id'

In [None]:
import json

# Load one generated device file and print every record it contains.
# NOTE: the file name embeds its generation timestamp, so this path must be
# changed to a file that actually exists in device_readings_dir.
# The context manager closes the file even if parsing fails.
with open('/content/device_readings_dir/device_operational_data_2024_01_02_11_46_16_130197.json') as f:
    # The file's top-level JSON value is a list of device record dictionaries.
    data = json.load(f)

# Print one record per line
for i in data:
    print(i)

{'device_id': 1, 'timestamp': '2023-12-30 08:09:24', 'battery_level': 57, 'firmware_version': '1.3.7', 'connectivity_status': 'Disconnected', 'error_codes': ''}
{'device_id': 2, 'timestamp': '2023-12-17 18:18:04', 'battery_level': 55, 'firmware_version': '2.7.2', 'connectivity_status': 'Poor Connection', 'error_codes': ''}
{'device_id': 5, 'timestamp': '2023-12-13 02:24:01', 'battery_level': 15, 'firmware_version': '2.9.0', 'connectivity_status': 'Disconnected', 'error_codes': ''}
{'device_id': 3, 'timestamp': '2024-01-02 05:18:39', 'battery_level': 68, 'firmware_version': '3.8.3', 'connectivity_status': 'Poor Connection', 'error_codes': ''}
{'device_id': 4, 'timestamp': '2023-12-08 20:35:51', 'battery_level': 79, 'firmware_version': '1.6.1', 'connectivity_status': 'Disconnected', 'error_codes': ''}


## Send Data to Eventhub

In [None]:
# Azure Event Hubs client library; provides the `azure.eventhub` module imported below.
!pip install azure.eventhub

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType

# Obtain (or reuse) the Spark session that streams the generated JSON files.
spark = (
    SparkSession.builder
    .appName("JSONtoEventHub")
    .getOrCreate()
)

# Layout of one glucose reading record; every column is nullable.
glucose_schema = StructType([
    StructField("patient_id", IntegerType(), nullable=True),
    StructField("device_id", IntegerType(), nullable=True),
    StructField("glucose_level", FloatType(), nullable=True),
    StructField("timestamp", TimestampType(), nullable=True),
    StructField("location", StringType(), nullable=True),
])

# Layout of one device operational record; every column is nullable.
device_schema = StructType([
    StructField("device_id", IntegerType(), nullable=True),
    StructField("timestamp", TimestampType(), nullable=True),
    StructField("battery_level", FloatType(), nullable=True),
    StructField("firmware_version", StringType(), nullable=True),
    StructField("connectivity_status", StringType(), nullable=True),
    StructField("error_codes", StringType(), nullable=True),
])

# File-based streaming sources: new JSON files dropped into each directory are
# picked up as they appear. The multiline option is set because each file is
# one indented JSON document rather than one record per line.
json_df_glucose = (
    spark.readStream
    .option('multiline', True)
    .schema(glucose_schema)
    .json("./glucose_readings_dir/")
)
json_df_device = (
    spark.readStream
    .option('multiline', True)
    .schema(device_schema)
    .json("./device_readings_dir/")
)

In [None]:
from azure.eventhub import EventHubProducerClient, EventData

# YOUR_CONNECTION_STRING is a placeholder that must be defined before this cell runs.
connection_string = YOUR_CONNECTION_STRING

def send_to_eventhub_batch(batch_df, batch_id):
    """Forward every row of one streaming micro-batch to the "glucose_monitoring" Event Hub.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Micro-batch identifier supplied by ``foreachBatch`` (unused).
    """
    # Collect once so the micro-batch is evaluated a single time; a separate
    # emptiness check on the RDD would trigger a second evaluation.
    rows = batch_df.collect()
    if not rows:
        return

    producer = EventHubProducerClient.from_connection_string(
        conn_str=connection_string,
        eventhub_name="glucose_monitoring"
    )
    with producer:
        event_data_batch = producer.create_batch()
        for row in rows:
            # NOTE(review): the payload is the Python repr of the row dict, not
            # JSON — confirm that downstream consumers expect this format.
            event_data = EventData(str(row.asDict()))
            try:
                # Add the event to the batch
                event_data_batch.add(event_data)
            except ValueError:
                # The batch is full, send it and start a new batch
                producer.send_batch(event_data_batch)
                event_data_batch = producer.create_batch()
                event_data_batch.add(event_data)  # Add the event to the new batch
        # Send any remaining events in the batch
        if len(event_data_batch) > 0:
            producer.send_batch(event_data_batch)


# Streaming query over the glucose JSON file stream; every micro-batch is
# handed to send_to_eventhub_batch.
# Keep the StreamingQuery handle in `query`: awaitTermination() returns None,
# so chaining it into the assignment would discard the handle.
query = (
    json_df_glucose.writeStream
    .foreachBatch(send_to_eventhub_batch)
    .start()
)

# Block this cell until the query stops (interrupt the cell to move on).
query.awaitTermination()

In [None]:
from azure.eventhub import EventHubProducerClient, EventData

# YOUR_CONNECTION_STRING is a placeholder that must be defined before this cell runs.
connection_string = YOUR_CONNECTION_STRING

def send_to_eventhub_batch(batch_df, batch_id):
    """Forward every row of one streaming micro-batch to the "device_readings" Event Hub.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Micro-batch identifier supplied by ``foreachBatch`` (unused).
    """
    # Collect once so the micro-batch is evaluated a single time; a separate
    # emptiness check on the RDD would trigger a second evaluation.
    rows = batch_df.collect()
    if not rows:
        return

    producer = EventHubProducerClient.from_connection_string(
        conn_str=connection_string,
        eventhub_name="device_readings"
    )
    with producer:
        event_data_batch = producer.create_batch()
        for row in rows:
            # NOTE(review): the payload is the Python repr of the row dict, not
            # JSON — confirm that downstream consumers expect this format.
            event_data = EventData(str(row.asDict()))
            try:
                # Add the event to the batch
                event_data_batch.add(event_data)
            except ValueError:
                # The batch is full, send it and start a new batch
                producer.send_batch(event_data_batch)
                event_data_batch = producer.create_batch()
                event_data_batch.add(event_data)  # Add the event to the new batch
        # Send any remaining events in the batch
        if len(event_data_batch) > 0:
            producer.send_batch(event_data_batch)


# Streaming query over the device JSON file stream; every micro-batch is
# handed to send_to_eventhub_batch.
# Keep the StreamingQuery handle in `query`: awaitTermination() returns None,
# so chaining it into the assignment would discard the handle.
query = (
    json_df_device.writeStream
    .foreachBatch(send_to_eventhub_batch)
    .start()
)

# Block this cell until the query stops (interrupt the cell to move on).
query.awaitTermination()

## (Optional) Filter Data for Alerts
1. Condition: (glucose reading > 115) OR (rolling-window average of the last 10 readings > 105).
2. Send the filtered data to Event Hub.
3. An Azure Function picks the data up from Event Hub and sends an alert to a Telegram channel.


How Code works:
1. Read the streaming data from JSON files.
2. Use window functions to calculate the rolling average. Since the files are in chronological order, make sure the files are ingested by the streaming job in the same order.
3. Apply the filter based on the rolling average and the individual glucose level readings.


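The intended design calculates the rolling average over the last 10 glucose readings for each patient, and it assumes that each file is treated as a separate micro-batch in chronological order. If a patient has fewer than 10 readings in the latest micro-batch, the average is calculated over however many readings are available. Note: the code cell below currently applies only the individual-reading threshold (`glucose_level > 115`); the rolling-average condition is not implemented in it.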



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType, ArrayType
from azure.eventhub import EventHubProducerClient, EventData
# YOUR_CONNECTION_STRING is a placeholder that must be defined before this cell runs.
connection_string = YOUR_CONNECTION_STRING


# Initialize Spark Session for Streaming
spark = SparkSession.builder \
    .appName("GlucoseStreamingAnalysis") \
    .getOrCreate()

# Define schema for individual glucose reading
glucose_reading_schema = StructType([
    StructField("patient_id", IntegerType(), True),
    StructField("device_id", IntegerType(), True),
    StructField("glucose_level", FloatType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("location", StringType(), True)
])

# Read streaming data from JSON files.
# Each generated file is a top-level JSON array of reading objects, so with
# multiLine enabled every array element becomes one row and the per-reading
# schema applies directly. Wrapping it in a "readings" array field would match
# nothing in those files and yield only nulls.
json_df_glucose = spark.readStream \
    .option("multiLine", True) \
    .schema(glucose_reading_schema) \
    .json("glucose_readings_dir/")

# Filter records where glucose level is greater than 115.
# Only this per-reading threshold is applied here; no rolling average is computed.
filtered_glucose_df = json_df_glucose.filter(col("glucose_level") > 115)


def send_to_eventhub_batch(batch_df, batch_id):
    """Forward every row of one streaming micro-batch to the "alerts" Event Hub.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Micro-batch identifier supplied by ``foreachBatch`` (unused).
    """
    # Collect once so the micro-batch is evaluated a single time; a separate
    # emptiness check on the RDD would trigger a second evaluation.
    rows = batch_df.collect()
    if not rows:
        return

    producer = EventHubProducerClient.from_connection_string(
        conn_str=connection_string,
        eventhub_name="alerts"
    )
    with producer:
        event_data_batch = producer.create_batch()
        for row in rows:
            # NOTE(review): the payload is the Python repr of the row dict, not
            # JSON — confirm that downstream consumers expect this format.
            event_data = EventData(str(row.asDict()))
            try:
                # Add the event to the batch
                event_data_batch.add(event_data)
            except ValueError:
                # The batch is full, send it and start a new batch
                producer.send_batch(event_data_batch)
                event_data_batch = producer.create_batch()
                event_data_batch.add(event_data)  # Add the event to the new batch
        # Send any remaining events in the batch
        if len(event_data_batch) > 0:
            producer.send_batch(event_data_batch)

# Streaming query over the filtered glucose stream; every micro-batch is
# handed to send_to_eventhub_batch.
# Keep the StreamingQuery handle in `query`: awaitTermination() returns None,
# so chaining it into the assignment would discard the handle.
query = (
    filtered_glucose_df.writeStream
    .foreachBatch(send_to_eventhub_batch)
    .start()
)

# Block this cell until the query stops (interrupt the cell to move on).
query.awaitTermination()