In [None]:
import os, time

# Spark distribution and Hadoop build that the install cell downloads.
spark_release = 'spark-3.4.2'
hadoop_version = 'hadoop3'

# Wall-clock reference used later to report how long the Spark setup took.
start = time.time()

# Export everything the shell commands and findspark need in a single step.
os.environ.update({
    'SPARK_RELEASE': spark_release,
    'HADOOP_VERSION': hadoop_version,
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": f"/content/{spark_release}-bin-{hadoop_version}",
})

In [None]:
!pip install faker pysqlite3
!pip install mysql.connector
!pip install pyspark

## Spark

In [None]:
# Run below commands in google colab
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install Java8
!wget -q http://apache.osuosl.org/spark/${SPARK_RELEASE}/${SPARK_RELEASE}-bin-${HADOOP_VERSION}.tgz # download spark-3.3.X
!tar xf ${SPARK_RELEASE}-bin-${HADOOP_VERSION}.tgz # unzip it
!pip install -q findspark # install findspark

In [None]:
import multiprocessing
import multiprocessing
import pyspark
import socket
import uuid
import findspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql.streaming import DataStreamReader

import sqlite3
from faker import Faker
import random
import datetime
import json
# Shared Faker instance; the data generators below use it for timestamps and city names.
fake=Faker()

In [None]:
# findspark finds your Spark distribution and sets the necessary environment variables
findspark.init()

In [None]:
# Check the pyspark version
# NOTE(review): the recorded output (3.5.0) does not match spark_release
# ('spark-3.4.2') from the first cell — confirm which version is intended.
print(pyspark.__version__)

3.5.0


In [None]:
# Look up and display this machine's hostname.
# NOTE(review): no DStream or socket connection is created in this cell; if the
# hostname is later used for a hostname:port connection over a network,
# firewalls may block it.
hostname=socket.gethostname()

hostname

'95e7630eff48'

In [None]:
# Unique identifier for this run (uuid1); used as 'spark.app.name' in the SparkConf cell.
app_id=str(uuid.uuid1())

app_id

'88229fc8-a95d-11ee-8060-0242ac1c000c'

In [None]:
# Spark configuration for a small single-machine run.
conf = SparkConf()

conf.setAll([
     ('spark.app.name', app_id),
     ('spark.shuffle.useOldFetchProtocol', 'true'),
     ('spark.testing', 'true'), # Avoid minimum 450M executor/driver memory https://www.waitingforcode.com/apache-spark/troubleshooting-system-memory-must-be-at-least-error/read / https://programmerclick.com/article/72821685476/
     ('spark.driver.allowMultipleContexts','true'), # https://stackoverflow.com/a/41591258 This option is used only for Spark internal tests and is not to be used in production.
     # NOTE(review): the key here was the malformed 'spark. y'; restored as the executor
     # memory setting implied by the surrounding memory comments — confirm.
     ('spark.executor.memory', '100M'),
     # ('spark.driver.memory', '200M'),
     # ('spark.executor.instances',1), # This property is no longer used in Spark 2+
     # number of executors is determined as: floor(spark.cores.max / spark.executor.cores)
     ("spark.executor.cores",1), # cores per executor. https://stackoverflow.com/questions/39399205/spark-standalone-number-executors-cores-control/39400195#39400195
     ("spark.cores.max", 2), # the maximum amount of CPU cores to request for the application from across the cluster (not from each machine)
     ('spark.submit.deployMode', 'client'), # client, cluster
     ('spark.ui.showConsoleProgress', 'true'),
     ("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem") ])

<pyspark.conf.SparkConf at 0x7c0f18a20c10>

In [None]:
# Elapsed wall-clock time since `start` was recorded in the first cell.
end=time.time()


f'Spark setup time: {int(end-start)} seconds'

'Spark setup time: 122 seconds'

In [None]:
#!unzip mysql-connector-j-8.2.0.zip

In [None]:
#Set up directories for later usage
!mkdir glucose_readings_dir device_readings_dir alerts_dir

## Generate Data
1. Take patient data stored in MySQL (e.g. ID)
2. Generate Glucose Readings Data — use the timesynth library for better time-dependency of health data
3. Generate Device Readings Data — use Faker

### Device Data

1. Random Firmware Version Function: This function generates a random firmware version number. It creates a version string in the format major.minor.patch, where each part is a random integer (major is between 1 and 3, minor and patch are between 0 and 9).

2. Random Connectivity Status Function: This function randomly selects a connectivity status for a device from three options: 'Connected', 'Disconnected', or 'Poor Connection'.

3. Random Error Codes Function: This function randomly decides whether to assign an error code to a device. If an error code is to be assigned, it randomly selects one from 'Err1', 'Err2', 'Err3', or an empty string (representing no error).

4. Operational Data Generation:
* The process is repeated for the number of iterations specified.
* For each device in the device_ids list, the script generates one record of operational data. This data includes:
  * The device_id.
  * A timestamp generated using the Faker library to simulate a datetime within the last 30 days.
  * A battery_level, which is a random integer from 0 to 100%.
  * The firmware_version, generated by the random_firmware_version function.
  * The connectivity_status, generated by the random_connectivity_status function.
  * Any error_codes, generated by the random_error_codes function.

5. Writing to JSON File:

* The generated data for each iteration is added to the operational_data list.
* After each iteration, the data is written to a JSON file, named uniquely using a timestamp.
* A message is printed to indicate the successful generation and saving of the operational data records.
* The operational_data list is cleared at the end of each iteration to prepare for the next set of data generation.


In [None]:
import mysql.connector

# MySQL database credentials.
# DB_HOST, DB_PORT, DB_USERNAME, DB_PASSWORD and DB_NAME are placeholders that are
# not defined in the visible notebook; supply them before running this cell
# (preferably from environment variables rather than literals).
host= DB_HOST
port= DB_PORT
username= DB_USERNAME
password= DB_PASSWORD
database= DB_NAME


# Establish a connection to the MySQL database.
# The configured port is passed explicitly so non-default ports are honoured.
cnx = mysql.connector.connect(user=username, password=password,
                              host=host, port=port, database=database)

cursor = cnx.cursor()

# Sanity check: number of rows in the Patient table
sql_str='SELECT COUNT(*) FROM Patient;'
cursor.execute(sql_str)
rs=cursor.fetchall()
print(rs)

# Fetch existing device IDs (first column of each row)
cursor.execute("SELECT device_id FROM DeviceStaticInfo")
device_ids = [row[0] for row in cursor.fetchall()]
print(device_ids)

# Fetch existing patient IDs (first column of each row)
cursor.execute("SELECT patient_id FROM Patient")
patient_ids = [row[0] for row in cursor.fetchall()]
print(patient_ids)

[(20,)]
[1, 2, 5, 3, 4]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


### Glucose Data

1. Defining User Profiles: Five user profiles are created - 'diabetic', 'athlete', 'party-goer', 'low-glucose', and 'elderly'. These represent different types of individuals who might have varying glucose levels.

2. Mapping Patient IDs to User Profiles: Each patient ID from the provided patient_ids list is randomly assigned one of the user profiles. This establishes a variety of simulated patients.

3. TimeSynth Setup: TimeSynth, a library used for generating synthetic time series data, is set up here. It samples a regular time point and creates a sinusoidal signal with added Gaussian noise. This setup is used to simulate the natural fluctuations in glucose levels over time.

4. Glucose Reading Generation: For each profile, a function generate_glucose_reading is defined, which generates glucose readings based on the profile's characteristics:
  * Diabetic: Higher readings.
  * Athlete: Lower post-exercise readings.
  * Party-goer: Variable readings.
  * Low-glucose: Dangerously low readings.
  * Elderly: Steady/higher readings.

The readings are modified based on the sinusoidal signal and noise, simulating natural variations.

5. Generating Readings:

  * Iterations: The code runs for the number of iterations specified.
  * For each iteration and each patient, the corresponding profile is identified, and glucose readings are generated based on that profile.
  * Each reading includes the patient's ID, a randomly chosen device ID from device_ids, the glucose level (rounded to 2 decimal places), a timestamp (generated using the Faker library to simulate a datetime within the last 30 days), and a location (also generated by Faker).

6. Writing to JSON:
  * After generating readings for each patient in an iteration, the data is written to a JSON file.
  * The file is named with a timestamp to ensure uniqueness.
  * After writing the data, the readings list is reset for the next iteration.

In [None]:
def generate_device_readings(iterations, device_ids, records_per_device=1,
                             output_dir='./device_readings_dir'):
    """Generate synthetic device operational records and write them as JSON files.

    One JSON file is written per iteration. Each file holds
    ``records_per_device`` records for every id in ``device_ids``.

    Args:
        iterations: Number of JSON files to produce.
        device_ids: Iterable of device identifiers to generate records for.
        records_per_device: Records generated per device in each file (default 1).
        output_dir: Directory that receives the files; created if missing.

    Returns:
        None. Files are written to ``output_dir`` and a status line is printed
        for each one.
    """
    import os  # local import keeps this cell independent of earlier import cells

    def random_firmware_version():
        """Return a random 'major.minor.patch' version string."""
        major = random.randint(1, 3)
        minor = random.randint(0, 9)
        patch = random.randint(0, 9)
        return f"{major}.{minor}.{patch}"

    def random_connectivity_status():
        """Return one of the three connectivity states."""
        return random.choice(['Connected', 'Disconnected', 'Poor Connection'])

    def random_error_codes():
        """Return an error code, or '' when no error is assigned."""
        if random.choice([True, False]):
            return random.choice(['Err1', 'Err2', 'Err3', ''])
        return ''

    os.makedirs(output_dir, exist_ok=True)

    for _ in range(iterations):
        # Build a fresh list for every iteration so each file only holds its own records
        operational_data = [
            {
                'device_id': device_id,
                'timestamp': fake.date_time_between(start_date="-30d", end_date="now").strftime('%Y-%m-%d %H:%M:%S'),
                'battery_level': random.randint(0, 100),  # Battery level from 0 to 100%
                'firmware_version': random_firmware_version(),
                'connectivity_status': random_connectivity_status(),
                'error_codes': random_error_codes()
            }
            for device_id in device_ids
            for _ in range(records_per_device)
        ]

        # The microsecond timestamp in the name keeps files from successive iterations distinct
        file_name = f'device_operational_data_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")}.json'
        with open(os.path.join(output_dir, file_name), 'w') as file:
            json.dump(operational_data, file, indent=4)
            print(f"Generated {len(operational_data)} device operational data records and saved them to '{file.name}'")

In [None]:
# Install TimeSynth (used by the glucose generator) from its GitHub repository.
!git clone https://github.com/TimeSynth/TimeSynth.git
%cd TimeSynth
!pip install .
# NOTE(review): this second install requests the package by name after the source
# install above — confirm both are needed.
!pip install timesynth
%cd ..
# Then restart the runtime

In [None]:
import timesynth as ts
import numpy as np
import random
from faker import Faker


def generate_glucose_readings(iterations, patient_ids, device_ids,
                              output_dir='./glucose_readings_dir'):
    """Generate synthetic glucose readings per patient and write them as JSON files.

    Each patient is randomly assigned one of five profiles for the duration of
    the call. One JSON file is written per iteration, holding the readings of
    every patient.

    Args:
        iterations: Number of JSON files to produce.
        patient_ids: Iterable of patient identifiers.
        device_ids: Non-empty sequence of device ids; one is picked at random
            for each reading.
        output_dir: Directory that receives the files; created if missing.

    Returns:
        None. Files are written to ``output_dir`` and a status line is printed
        for each one.
    """
    import os  # local import keeps this cell independent of earlier import cells

    # profile -> (scale, offset) applied to the raw TimeSynth sample
    profile_params = {
        'diabetic': (40, 130),     # Higher readings
        'athlete': (20, 90),       # Lower post-exercise readings
        'party-goer': (50, 130),   # Variable readings
        'low-glucose': (15, 55),   # Dangerously low readings
        'elderly': (35, 145),      # Steady/higher readings
    }
    default_params = (40, 110)     # Normal range for any other profile
    user_profiles = list(profile_params)

    # Map patient IDs to user profiles
    patient_profile_map = {patient_id: random.choice(user_profiles) for patient_id in patient_ids}

    # TimeSynth setup: a noisy sinusoid sampled at one regular time point.
    # NOTE(review): with num_points=1 only a single time point is sampled per
    # call — confirm this is the intended amount of temporal structure.
    time_sampler = ts.TimeSampler(stop_time=1)
    regular_time_samples = time_sampler.sample_regular_time(num_points=1)
    sinusoid = ts.signals.Sinusoidal(frequency=0.25)
    white_noise = ts.noise.GaussianNoise(std=0.3)
    timeseries = ts.TimeSeries(sinusoid, noise_generator=white_noise)

    def generate_glucose_reading(profile):
        """Return the scaled and shifted samples for the given profile."""
        samples, _, _ = timeseries.sample(regular_time_samples)
        scale, offset = profile_params.get(profile, default_params)
        return samples * scale + offset

    os.makedirs(output_dir, exist_ok=True)

    for _ in range(iterations):
        readings = []  # fresh list per iteration: each file holds only its own readings
        for patient_id in patient_ids:
            glucose_levels = generate_glucose_reading(patient_profile_map[patient_id])
            for glucose_level in glucose_levels:
                readings.append({
                    'patient_id': patient_id,
                    'device_id': random.choice(device_ids),
                    # plain float so the value is JSON-serializable regardless of the sample dtype
                    'glucose_level': round(float(glucose_level), 2),
                    'timestamp': fake.date_time_between(start_date="-30d", end_date="now").strftime('%Y-%m-%d %H:%M:%S'),
                    'location': fake.city()
                })

        # The microsecond timestamp in the name keeps files from successive iterations distinct
        file_name = f'glucose_readings_corrected_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")}.json'
        with open(os.path.join(output_dir, file_name), 'w') as file:
            json.dump(readings, file, indent=4)
            # Report what this file actually contains rather than the total iteration count
            print(f"Generated {len(readings)} glucose reading(s) and saved them to '{file.name}'")


In [None]:
# Generate Device Data: 1 reading per device — 1 JSON file will be generated with all readings
# by default we have 5 devices
generate_device_readings(1, device_ids)

# Generate Glucose Readings: 1 sample for each patient — 1 JSON file will be generated with all readings
# by default we have 20 patients
generate_glucose_readings(1, patient_ids, device_ids )

If you want to generate data in the background:

In [None]:
# Create one background process per generator so data generation does not block the notebook
device_readings_process = multiprocessing.Process(target=generate_device_readings, args=(1, device_ids))
glucose_readings_process = multiprocessing.Process(target=generate_glucose_readings, args=(1, patient_ids, device_ids))

# Start the processes
# NOTE(review): the visible cells never join() these processes — confirm that is intended.
device_readings_process.start()
glucose_readings_process.start()

Show Generated Data

In [None]:
import json
from pathlib import Path

# Show the most recently generated glucose readings file.
# File names embed a zero-padded timestamp, so the lexicographically largest
# name is the newest file; this avoids hard-coding a name from one session.
glucose_files = sorted(Path('glucose_readings_dir').glob('glucose_readings_corrected_*.json'))
if not glucose_files:
    raise FileNotFoundError("No glucose readings found in 'glucose_readings_dir'; run the generation cell first")

# `with` closes the file even if parsing fails
with open(glucose_files[-1]) as f:
    data = json.load(f)  # list of reading dictionaries

# Print one reading per line
for i in data:
    print(i)

{'patient_id': 1, 'device_id': 3, 'glucose_level': 107.58, 'timestamp': '2023-12-23 04:04:09', 'location': 'South John'}
{'patient_id': 2, 'device_id': 2, 'glucose_level': 119.24, 'timestamp': '2023-12-22 16:22:07', 'location': 'Reneeland'}
{'patient_id': 3, 'device_id': 2, 'glucose_level': 128.09, 'timestamp': '2023-12-08 11:26:49', 'location': 'Morgantown'}
{'patient_id': 4, 'device_id': 4, 'glucose_level': 147.17, 'timestamp': '2023-12-12 23:09:57', 'location': 'North Martinstad'}
{'patient_id': 5, 'device_id': 4, 'glucose_level': 135.7, 'timestamp': '2023-12-26 21:58:35', 'location': 'North Vincentmouth'}
{'patient_id': 6, 'device_id': 4, 'glucose_level': 153.98, 'timestamp': '2023-12-10 08:22:53', 'location': 'Christophershire'}
{'patient_id': 7, 'device_id': 1, 'glucose_level': 134.78, 'timestamp': '2023-12-19 12:37:24', 'location': 'West Sarah'}
{'patient_id': 8, 'device_id': 5, 'glucose_level': 126.55, 'timestamp': '2023-12-11 20:08:11', 'location': 'Shannonland'}
{'patient_id'

In [None]:
import json
from pathlib import Path

# Show the most recently generated device operational data file.
# File names embed a zero-padded timestamp, so the lexicographically largest
# name is the newest file; the relative path matches where the generator writes.
device_files = sorted(Path('device_readings_dir').glob('device_operational_data_*.json'))
if not device_files:
    raise FileNotFoundError("No device readings found in 'device_readings_dir'; run the generation cell first")

# `with` closes the file even if parsing fails
with open(device_files[-1]) as f:
    data = json.load(f)  # list of device record dictionaries

# Print one record per line
for i in data:
    print(i)

{'device_id': 1, 'timestamp': '2023-12-30 08:09:24', 'battery_level': 57, 'firmware_version': '1.3.7', 'connectivity_status': 'Disconnected', 'error_codes': ''}
{'device_id': 2, 'timestamp': '2023-12-17 18:18:04', 'battery_level': 55, 'firmware_version': '2.7.2', 'connectivity_status': 'Poor Connection', 'error_codes': ''}
{'device_id': 5, 'timestamp': '2023-12-13 02:24:01', 'battery_level': 15, 'firmware_version': '2.9.0', 'connectivity_status': 'Disconnected', 'error_codes': ''}
{'device_id': 3, 'timestamp': '2024-01-02 05:18:39', 'battery_level': 68, 'firmware_version': '3.8.3', 'connectivity_status': 'Poor Connection', 'error_codes': ''}
{'device_id': 4, 'timestamp': '2023-12-08 20:35:51', 'battery_level': 79, 'firmware_version': '1.6.1', 'connectivity_status': 'Disconnected', 'error_codes': ''}


## Send Data to Eventhub
1. Define Schema
2. Connect to Eventhub
3. Send data continuously by monitoring the folders that contain the generated data (JSON files)

**The Process**
1. Connection String: The variable connection_string is set to store the connection string for the Azure Event Hub. This connection string is required to authenticate and establish a connection with your Event Hub instance.

2. Function send_to_eventhub_batch:

* The function takes two arguments: batch_df, which is a Spark DataFrame representing a batch of data, and batch_id, which identifies the batch (though batch_id isn't explicitly used in the function).
* First, the function checks if the DataFrame batch_df is not empty.
* It then creates an EventHubProducerClient instance using the connection string and the name of the Event Hub (YOUR_EVENTHUB_NAME).
* Within a with statement (which ensures proper resource management), a batch of events is created using producer.create_batch().

* The function iterates over each row in the DataFrame. For each row, it:
  * Converts the row to a dictionary and then to a string.
  * Creates an EventData object from this string.
  * Tries to add the EventData object to the current event data batch.
  * If the batch is full (indicated by a ValueError), the batch is sent using producer.send_batch(), and a new batch is created to add the event.
* After all rows are processed, any remaining events in the current batch are sent to Event Hubs.

3. Streaming Query: The code reads the generated JSON files as a streaming DataFrame (json_df_glucose, created with spark.readStream).
* It defines a streaming query using the writeStream method. The query uses the foreachBatch method, which applies the send_to_eventhub_batch function to each batch of data in the stream.
* The start() method starts the streaming query, and awaitTermination() keeps the query running until it's either stopped manually or an error occurs.


In [None]:
# Azure Event Hubs client library (provides the azure.eventhub module imported below)
!pip install azure.eventhub

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType, TimestampType

# Initialize Spark Session
# NOTE(review): the SparkConf object built earlier in the notebook is not passed
# to this builder, so its settings are not applied here — confirm that is intended.
spark = SparkSession.builder \
    .appName("JSONtoEventHub") \
    .getOrCreate()

# Schema of the glucose reading JSON records.
# glucose_level is a double: single-precision floats come back with float32
# rounding artifacts (extra trailing digits) once rows are collected in Python.
glucose_schema = StructType([
    StructField("patient_id", IntegerType(), True),
    StructField("device_id", IntegerType(), True),
    StructField("glucose_level", DoubleType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("location", StringType(), True)
])

# Schema of the device operational JSON records
device_schema = StructType([
    StructField("device_id", IntegerType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("battery_level", FloatType(), True),
    StructField("firmware_version", StringType(), True),
    StructField("connectivity_status", StringType(), True),
    StructField("error_codes", StringType(), True)
])


# Read the JSON directories as streaming DataFrames.
# 'multiline' is required because each file is one indented JSON array spanning many lines.
json_df_glucose = spark.readStream.option('multiline', True).schema(glucose_schema).json("./glucose_readings_dir/")
json_df_device = spark.readStream.option('multiline', True).schema(device_schema).json("./device_readings_dir/")
#.option('multiline', True)

In [None]:
import json

from azure.eventhub import EventHubProducerClient, EventData

# Placeholder: supply the Event Hubs connection string before running this cell
connection_string = YOUR_CONNECTION_STRING

def send_to_eventhub_batch(batch_df, batch_id):
    """Send every row of one micro-batch to Azure Event Hubs.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Micro-batch identifier passed by foreachBatch; unused.

    Each row becomes one event whose body is the row serialized as JSON.
    Values JSON cannot represent natively (e.g. timestamps) are converted
    with ``str``.
    """
    # Collect once: the rows are needed on the driver anyway, and this avoids
    # running a separate job just to test for emptiness.
    rows = batch_df.collect()
    if not rows:
        return

    producer = EventHubProducerClient.from_connection_string(
        conn_str=connection_string,
        eventhub_name=YOUR_EVENTHUB_NAME
    )
    with producer:
        event_data_batch = producer.create_batch()
        for row in rows:
            # JSON instead of the Python dict repr, so consumers can parse the payload
            event_data = EventData(json.dumps(row.asDict(), default=str))
            try:
                # Add the event to the batch
                event_data_batch.add(event_data)
            except ValueError:
                # The batch is full, send it and start a new batch
                producer.send_batch(event_data_batch)
                event_data_batch = producer.create_batch()
                event_data_batch.add(event_data)  # Add the event to the new batch
        # Send any remaining events in the batch
        if len(event_data_batch) > 0:
            producer.send_batch(event_data_batch)


# Start the glucose streaming query and keep its handle.
# The cell does not block on awaitTermination(), so the cells after it can still run;
# stop the query with glucose_query.stop() when finished.
glucose_query = json_df_glucose.writeStream \
    .foreachBatch(send_to_eventhub_batch) \
    .start()

In [None]:
import json

from azure.eventhub import EventHubProducerClient, EventData

# Placeholder: supply the Event Hubs connection string before running this cell
connection_string = YOUR_CONNECTION_STRING

def send_to_eventhub_batch(batch_df, batch_id):
    """Send every row of one micro-batch to Azure Event Hubs.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Micro-batch identifier passed by foreachBatch; unused.

    Each row becomes one event whose body is the row serialized as JSON.
    Values JSON cannot represent natively (e.g. timestamps) are converted
    with ``str``.
    """
    # Collect once: the rows are needed on the driver anyway, and this avoids
    # running a separate job just to test for emptiness.
    rows = batch_df.collect()
    if not rows:
        return

    producer = EventHubProducerClient.from_connection_string(
        conn_str=connection_string,
        eventhub_name= YOUR_EVENTHUB_NAME
    )
    with producer:
        event_data_batch = producer.create_batch()
        for row in rows:
            # JSON instead of the Python dict repr, so consumers can parse the payload
            event_data = EventData(json.dumps(row.asDict(), default=str))
            try:
                # Add the event to the batch
                event_data_batch.add(event_data)
            except ValueError:
                # The batch is full, send it and start a new batch
                producer.send_batch(event_data_batch)
                event_data_batch = producer.create_batch()
                event_data_batch.add(event_data)  # Add the event to the new batch
        # Send any remaining events in the batch
        if len(event_data_batch) > 0:
            producer.send_batch(event_data_batch)


# Start the device streaming query and keep its handle
# (chaining .awaitTermination() onto .start() would store None instead).
device_query = json_df_device.writeStream \
    .foreachBatch(send_to_eventhub_batch) \
    .start()

# Block until the query is stopped manually or fails
device_query.awaitTermination()