# PROCESSING OF SPATIAL DATA USING KAFKA

This Jupyter Notebook can be used to play around with the interpolation script, which we will try out on the data streamed in the previous step. We will look at two kinds of streaming implementations:

**- Event Detection:** In this example we will check the rolling average of PM levels over 3 days for each location. If this average exceeds the threshold `pm_threshold` (set to 20 in the next cell), we will trigger an event/notification for the user. The threshold can be changed in that cell.

**- Geo-Plotting:** In this example you will learn how the streamed data can be plotted on maps and how useful it is to monitor IoT devices using real-time data.

In [1]:
# PM 2.5 alert threshold: event_notification() prints a warning when the 3-day rolling average exceeds this value
pm_threshold = 20

In [12]:
## Import Libraries

import warnings
warnings.filterwarnings('ignore')

import geopandas as gpd
import json, math
import numpy as np
from functools import partial
from geocube.api.core import make_geocube
from geocube.rasterize import rasterize_points_griddata
import geojson
import pandas as pd
import sys
import time
import socket
from confluent_kafka import Consumer, KafkaError, KafkaException, Producer

from ipyleaflet import Map, basemaps, WidgetControl, Marker, basemap_to_tiles, DrawControl, GeoJSON, MarkerCluster, AwesomeIcon
from ipywidgets import IntSlider, ColorPicker, jslink

## Define Icons for Map

# Green pin: used for senseboxes plotted as "active"
icon_active = AwesomeIcon(name='map-marker', marker_color='green', icon_color='green', spin=False)

# Red pin: used for senseboxes plotted as "inactive"
icon_inactive = AwesomeIcon(name='map-marker', marker_color='red', icon_color='red', spin=False)

## KAFKA CONSUMER DEFINITION

In [3]:
### START: AVOID MAKING CHANGES ###

# 'auto.offset.reset' decides where consumption starts when this consumer group
# has no committed offset for a partition yet:
#   - "smallest" (alias of "earliest"): start from the oldest message still available in the topic
#   - "largest"  (alias of "latest"):   start from the end, i.e. only receive messages produced from now on
# Once offsets have been committed for the group, consumption resumes from the committed position instead.

conf = {
    # Address of the Kafka broker
    'bootstrap.servers': 'kafka:9093',
    # Consumer group id, derived from the hostname of the machine running the notebook
    'group.id': socket.gethostname(),
    # Topic-level defaults applied to the subscribed topic
    'default.topic.config': {'auto.offset.reset': 'smallest'},
}

### END: AVOID MAKING CHANGES ###

In [4]:
## Set topic name as set in sendStream.py
topic = "pm25_stream"

## Kafka streamed data will be stored here: the consuming loop adds one row per received message
df = pd.DataFrame(columns=['lat','lon','value','day','boxId'])

Initialise the consumer and subscribe to the topic

In [5]:
# Create the consumer from the `conf` dictionary and subscribe it to the stream topic
consumer = Consumer(conf)
consumer.subscribe([topic])

# Loop flag checked by the `while running:` consuming loop
running = True

Define functions that will be used for real-time processing

In [6]:
def event_response(timestamp, pm, sensebox):
    '''
    Build the warning text for a sensebox whose 3-day PM 2.5 average exceeded the threshold.

    Parameters
    ----------
    timestamp : day/timestamp of the reading that triggered the event
    pm        : the 3-day average that exceeded the threshold
    sensebox  : identifier of the sensebox

    Returns
    -------
    str in the form "<sensebox> : <timestamp> : !!! WARNING !!! ... <pm>"
    '''

    # You can insert an email trigger script here, before the message is returned

    pieces = (
        str(sensebox),
        " : ",
        str(timestamp),
        " : !!! WARNING !!! PM 2.5 threshold exceeded with 3-Day Average of ",
        str(pm),
    )
    return "".join(pieces)
    
def event_notification(df, pm_threshold):

    '''
    Function to handle event notifications in real-time.
    Checks if the rolling average of PM Levels over the last 3 readings of the
    most recently received location exceeds a certain threshold.

    Parameters
    ----------
    df           : pandas DataFrame with columns 'lat', 'lon', 'value', 'day', 'boxId';
                   the last row is treated as the message that was just received
    pm_threshold : number; a rolling average strictly above it triggers a warning

    Returns
    -------
    None. Prints either the warning built by event_response() or a "levels are safe"
    message. Prints nothing when the frame is empty or the location does not yet have
    a complete window of 3 valid readings.

    Raises
    ------
    KeyError if a required column is missing (errors are no longer silently swallowed).
    '''

    ## Nothing has been streamed yet
    if df.empty:
        return

    latest = df.iloc[-1]

    ## Only the readings of the current location are needed, so the rolling mean is
    ## computed on that subset instead of re-grouping the whole frame for every message
    same_location = (df['lat'] == latest['lat']) & (df['lon'] == latest['lon'])

    ## Coerce to numbers so an object-typed column cannot break the rolling mean
    readings = pd.to_numeric(df.loc[same_location, 'value'], errors='coerce')

    ## Windows with fewer than 3 readings, or containing NaN, yield NaN and are dropped
    rolling_average = readings.rolling(3).mean().dropna()

    ## Not enough history for this location yet: expected at the start of a stream
    if rolling_average.empty:
        return

    ## Most recent complete window for this location
    pm_value = rolling_average.iloc[-1]
    timestamp = latest['day']
    sensebox = latest['boxId']

    ## Trigger check
    if pm_value > pm_threshold:

        ## Trigger notification
        response = event_response(timestamp, round(pm_value,2), sensebox)
        print(response)

    else:

        ## PM levels are safe
        print(str(sensebox)+" : "+str(timestamp)+" : Message Received PM 2.5 Levels are safe")

Trigger the Kafka consumer. The loop will break automatically if no message is received for more than **10 seconds**.

### EVENT DETECTION

In [7]:
## Consume messages from the subscribed topic, add each one to `df` as a new row and
## run the event check on the updated frame. The loop ends when poll() times out.
try:
    while running:

        # Wait up to 10 seconds for a message. If none arrives, poll() returns None and consuming stops
        msg = consumer.poll(timeout=10)

        if msg is None:
            break

        error = msg.error()

        if error:
            if error.code() == KafkaError._PARTITION_EOF:
                # End of partition event
                sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                                    (msg.topic(), msg.partition(), msg.offset()))
            elif error.code() == KafkaError.UNKNOWN_TOPIC_OR_PART:
                sys.stderr.write('Topic unknown, creating %s topic\n' %
                                    (topic))
            else:
                # Any other error is unexpected: surface it (the consumer is closed in `finally`)
                raise KafkaException(error)
        else:

            # The payload is read as a single-entry mapping: {value: [lat, lon, day, boxId]}
            # (named `payload` so the builtin `input` is not shadowed)
            payload = json.loads(msg.value())
            key, fields = next(iter(payload.items()))

            stream = {
                'lat': fields[0],
                'lon': fields[1],
                'day': fields[2],
                'value':  float(key),
                'boxId': fields[3]
            }

            # DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
            # The first row replaces the empty frame directly, which keeps the column dtypes
            # of the data instead of the object dtype of the empty placeholder.
            row = pd.DataFrame([stream], columns=df.columns)
            df = row if df.empty else pd.concat([df, row], ignore_index=True)

            ### EVENT NOTIFICATION SECTION: START ###

            event_notification(df, pm_threshold)

            ### EVENT NOTIFICATION SECTION: END ###

        ## Commit enables processing of a message only once, meaning drops any duplicates, however, you may lose messages that
        ## were not sent for some failure and will not be re-tried. Removing this command is possible but will require further
        ## changes to this script to perform manual de-duplication
        consumer.commit()

except KeyboardInterrupt:
    # Interrupting the kernel stops consumption without an error
    pass

finally:
    consumer.close()

    ## Note: Re-running this cell will not pull the data again as it is already pulled and the consumer is closed. You should
    ## re-run the 'sendStream.py' file to send the data again and then restart this notebook

5750220bed08f9680c6b4154 : 2022-01-17T09:00:00.000Z : Message Received PM 2.5 Levels are safe
5750220bed08f9680c6b4154 : 2022-01-18T10:00:00.000Z : Message Received PM 2.5 Levels are safe
5750220bed08f9680c6b4154 : 2022-01-19T11:00:00.000Z : Message Received PM 2.5 Levels are safe
5750220bed08f9680c6b4154 : 2022-01-20T12:00:00.000Z : Message Received PM 2.5 Levels are safe
5750220bed08f9680c6b4154 : 2022-01-21T13:00:00.000Z : Message Received PM 2.5 Levels are safe
591f578c51d34600116a8ea5 : 2022-01-20T12:00:00.000Z : Message Received PM 2.5 Levels are safe
591f578c51d34600116a8ea5 : 2022-01-21T13:00:00.000Z : Message Received PM 2.5 Levels are safe
59ad958fd67eb50011b85f6d : 2022-01-16T08:00:00.000Z : Message Received PM 2.5 Levels are safe
59ad958fd67eb50011b85f6d : 2022-01-17T09:00:00.000Z : Message Received PM 2.5 Levels are safe
59ad958fd67eb50011b85f6d : 2022-01-18T10:00:00.000Z : Message Received PM 2.5 Levels are safe
59ad958fd67eb50011b85f6d : 2022-01-19T11:00:00.000Z : Messag

In [8]:
## Preview the first rows collected from the stream
df.head()

Unnamed: 0,lat,lon,value,day,boxId
0,51.956168,7.651169,26.9782,2022-01-14T06:00:00.000Z,5750220bed08f9680c6b4154
1,51.956168,7.651169,24.967154,2022-01-15T07:00:00.000Z,5750220bed08f9680c6b4154
2,51.956168,7.651169,23.8172,2022-01-16T08:00:00.000Z,5750220bed08f9680c6b4154
3,51.956168,7.651169,7.263709,2022-01-17T09:00:00.000Z,5750220bed08f9680c6b4154
4,51.956168,7.651169,15.919987,2022-01-18T10:00:00.000Z,5750220bed08f9680c6b4154


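The dataframe above holds one row per streamed reading: the PM 2.5 `value` reported by a sensebox (`boxId`) at its location (`lat`, `lon`) on a given `day`. The next cell counts how many readings were received for each day.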

In [9]:
## Check how many values are present for each day (one count per distinct 'day' value)
df['day'].value_counts()

nan                         4
2022-01-14T06:00:00.000Z    3
2022-01-15T07:00:00.000Z    3
2022-01-16T08:00:00.000Z    3
2022-01-17T09:00:00.000Z    3
2022-01-18T10:00:00.000Z    3
2022-01-19T11:00:00.000Z    3
2022-01-20T12:00:00.000Z    3
2022-01-21T13:00:00.000Z    3
Name: day, dtype: int64

In [10]:
# Convert Pandas to GeoPandas: build point geometries from lon/lat, tag them as
# EPSG:4326 and drop the now-redundant coordinate columns
sensebox_points = gpd.points_from_xy(df.lon, df.lat)

gdf = (
    gpd.GeoDataFrame(df, geometry=sensebox_points)
    .set_crs(epsg=4326, allow_override=True)
    .drop(['lon','lat'], axis=1)
)

gdf.head()

Unnamed: 0,value,day,boxId,geometry
0,26.9782,2022-01-14T06:00:00.000Z,5750220bed08f9680c6b4154,POINT (7.65117 51.95617)
1,24.967154,2022-01-15T07:00:00.000Z,5750220bed08f9680c6b4154,POINT (7.65117 51.95617)
2,23.8172,2022-01-16T08:00:00.000Z,5750220bed08f9680c6b4154,POINT (7.65117 51.95617)
3,7.263709,2022-01-17T09:00:00.000Z,5750220bed08f9680c6b4154,POINT (7.65117 51.95617)
4,15.919987,2022-01-18T10:00:00.000Z,5750220bed08f9680c6b4154,POINT (7.65117 51.95617)


### SENSEBOX PLOTTING

In this section we will plot all the senseboxes, however, based on two conditions:

1. Plot senseboxes (In Green) that are live/returned values on the most recent date 
2. Plot senseboxes (In Red) that are down/did not return the values for most recent date

Using this real-time map visualization we can observe which sensors are actively streaming data and what are their locations

In [13]:
## Flag rows that carry an actual PM 2.5 reading: True when 'value' is present, False when it is NaN/missing.
## The vectorised notna() replaces the per-row math.isnan lambda and also tolerates None values.
gdf['valid'] = gdf['value'].notna()

In [14]:
## Get most recent date

# Keep only the rows flagged as valid, then take the largest 'day' among them
has_reading = gdf['valid'] == True
valid_boxes = gdf[has_reading]

recent_date = valid_boxes['day'].max()

In [15]:
## Senseboxes that reported on the most recent date: one row per boxId, keeping only
## the geometry plus a constant 'status' label.
## Built as a single chain so no filtered slice is mutated in place (the original
## pattern triggers pandas' SettingWithCopyWarning, hidden by the global warnings filter).
active_boxes = (
    gdf.loc[gdf['day'] == recent_date, ['boxId','geometry']]
    .drop_duplicates(subset=['boxId'])
    [['geometry']]
    .assign(status='active')
)
active_boxes

Unnamed: 0,geometry,status
7,POINT (7.65117 51.95617),active
15,POINT (7.64522 51.96422),active
24,POINT (7.63528 51.90300),active


In [16]:
## Locations with at least one row flagged as invalid: one row per distinct geometry,
## keeping only the geometry plus a constant 'status' label.
## Built as a single chain so no filtered slice is modified through column assignment.
## NOTE(review): this selects every location that ever sent an invalid reading, not only
## those missing on the most recent date - confirm this matches the intended definition.
inactive_boxes = (
    gdf[~gdf['valid']]
    .drop_duplicates(subset=['geometry'])
    [['geometry']]
    .assign(status='inactive')
)
inactive_boxes

Unnamed: 0,geometry,status
16,POINT (7.68419 51.92934),inactive
25,POINT (7.62677 51.94632),inactive
26,POINT (7.64146 51.95335),inactive
27,POINT (7.64143 51.96043),inactive


Setup marker icons to display the inactive and active sensors separately

In [17]:
## Create a cluster of active senseboxes as points
## (Marker expects (latitude, longitude), i.e. (y, x) of each point geometry)

active_markers = [
    Marker(location=(point.y, point.x), icon=icon_active, draggable=False)
    for point in active_boxes['geometry']
]

## Create a cluster of inactive senseboxes as points

inactive_markers = [
    Marker(location=(point.y, point.x), icon=icon_inactive, draggable=False)
    for point in inactive_boxes['geometry']
]


In [18]:
## Map centre as (latitude, longitude)
lat, lng = 51.9500023, 7.6240147
center = (lat, lng)

m = Map(center=center, zoom=11)

## One cluster layer per status, so active and inactive markers are grouped separately
active_boxes_cluster = MarkerCluster(markers=tuple(active_markers))
inactive_boxes_cluster = MarkerCluster(markers=tuple(inactive_markers))

for cluster_layer in (active_boxes_cluster, inactive_boxes_cluster):
    m.add_layer(cluster_layer)

display(m)

Map(center=[51.9500023, 7.6240147], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title'…

#### END
