## Batch processing of Air Quality Data for Anomaly detection

In this notebook, you will learn how to use Python and popular data stack packages such as `river` and `sklearn`, together with `json`, `pandas` and `numpy`, to clean air quality sensor data and detect anomalies in it.

### About the Data

We obtained a sample of the data from the [PurpleAir API](https://community.purpleair.com/t/making-api-calls-with-the-purpleair-api/180).

In this notebook you can find code to clean the data, impute missing values, and detect anomalies using River.

In [1]:
!pip install bytewax==0.19 python-dotenv scipy==1.13.0 kafka-python==2.0.2
!pip install pandas==2.0.3 river



In [2]:
!pip install scikit-learn==1.4.2



In [10]:
import requests
import json
from datetime import datetime, timezone
from river import anomaly
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np

def impute_data_with_knn(deserialized_data, n_neighbors=5):
    """
    Fill missing numeric values in sensor records using KNN imputation.

    The records are loaded into a DataFrame, object columns that hold numeric
    values are converted to a numeric dtype, the numeric columns are imputed
    with `KNNImputer`, and the result is converted back into a list of
    dictionaries.

    Args:
    deserialized_data: A list of dictionaries containing sensor data.
    n_neighbors: Number of neighbouring rows the imputer averages over
        (default 5).

    Returns:
    A list of dictionaries with imputed data. Imputed columns hold the float
    values produced by the imputer. Numeric columns that contain no observed
    value at all are left untouched, because there is nothing to impute them
    from. An empty input yields an empty list.
    """
    # Nothing to impute, and the imputer rejects an empty array.
    if not deserialized_data:
        return []

    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(deserialized_data)

    # Ensure all numeric columns are in appropriate data types
    for column in df.columns:
        if df[column].dtype == 'object':
            try:
                df[column] = pd.to_numeric(df[column])
            except (ValueError, TypeError):
                # Strings raise ValueError, other non-numeric objects raise
                # TypeError; either way the column stays as object.
                continue

    numeric_columns = df.select_dtypes(include=[np.number]).columns

    # KNNImputer drops columns that are entirely NaN from its output, which
    # would make the result narrower than the column selection and break the
    # assignment below. Only hand it columns with at least one observed value.
    imputable_columns = [
        column for column in numeric_columns if df[column].notna().any()
    ]

    if imputable_columns:
        imputer = KNNImputer(n_neighbors=n_neighbors, weights='uniform')
        df[imputable_columns] = imputer.fit_transform(df[imputable_columns])

    # Convert DataFrame back to a list of dictionaries
    return df.to_dict(orient='records')

def serialize(data):
    """
    Encode each row of a fields/data payload as UTF-8 JSON bytes.

    Args:
    data: A dictionary with a 'fields' list of column names and a 'data'
        list of rows, where each row holds values in the same order as
        'fields'.

    Returns:
    A list of bytes objects, one JSON-encoded dictionary per row. Rows that
    have fewer values than there are fields are reported and skipped.
    """
    field_names = data['fields']
    encoded_rows = []

    for entry in data['data']:
        record = {}
        try:
            # Pair every field name with the value at the same position
            for position, field_name in enumerate(field_names):
                record[field_name] = entry[position]
        except IndexError:
            # The row ran out of values before all fields were filled
            print("IndexError with entry:", entry)
            continue

        # Dump the record to a JSON string and encode it to bytes
        encoded_rows.append(json.dumps(record).encode('utf-8'))

    return encoded_rows


def deserialize(byte_objects_list):
    """
    Decode JSON byte strings into sensor dictionaries.

    For each non-empty item, the bytes are decoded as UTF-8 and parsed as
    JSON. When present and not None, 'pm2.5_cf_1' is converted to a float and
    'date_created' is converted from a Unix epoch timestamp to a
    timezone-aware UTC datetime.

    Args:
    byte_objects_list: A list of bytes objects; empty items are skipped.

    Returns:
    A list of dictionaries, one per non-empty input item.
    """
    def _epoch_to_utc(timestamp):
        return datetime.fromtimestamp(timestamp, tz=timezone.utc)

    # Applied in this order to every decoded record
    converters = (
        ('pm2.5_cf_1', float),
        ('date_created', _epoch_to_utc),
    )

    def _decode(raw_bytes):
        record = json.loads(raw_bytes.decode('utf-8'))
        for key, convert in converters:
            # Only convert fields that exist and actually carry a value
            if key in record and record[key] is not None:
                record[key] = convert(record[key])
        return record

    return [_decode(raw_bytes) for raw_bytes in byte_objects_list if raw_bytes]

class AnomalyDetector:
    """
    Streaming anomaly scorer for the 'pm1.0_cf_1' reading, backed by River's
    `HalfSpaceTrees`.

    Args:
    n_trees, height, window_size, seed: Passed through to `HalfSpaceTrees`.
    limits: Optional `(low, high)` range of the scored value. Defaults to
        `(0.0, 1200)`; pass a range that covers your data.
    """

    def __init__(self, n_trees=10, height=8, window_size=72, seed=11, limits=None):
        if limits is None:
            limits = (0.0, 1200)
        self.detector = anomaly.HalfSpaceTrees(
            n_trees=n_trees,
            height=height,
            window_size=window_size,
            limits={'x': tuple(limits)},  # ensure these limits make sense for your data
            seed=seed
        )

    def update(self, data):
        """
        Score one entry and then let the detector learn from it.

        Args:
        data: A dictionary that may contain a 'pm1.0_cf_1' reading.

        Returns:
        The same dictionary. When the reading is usable, a 'score' key is
        added (the entry is scored before the detector learns from it). When
        the reading is absent, None, or cannot be converted to a float, a
        message is printed and the dictionary is returned without a 'score'.
        """
        # .get() treats an absent key like a None value instead of raising KeyError
        raw_value = data.get('pm1.0_cf_1')
        if raw_value is None:
            print(f"Skipping entry, missing data for pm1.0_cf_1: {data}")
            return data

        try:
            value = float(raw_value)
        except (TypeError, ValueError):
            # float() raises TypeError for non-floatable types such as lists
            print(f"Skipping entry, invalid data for pm1.0_cf_1: {raw_value}")
            return data

        features = {'x': value}
        score = self.detector.score_one(features)
        self.detector.learn_one(features)
        data['score'] = score
        return data





In [5]:

# Load the sample payload from data.json into a dictionary.
# The context manager closes the file handle as soon as parsing is done.
with open('data.json', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Number of entries under the 'data' key of the loaded JSON
len(data['data'])

25552

In [6]:
# Begin data processing
# Serialize the data to bytes: one UTF-8 JSON object per entry, keyed by field name
serialized_entries = serialize(data)
# Deserialize the data back into dictionaries, converting 'pm2.5_cf_1' to float
# and 'date_created' from Unix epoch time to a UTC datetime
deserialized_data = deserialize(serialized_entries)


In [11]:
# Perform KNN imputation on deserialized data
# (fills missing values in the numeric columns and returns a list of dictionaries)
imputed_data = impute_data_with_knn(deserialized_data)


After applying the model, we can identify outliers: entries whose anomaly score is above 0.7 are printed below.

In [18]:
# Entries scoring above this threshold are reported as anomalies
SCORE_THRESHOLD = 0.7

anomaly_detector = AnomalyDetector(n_trees=4, height=3, window_size=50, seed=11)

# Score each imputed entry in order and print the ones flagged as anomalous
for entry in imputed_data:
    updated_entry = anomaly_detector.update(entry)
    # update() returns entries it skipped without a 'score' key, so look the
    # score up defensively instead of indexing (which would raise KeyError).
    score = updated_entry.get('score')
    if score is not None and score > SCORE_THRESHOLD:
        print(updated_entry)


{'sensor_index': 1970.0, 'date_created': Timestamp('2017-07-11 18:58:30+0000', tz='UTC'), 'rssi': -76.0, 'uptime': 17419.0, 'latitude': 33.99827, 'longitude': -118.437546, 'humidity': 34.0, 'temperature': 86.0, 'pressure': 1013.58, 'pm1.0': 2759.8, 'pm2.5_alt': 0.0, 'pm10.0': 2759.8, 'pm1.0_cf_1': 4139.3, 'pm2.5_atm': 2759.8, 'pm2.5_cf_1': 4139.3, 'pm10.0_cf_1': 4139.3, 'score': 0.9333333333333333}
{'sensor_index': 13907.0, 'date_created': Timestamp('2018-08-01 20:40:19+0000', tz='UTC'), 'rssi': -79.0, 'uptime': 26054.0, 'latitude': 37.85105, 'longitude': -122.27175, 'humidity': 27.0, 'temperature': 92.0, 'pressure': 1013.48, 'pm1.0': 3330.9, 'pm2.5_alt': 0.0, 'pm10.0': 3330.9, 'pm1.0_cf_1': 4997.0, 'pm2.5_atm': 3330.9, 'pm2.5_cf_1': 4997.0, 'pm10.0_cf_1': 4997.0, 'score': 0.9333333333333333}
{'sensor_index': 20423.0, 'date_created': Timestamp('2018-11-30 14:17:23+0000', tz='UTC'), 'rssi': -48.0, 'uptime': 3630.0, 'latitude': 37.826344, 'longitude': -120.732544, 'humidity': 31.0, 'temp