
# Latest API data
This notebook consists of 3 parts:
1. Get new data from the API
2. Data preprocessing and feature engineering
3. Creating or backfilling the feature group

In [1]:
# Import standard Python libraries
import pandas as pd 
import hopsworks 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

# Import machine learning tools
from sklearn.preprocessing import StandardScaler  
from sklearn.cluster import KMeans  
from sklearn.metrics import silhouette_score  

# Import other useful libraries
import uuid  # Unique identifier generation
import requests  # For making API requests
import json  
import io 
import os
import base64 
from datetime import datetime, timedelta  # Date/time handling and manipulation
import pytz  # Timezone conversions and support

import openmeteo_requests
import requests_cache
from retry_requests import retry

# Environment variable management
from dotenv import load_dotenv
load_dotenv()

True

## 1. Get new data from the API

### Sensor Data Access

Here is the information provided by the company so that we can access their data.

- GET request to `data.sensade.com` 
- Authentication: `Basic Auth` (user: miknie20@student.aau.dk, password: GitHub Secret)

### Sensors

Two sensors installed:

- `0080E115003BEA91` (Hw2.0 Fw2.0) Installed towards building

- `0080E115003E3597` (Hw2.0 Fw2.0) Installed towards bike lane

In [2]:
# Capture the current time once and derive a one-day margin on either side of it
now = datetime.now()
today = now
one_day = timedelta(days=1)
yesterday, tomorrow = today - one_day, today + one_day
print(today)

2024-05-27 07:23:25.448569


In [3]:
# Render 'today', 'tomorrow', and 'yesterday' as "YYYY-MM-DD HH:MM:SS" strings
timestamp_format = '%Y-%m-%d %H:%M:%S'
formatted_today = today.strftime(timestamp_format)
formatted_tomorrow = tomorrow.strftime(timestamp_format)
formatted_yesterday = yesterday.strftime(timestamp_format)

In [4]:
# Sensor identifiers (device EUIs) and the endpoint they are queried from
dev_eui_building = "0080E115003BEA91"
dev_eui_bikelane = "0080E115003E3597"
url = "https://data.sensade.com"

# Build the HTTP Basic Auth header from credentials held in environment variables.
# NOTE(review): os.getenv returns None for an unset variable, so missing
# credentials are encoded as the literal text "None:None" rather than failing here.
credentials = f"{os.getenv('API_USERNAME')}:{os.getenv('API_PASSWORD')}"
basic_auth = base64.b64encode(credentials.encode())
headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Basic ' + basic_auth.decode("utf-8"),
}

In [5]:
# Function to ping the API and get data in a given time interval
def API_call(dev_eui, from_date, to_date, timeout=30):
    """Fetch the readings of one sensor for a time interval as a DataFrame.

    The request goes to the module-level ``url`` using the module-level
    ``headers``; the query itself is sent as a JSON body on a GET request.

    Parameters
    ----------
    dev_eui : str
        Identifier of the sensor to query.
    from_date : str
        Start of the interval, passed to the API as the "from" field.
    to_date : str
        End of the interval, passed to the API as the "to" field.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The CSV text of the response parsed with ``pandas.read_csv``.

    Raises
    ------
    RuntimeError
        If the API answers with any status code other than 200.
    """
    payload = json.dumps({
        "dev_eui": dev_eui,
        "from": from_date,
        "to": to_date,
    })

    API_response = requests.request("GET", url, headers=headers, data=payload, timeout=timeout)

    # Fail loudly with the status code instead of calling exit(), which gives no
    # diagnostic and, inside a notebook, asks the kernel to shut down.
    if API_response.status_code != 200:
        raise RuntimeError(
            f"Sensor API request for {dev_eui} failed with status "
            f"{API_response.status_code}: {API_response.text[:200]}"
        )

    return pd.read_csv(io.StringIO(API_response.text))

In [6]:
# Fetch the building sensor's readings from yesterday through tomorrow
df_building_from_api = API_call(
    dev_eui=dev_eui_building,
    from_date=formatted_yesterday,
    to_date=formatted_tomorrow,
)

In [7]:
# Fetch the bike-lane sensor's readings from yesterday through tomorrow
df_bikelane_from_api = API_call(
    dev_eui=dev_eui_bikelane,
    from_date=formatted_yesterday,
    to_date=formatted_tomorrow,
)

In [8]:
# Keep only the last row of each API result as a one-row DataFrame
# NOTE(review): this is the newest reading only if the API returns rows in ascending time order — confirm
df_building_newest = df_building_from_api.iloc[-1:]
df_bikelane_newest = df_bikelane_from_api.iloc[-1:]

In [9]:
# Display the single row kept for the building sensor
df_building_newest

Unnamed: 0,time,battery,temperature,x,y,z,0_radar,1_radar,2_radar,3_radar,4_radar,5_radar,6_radar,7_radar,package_type,f_cnt,dr,snr,rssi,hw_fw_version
139,2024-05-27 07:10:51.256000,3.24,18.625,360,241,97,5.0,5.0,5.0,5.0,5.0,6.0,6.0,4.0,PackageType.HEART_BEAT,6023,5,-5.5,-89,DataVersion.HW_2FW2_X_X


In [10]:
# Display the single row kept for the bike-lane sensor
df_bikelane_newest

Unnamed: 0,time,battery,temperature,x,y,z,0_radar,1_radar,2_radar,3_radar,4_radar,5_radar,6_radar,7_radar,package_type,f_cnt,dr,snr,rssi,hw_fw_version
139,2024-05-27 07:05:55.362000,3.1,16.75,-239,126,-409,44.0,68.0,64.0,41.0,35.0,17.0,11.0,11.0,PackageType.HEART_BEAT,5886,5,-5.0,-89,DataVersion.HW_2FW2_X_X


## 2. Preprocessing and feature engineering

We apply the same methods as in notebook 1: creating unique IDs, converting the time column, renaming the radar columns, changing data types to floats, and finally creating placeholder columns for `radar_cluster` and `mag_cluster`, as we haven't applied our models yet.

In [11]:
# Work on independent copies so the rows fetched from the API stay untouched
df_building, df_bikelane = (
    latest.copy(deep=True) for latest in (df_building_newest, df_bikelane_newest)
)

In [12]:
# Defining a function that tries to parse the datetime with microseconds first, and if it fails, parses it without microseconds
def parse_datetime(dt_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS' timestamp with or without a fractional-seconds part.

    The fractional form is tried first; if that raises ValueError the string
    is parsed again without the fraction. A string matching neither layout
    raises the ValueError of the second attempt.
    """
    with_fraction = '%Y-%m-%d %H:%M:%S.%f'
    without_fraction = '%Y-%m-%d %H:%M:%S'
    try:
        parsed = datetime.strptime(dt_str, with_fraction)
    except ValueError:
        parsed = datetime.strptime(dt_str, without_fraction)
    return parsed

In [13]:
# Replace the textual 'time' column of each frame with parsed datetime objects;
# assign() returns a new frame, so the previous objects are left unmodified
df_building = df_building.assign(time=df_building['time'].apply(parse_datetime))
df_bikelane = df_bikelane.assign(time=df_bikelane['time'].apply(parse_datetime))

In [14]:
# Make sure the 'time' column of both frames has a pandas datetime dtype
for frame in (df_bikelane, df_building):
    frame['time'] = pd.to_datetime(frame['time'])

In [15]:
# Add a 'time_hour' column holding each reading's timestamp truncated to the
# start of its hour, used as the key for merging with the hourly weather data.
# Flooring stays in the datetime dtype, so the former detour through a
# "YYYY-MM-DD HH" string and back to datetime is not needed.
df_bikelane['time_hour'] = df_bikelane['time'].dt.floor('h')
df_building['time_hour'] = df_building['time'].dt.floor('h')

### Weather data column

In [16]:
# Setup the Open-Meteo API client with cache and retry on error
# The cache is named '.cache' and created with expire_after = 3600
# (presumably seconds, i.e. one hour — confirm against requests_cache docs)
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
# Wrap the cached session so failed requests are retried (retries = 5, backoff_factor = 0.2)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
# Client used for all weather requests below
openmeteo = openmeteo_requests.Client(session = retry_session)

In [17]:

# Hourly variables to request; later processing reads them back by position,
# so this order must not change without updating that code as well
hourly_variables = [
    "temperature_2m",
    "relative_humidity_2m",
    "precipitation",
    "surface_pressure",
    "cloud_cover",
    "et0_fao_evapotranspiration",
    "wind_speed_10m",
    "soil_temperature_0_to_7cm",
    "soil_moisture_0_to_7cm",
]

# One-day forecast for the fixed coordinates below
weather_url = "https://api.open-meteo.com/v1/forecast"
weather_params = {
    "latitude": 57.01,
    "longitude": 9.99,
    "hourly": hourly_variables,
    "forecast_days": 1,
}
responses = openmeteo.weather_api(weather_url, params=weather_params)

In [18]:
# Only the first entry of the response list is processed
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Read the nine hourly series by position; the positions must follow the
# order in which the variables were requested
hourly = response.Hourly()
(
    hourly_temperature_2m,
    hourly_relative_humidity_2m,
    hourly_precipitation,
    hourly_surface_pressure,
    hourly_cloud_cover,
    hourly_et0_fao_evapotranspiration,
    hourly_wind_speed_10m,
    hourly_soil_temperature_0_to_7cm,
    hourly_soil_moisture_0_to_7cm,
) = (hourly.Variables(position).ValuesAsNumpy() for position in range(9))

# UTC timestamps from the response's start (inclusive) to its end (exclusive),
# spaced by the interval the response reports, followed by one column per variable
hourly_data = {
    "date": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    ),
    "temperature_2m": hourly_temperature_2m,
    "relative_humidity_2m": hourly_relative_humidity_2m,
    "precipitation": hourly_precipitation,
    "surface_pressure": hourly_surface_pressure,
    "cloud_cover": hourly_cloud_cover,
    "et0_fao_evapotranspiration": hourly_et0_fao_evapotranspiration,
    "wind_speed_10m": hourly_wind_speed_10m,
    "soil_temperature_0_to_7cm": hourly_soil_temperature_0_to_7cm,
    "soil_moisture_0_to_7cm": hourly_soil_moisture_0_to_7cm,
}

hourly_dataframe = pd.DataFrame(data=hourly_data)
hourly_dataframe.head()

Coordinates 57.01040267944336°N 9.992218017578125°E
Elevation 23.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


Unnamed: 0,date,temperature_2m,relative_humidity_2m,precipitation,surface_pressure,cloud_cover,et0_fao_evapotranspiration,wind_speed_10m,soil_temperature_0_to_7cm,soil_moisture_0_to_7cm
0,2024-05-27 00:00:00+00:00,16.154501,98.0,0.0,1012.247803,66.0,0.0,7.2,,
1,2024-05-27 01:00:00+00:00,15.9545,97.0,0.0,1012.644897,98.0,0.0,8.64,,
2,2024-05-27 02:00:00+00:00,15.8545,98.0,0.0,1012.145325,81.0,0.0,14.04,,
3,2024-05-27 03:00:00+00:00,15.804501,98.0,0.0,1011.945129,88.0,0.0,12.599999,,
4,2024-05-27 04:00:00+00:00,15.7045,97.0,0.0,1011.545349,98.0,0.015639,12.24,,


In [19]:
# Drop the timezone marker from the 'date' column (the clock values themselves
# are unchanged) and store the result as a timezone-naive datetime column
naive_dates = hourly_dataframe['date'].dt.tz_localize(None)
hourly_dataframe['date'] = pd.to_datetime(naive_dates)

### Merging weather data and sensor data

In [20]:
# Left-join the hourly weather onto each sensor frame: a reading's 'time_hour'
# is matched against the weather 'date'; readings without a match are kept
df_building = df_building.merge(hourly_dataframe, how='left', left_on='time_hour', right_on='date')
df_bikelane = df_bikelane.merge(hourly_dataframe, how='left', left_on='time_hour', right_on='date')

In [21]:
# The weather 'date' column was only needed as the join key
df_building = df_building.drop('date', axis=1)
df_bikelane = df_bikelane.drop('date', axis=1)

## Feature Engineering

In [22]:
# Create a unique identifier for each row in the datasets
def create_id(df, dataset_name):
    # Assign the sensor prefix based on the dataset name
    if dataset_name == 'df_building':
        df['psensor'] = "BUILDING"
    elif dataset_name == 'df_bikelane':
        df['psensor'] = "BIKELANE"
    else:
        raise ValueError("Unknown dataset name provided")

    # Create a new column 'id' with a unique identifier for each row
    df['id'] = df['time'].astype(str) + '_' + df['psensor']

    return df

In [23]:
# Add the 'psensor' and 'id' columns to both sensor frames
df_bikelane = create_id(df=df_bikelane, dataset_name='df_bikelane')
df_building = create_id(df=df_building, dataset_name='df_building')

In [24]:
# Rename '0_radar' ... '7_radar' to 'radar_0' ... 'radar_7' so the names start with 'radar'
radar_renames = {f'{channel}_radar': f'radar_{channel}' for channel in range(8)}
df_bikelane = df_bikelane.rename(columns=radar_renames)
df_building = df_building.rename(columns=radar_renames)

In [25]:
# Cast the magnetometer axes, radar channels and radio counters to float in both frames
float_columns = [
    'x', 'y', 'z',
    *(f'radar_{channel}' for channel in range(8)),
    'f_cnt', 'dr', 'rssi',
]
for frame in (df_bikelane, df_building):
    frame[float_columns] = frame[float_columns].astype(float)


In [26]:
# Add placeholder label columns; they hold the literal string "null", not a missing value
for frame in (df_bikelane, df_building):
    for label_column in ('radar_cluster', 'mag_cluster'):
        frame[label_column] = "null"

In [28]:
# Shift the sensor timestamps to Danish local wall-clock time.
# A fixed +2 h offset is only correct while summer time (CEST) is in effect;
# converting through the Europe/Copenhagen zone also gives +1 h in winter.
# NOTE(review): assumes the API timestamps are UTC and timezone-naive — confirm.
# Like the fixed offset before it, this must run only once per kernel session.
for frame in (df_bikelane, df_building):
    frame['time'] = (
        frame['time']
        .dt.tz_localize('UTC')
        .dt.tz_convert('Europe/Copenhagen')
        .dt.tz_localize(None)
    )

## Uploading latest data to Hopsworks

In [29]:
# Connecting to the Hopsworks project

project = hopsworks.login(project="annikaij")

# Handle to the project's feature store, used for the uploads below
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/549019
Connected. Call `.close()` to terminate connection gracefully.


In [30]:
# Fetch (or create, if missing) version 1 of the bike-lane feature group,
# keyed on 'id' with 'time' as the event time, then insert the prepared row
bikelane_fg = fs.get_or_create_feature_group(name="new_bikelane_fg",
                                  version=1,
                                  primary_key=["id"],
                                  event_time='time',
                                  description="New bike lane data",
                                  online_enabled=True,
                                 )
bikelane_fg.insert(df_bikelane)

Uploading Dataframe: 0.00% |          | Rows 0/1 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: new_bikelane_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/549019/jobs/named/new_bikelane_fg_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x718517766d40>, None)

In [31]:
# Fetch (or create, if missing) version 1 of the building feature group,
# keyed on 'id' with 'time' as the event time, then insert the prepared row
building_fg = fs.get_or_create_feature_group(name="new_building_fg",
                                    version=1,
                                    primary_key=["id"],
                                    event_time='time',
                                    description="New building data",
                                    online_enabled=True
                                     )
building_fg.insert(df_building)

Uploading Dataframe: 0.00% |          | Rows 0/1 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: new_building_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/549019/jobs/named/new_building_fg_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x7185177526e0>, None)

## **Next up:** 3: Feature view creation
Go to the 3_featureview_creation.ipynb notebook