# <span style="font-weight:bold; font-size: 3rem; color:#1EB182;"><img src="../../images/icon102.png" width="38px" /> **Hopsworks Feature Store** </span><span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 02: Feature Pipeline</span>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/advanced_tutorials/air_quality/2_feature_pipeline.ipynb)


## 🗒️ This notebook is divided into the following sections:
1. Parse Data
2. Feature Group Insertion

### <span style='color:#ff5f27'> 📝 Imports</span>

In [1]:
import datetime
import time
import requests
import pandas as pd
import json

from functions import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the continent -> {city name: coordinates} mapping that drives the loops below.
# JSON is read explicitly as UTF-8 so the parsed city names do not depend on the
# platform's default text encoding.
with open('target_cities.json', encoding='utf-8') as json_file:
    target_cities = json.load(json_file)

In [3]:
# Request window for the update: it opens today and closes one week later.
today = datetime.date.today()
end_day = today + datetime.timedelta(weeks=1)

# Show the closing date both as a date object and as its string form.
end_day, str(end_day)

(datetime.date(2023, 5, 2), '2023-05-02')

In [4]:
# Display today's date both as a `datetime.date` object and as its string form.
today, str(today)

(datetime.date(2023, 4, 25), '2023-04-25')

---

## <span style='color:#ff5f27'> 🌫 Filling gaps in Air Quality data (PM2.5)</span>

### The first time, we determine the 'last update date' from our backfill data
#### On subsequent runs, we will use the `feature view` method from the Hopsworks Feature Store

### <span style='color:#ff5f27'>  🧙🏼‍♂️ Parsing PM2.5 data</span>

In [6]:
# Fetch PM2.5 data for every target city over the window today..end_day and
# combine the per-city results into a single frame, `df_aq_raw`.
start_of_cell = time.time()

# NOTE(review): the saved output of this cell is `KeyError: 'hourly'`; presumably the
# Open-Meteo response carried no 'hourly' payload for the requested range — confirm
# inside `get_aqi_data_from_open_meteo` before relying on this cell.

# Collect one frame per city and concatenate once after the loop, instead of
# re-copying the accumulated frame on every iteration.
aq_frames = []

for continent in target_cities:
    for city_name, coords in target_cities[continent].items():
        df_ = get_aqi_data_from_open_meteo(city_name=city_name,
                                           coordinates=coords,
                                           start_date=str(today),
                                           end_date=str(end_day))
        aq_frames.append(df_)

# With no cities at all the result is still an (empty) DataFrame, as before.
df_aq_raw = pd.concat(aq_frames, ignore_index=True) if aq_frames else pd.DataFrame()

end_of_cell = time.time()
print("-" * 64)
# The request window ends at `end_day`, so that is the date reported here.
print(f"Parsed new PM2.5 data for ALL locations up to {str(end_day)}.")
print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

KeyError: 'hourly'

In [None]:
# Preview the first rows of the freshly fetched PM2.5 data instead of dumping the whole frame.
df_aq_raw.head()

In [None]:
# Work on an independent copy so that later column assignments on
# `df_aq_update` do not also modify `df_aq_raw`.
df_aq_update = df_aq_raw.copy()
df_aq_update.head()

### <span style="color:#ff5f27;">🛠 Feature Engineering PM2.5</span>

In [None]:
# Parse the `date` column into pandas datetimes. `assign` returns a new frame,
# so no other variable that references the same DataFrame object is modified.
df_aq_update = df_aq_update.assign(date=pd.to_datetime(df_aq_update['date']))

In [None]:
# The dedicated feature-engineering step is switched off for now:
# df_aq_update = feature_engineer_aq(df_aq_update)

# Discard every row that has a missing value in any column.
df_aq_update = df_aq_update.dropna(axis=0, how="any")

In [None]:
# Total number of missing values left across all columns of `df_aq_update`.
df_aq_update.isna().sum().sum()

In [None]:
# (rows, columns) of the air-quality update frame.
df_aq_update.shape

In [None]:
# Column labels of the air-quality update frame.
df_aq_update.columns

---

## <span style='color:#ff5f27'> 🌦 Filling gaps in Weather data</span>

In [None]:
# Recompute the request window for the weather data: today through one week ahead.
today = datetime.date.today()
end_day = today + datetime.timedelta(weeks=1)

# Show the closing date both as a date object and as its string form.
end_day, str(end_day)

### <span style='color:#ff5f27'>  🧙🏼‍♂️ Parsing Weather data</span>

In [None]:
# Fetch forecast weather data for every target city over the window
# today..end_day and combine the per-city results into `df_weather_update`.
start_of_cell = time.time()

# Collect one frame per city and concatenate once after the loop, instead of
# re-copying the accumulated frame on every iteration.
weather_frames = []

for continent in target_cities:
    for city_name, coords in target_cities[continent].items():
        df_ = get_weather_data_from_open_meteo(city_name=city_name,
                                               coordinates=coords,
                                               start_date=str(today),
                                               end_date=str(end_day),
                                               forecast=True)
        weather_frames.append(df_)

# With no cities at all the result is still an (empty) DataFrame, as before.
df_weather_update = (
    pd.concat(weather_frames, ignore_index=True) if weather_frames else pd.DataFrame()
)

end_of_cell = time.time()
print("-" * 64)
# The request window ends at `end_day`, so that is the date reported here.
print(f"Parsed new weather data for ALL cities up to {str(end_day)}.")
print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

In [None]:
# Preview the first rows of the fetched weather data instead of dumping the whole frame.
df_weather_update.head()

In [None]:
# For both frames: parse the `date` column into pandas datetimes, then derive a
# `unix_time` column from it via the `convert_date_to_unix` helper.
for frame in (df_aq_update, df_weather_update):
    frame["date"] = pd.to_datetime(frame["date"])
    frame["unix_time"] = frame["date"].apply(convert_date_to_unix)

In [None]:
# Turn the `date` columns back into plain strings before the frames are inserted.
df_aq_update["date"] = df_aq_update["date"].astype(str)
df_weather_update["date"] = df_weather_update["date"].astype(str)

---

## <span style="color:#ff5f27;">⬆️ Uploading new data to the Feature Store</span>

### <span style="color:#ff5f27;"> 🔮 Connecting to Hopsworks Feature Store </span>

In [None]:
import hopsworks

# Log in to Hopsworks and obtain a handle to the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

# Look up (or lazily create) version 1 of the two feature groups that receive the new rows.
# NOTE(review): presumably both groups already exist from an earlier backfill run — verify.
air_quality_fg = fs.get_or_create_feature_group(name='air_quality', version=1)
weather_fg = fs.get_or_create_feature_group(name='weather', version=1)

In [None]:
# Send the prepared air-quality rows to the `air_quality` feature group.
# NOTE(review): `wait_for_job: False` presumably makes the call return without
# waiting for the ingestion job to finish — confirm against the Hopsworks docs.
air_quality_fg.insert(df_aq_update, write_options={"wait_for_job": False})

In [None]:
# Send the prepared weather rows to the `weather` feature group.
# NOTE(review): `wait_for_job: False` presumably makes the call return without
# waiting for the ingestion job to finish — confirm against the Hopsworks docs.
weather_fg.insert(df_weather_update, write_options={"wait_for_job": False})