In this notebook we backfill both the flight data and the weather data.
Arlanda latitude and longitude: 59.6519, 17.92

In [1]:
import sys
from pathlib import Path
import os

# The project root is taken to be the parent of the current working directory.
root_dir = str(Path.cwd().parent)
print("Local environment")

# Put the project root on sys.path so that project-local modules
# (e.g. `config`, imported below) can be imported from this notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Build the settings object. `config` can only be imported once the root is on sys.path.
import config

env_file = f"{root_dir}/.env"
if os.path.exists(env_file):
    settings = config.HopsworksSettings(_env_file=env_file)
else:
    # Always bind `settings`: later cells read settings.HOPSWORKS_API_KEY and would
    # otherwise fail with a NameError when no .env file is present.
    # NOTE(review): assumes HopsworksSettings can be constructed from process
    # environment variables alone — confirm against config.py.
    settings = config.HopsworksSettings()

Local environment
Added the following directory to the PYTHONPATH: /Users/annastegaras/Desktop/ID2223/project/ML_Scale_Project
HopsworksSettings initialized!


In [2]:
import hopsworks
# If the loaded settings carry an API key, export it as HOPSWORKS_API_KEY
# before logging in.
hopsworks_secret = settings.HOPSWORKS_API_KEY
if hopsworks_secret is not None:
    api_key = hopsworks_secret.get_secret_value()
    os.environ['HOPSWORKS_API_KEY'] = api_key

# Log in and grab a handle to the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

2026-01-05 19:13:12,013 INFO: Initializing external client
2026-01-05 19:13:12,014 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-05 19:13:13,839 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286325


FLIGHT DATA LOADING

In [3]:
import pandas as pd
# Read the flight CSV; the raw frame is kept under its own name so the parsed
# frame below is derived from it rather than mutated in place.
csv_file = "data/arlanda_flights_2020_2025.csv"
flights_raw = pd.read_csv(csv_file)

# Parse the timestamp strings into timezone-aware UTC datetimes.
flight_ts_utc = pd.to_datetime(
    flights_raw['flight_date'],
    format='%Y-%m-%d %H:%M:%S.%f %Z',
    utc=True,
)

# Floor each timestamp to midnight (in UTC) and drop the timezone so the column
# is a plain calendar date.
# NOTE(review): if the source timestamps encode local (Europe/Stockholm) midnights,
# flooring in UTC moves every row to the previous calendar day — confirm.
flight_day = (
    flight_ts_utc
    .dt.tz_convert('UTC')
    .dt.normalize()
    .dt.tz_localize(None)
)

df = (
    flights_raw
    .assign(flight_date=flight_day)
    .rename(columns={'flight_date': 'date'})
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2186 entries, 0 to 2185
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            2186 non-null   datetime64[ns]
 1   total_landings  2186 non-null   int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 34.3 KB


In [5]:
import util
import datetime
# Weather is requested from the first date in the flight data up to today.
today = datetime.date.today()
earliest_flight_date = df['date'].min().strftime('%Y-%m-%d')

# Location label and coordinates passed to the weather helper
# (the coordinates are the Arlanda values; they are passed as strings).
city, latitude, longitude = "Märsta", "59.6519", "17.92"

weather_df = util.get_historical_weather(
    city,
    earliest_flight_date,
    str(today),
    latitude,
    longitude,
)

weather_df.tail()

Coordinates 59.64850616455078°N 17.958715438842773°E
Elevation 28.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


Unnamed: 0,date,temperature_2m_mean,precipitation_sum,wind_speed_10m_max,wind_direction_10m_dominant,city
2193,2026-01-01,-0.80625,8.900002,30.027107,127.54464,Märsta
2194,2026-01-02,-1.810417,7.299999,26.950151,30.843538,Märsta
2195,2026-01-03,-2.852083,5.5,29.582493,28.580275,Märsta
2196,2026-01-04,-5.272917,0.0,23.678722,29.966925,Märsta
2197,2026-01-05,-7.7875,0.8,17.534994,48.65551,Märsta


In [6]:
# Last calendar day of weather to keep.
# NOTE(review): hard-coded; presumably the latest date in the flight data — confirm,
# and consider deriving it from the flights frame instead.
cutoff = pd.Timestamp('2025-12-27')

# Parse and normalize the dates BEFORE filtering, so the cutoff comparison is made on
# whole days (a non-midnight timestamp on the cutoff day is kept) and does not rely
# on the column already being a datetime.
weather_df = weather_df.assign(
    date=pd.to_datetime(weather_df['date'], errors='coerce').dt.normalize()
)

# Rows whose date could not be parsed become NaT and are dropped by this filter.
weather_df = weather_df[weather_df['date'] <= cutoff].copy()

# Last expression of the cell, so the trimmed tail is actually displayed.
weather_df.tail()

In [7]:
# Dates to exclude from the weather data.
# NOTE(review): presumably days with no rows in the flight data — confirm.
missing_days = pd.to_datetime(['2023-12-02', '2023-12-03', '2023-12-04'])

# Keep every weather row whose date is not one of the excluded days.
is_missing_day = weather_df['date'].isin(missing_days)
weather_df = weather_df.loc[~is_missing_day].copy()

In [8]:
# Sanity check before upload: inspect row count, null counts and dtypes of the filtered weather frame.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2186 entries, 0 to 2188
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         2186 non-null   datetime64[ns]
 1   temperature_2m_mean          2186 non-null   float32       
 2   precipitation_sum            2186 non-null   float32       
 3   wind_speed_10m_max           2186 non-null   float32       
 4   wind_direction_10m_dominant  2186 non-null   float32       
 5   city                         2186 non-null   object        
dtypes: datetime64[ns](1), float32(4), object(1)
memory usage: 85.4+ KB


In [11]:
# Look up version 1 of the `weather_flight` feature group, or define it if it does
# not exist yet. `city` is the primary key and `date` is the event-time column.
weather_fg = fs.get_or_create_feature_group(
    name='weather_flight',
    version=1,
    description='Weather characteristics of each day',
    event_time="date",
    primary_key=['city'],
)

In [12]:
# Insert data: upload the filtered weather frame into the feature group.
# `wait=True` is expected to block until the materialization job launched by the
# insert has finished — confirm against the hsfs documentation.
weather_fg.insert(weather_df, wait=True)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286325/fs/1265794/fg/1908083


Uploading Dataframe: 100.00% |█| Rows 2186/2186 | Elapsed Time: 00:01 | Remainin


Launching job: weather_flight_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286325/jobs/named/weather_flight_1_offline_fg_materialization/executions
2026-01-05 19:14:48,035 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-05 19:14:51,247 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-05 19:17:06,297 INFO: Waiting for log aggregation to finish.
2026-01-05 19:17:21,998 INFO: Execution finished successfully.


(Job('weather_flight_1_offline_fg_materialization', 'SPARK'), None)

In [13]:
# Description text to store for each feature of the weather feature group.
# Keeping them in one mapping avoids six copy-pasted calls and keeps names and
# descriptions side by side.
feature_descriptions = {
    "date": "Date of measurement of weather",
    "city": "City where weather is measured/forecast for",
    "temperature_2m_mean": "Temperature in Celsius",
    "precipitation_sum": "Rain or snow in mm",
    "wind_speed_10m_max": "Wind speed at 10m above ground",  # typo fixed: was "abouve"
    "wind_direction_10m_dominant": "Dominant Wind direction over the day",
}

# Apply the descriptions in the order listed above.
for feature_name, description in feature_descriptions.items():
    weather_fg.update_feature_description(feature_name, description)

<hsfs.feature_group.FeatureGroup at 0x1a4dc40d0>