In [0]:
import requests
import pandas as pd

import json

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, DoubleType
from pyspark.ml.feature import VectorAssembler, StandardScaler

from pyspark.sql.utils import AnalysisException



## Selected features from Open Meteo (from EDA)
Features from Open Meteo were explored in the EDA notebook.

In [0]:
# Feature set used for clustering, as selected in the EDA notebook.
# "pressure_msl" and "visibility" were removed due to high interference with clustering.
feature_cols = [
    "temperature_2m",
    "precipitation",
    "wind_gusts_10m",
    "cloud_cover",
]

## Open-Meteo request for historical data
Load one year of hourly historical forecast data for Lisbon from the Open-Meteo API. This data serves as the training data.

In [0]:
# Open-Meteo endpoint serving archived (historical) forecast data
url = "https://historical-forecast-api.open-meteo.com/v1/forecast"
params = {
    # Lisbon coordinates
    "latitude": 38.716885,
    "longitude": -9.140233,
    # 1 year of historical data
    "start_date": "2024-06-01",
    "end_date": "2025-06-01",
    # Request exactly the variables selected for clustering
    "hourly": feature_cols,
}

# Bound the wait so a stalled connection cannot hang the notebook indefinitely
response = requests.get(url, params=params, timeout=60)
# Fail here on HTTP errors instead of with an opaque KeyError further down
response.raise_for_status()
data = response.json()

# Guard against a well-formed response that carries no hourly payload
if "hourly" not in data:
    raise ValueError(f"Open-Meteo response has no 'hourly' section: {data}")

# One row per hour: a "time" column plus one column per requested variable
df = pd.DataFrame(data["hourly"])

In [0]:
# Inspect the column dtypes pandas inferred for the hourly data
df.dtypes

Out[4]: time               object
temperature_2m    float64
precipitation     float64
wind_gusts_10m    float64
cloud_cover         int64
dtype: object

In [0]:
# Create Spark dataframe for all further operations
sdf = spark.createDataFrame(df)

# Cast columns to types
sdf = sdf.select(
    col("time").cast(StringType()),
    col("temperature_2m").cast(DoubleType()),
    col("precipitation").cast(DoubleType()),
    col("wind_gusts_10m").cast(DoubleType()),
    col("cloud_cover").cast(DoubleType()),
    # col("pressure_msl").cast(DoubleType()), # removed due to high interference with clustering
    # col("visibility").cast(DoubleType()), # removed due to high interference with clustering
)

sdf.printSchema()
display(sdf.limit(10))

root
 |-- time: string (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- wind_gusts_10m: double (nullable = true)
 |-- cloud_cover: double (nullable = true)



time,temperature_2m,precipitation,wind_gusts_10m,cloud_cover
2024-06-01T00:00,23.4,0.0,8.3,0.0
2024-06-01T01:00,21.9,0.0,7.9,0.0
2024-06-01T02:00,20.9,0.0,7.2,0.0
2024-06-01T03:00,20.9,0.0,4.7,0.0
2024-06-01T04:00,20.8,0.0,3.2,1.0
2024-06-01T05:00,20.3,0.0,4.3,11.0
2024-06-01T06:00,19.5,0.0,4.3,1.0
2024-06-01T07:00,20.6,0.0,6.8,0.0
2024-06-01T08:00,22.2,0.0,13.7,0.0
2024-06-01T09:00,24.2,0.0,15.8,0.0


## Assembly and Scaling
The features are assembled and scaled once here so that all model trainings use the same scaled data and this step does not have to be re-executed.

In [0]:
# Combine different feature columns into one single column (adds a "features" column of type vector)
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_sdf = assembler.transform(sdf)

# Define scaler
scaler = StandardScaler(
    inputCol="features", # assembled vector
    outputCol="features_scaled", # standardized vector
    withMean=True, # centered around 0 (subtract mean)
    withStd=True # std dev 1
)

scaler_model = scaler.fit(assembled_sdf)
scaled_sdf = scaler_model.transform(assembled_sdf)

# Save scaler model for usage in streaming
scaler_model.write().overwrite().save("/dbfs/tmp/scaler_model")

scaled_sdf.select("features", "features_scaled").show(truncate=False) # Check

+-------------------+---------------------------------------------------------------------------------+
|features           |features_scaled                                                                  |
+-------------------+---------------------------------------------------------------------------------+
|[23.4,0.0,8.3,0.0] |[1.1438667750464577,-0.2045818640908716,-1.1156504701003993,-1.3754718072117746] |
|[21.9,0.0,7.9,0.0] |[0.8548976623774065,-0.2045818640908716,-1.1453730193555045,-1.3754718072117746] |
|[20.9,0.0,7.2,0.0] |[0.6622515872647056,-0.2045818640908716,-1.1973874805519384,-1.3754718072117746] |
|[20.9,0.0,4.7,0.0] |[0.6622515872647056,-0.2045818640908716,-1.3831534133963457,-1.3754718072117746] |
|[20.8,0.0,3.2,1.0] |[0.6429869797534359,-0.2045818640908716,-1.4946129731029898,-1.3498521693545613] |
|[20.3,0.0,4.3,11.0]|[0.5466639421970855,-0.2045818640908716,-1.4128759626514507,-1.0936557907824282] |
|[19.5,0.0,4.3,1.0] |[0.39254708210692457,-0.2045818640908716,-1

## Saving to DBFS as a parquet file
Because the data has a fixed schema and we do not need ACID transactions, we save it as a Parquet table rather than a Delta Lake table.

In [0]:
# Permanent table name
permanent_table_name = "historical_weather_hourly_lisbon"

# Try to delete table
try:
    dbutils.fs.rm("dbfs:/user/hive/warehouse/historical_weather_hourly_lisbon", recurse=True)
    print("Deleted previously saved table successfully.")
except Exception as e:
    print("Path may not exist or could not be deleted:", str(e))

# Save table as parquet
scaled_sdf.write.mode("overwrite").format("parquet").saveAsTable(permanent_table_name)

# Save used feature cols
dbutils.fs.put("dbfs:/tmp/feature_cols.json", json.dumps(feature_cols), overwrite=True)

Deleted previously saved table successfully.
Wrote 68 bytes.
Out[7]: True