In [0]:
import requests
import pandas as pd
import math

import json
from functools import reduce

# Matplotlib for visualization at the end
import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql.functions import col, when, sum, when,lit, udf

from pyspark.sql.types import StructType, StructField, StringType, DoubleType

from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator



## Grouping the features

In [0]:
# Hourly weather variables, grouped by theme.
# Each list is later requested from the API as one batch.

# Wind speed, direction and gusts
wind = [
    "wind_speed_10m",
    "wind_speed_120m",
    "wind_direction_10m",
    "wind_direction_120m",
    "wind_gusts_10m",
]

# Cloud cover per layer, plus visibility
cloud_cover = [
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
    "visibility",
]

# Temperature and humidity related variables:
#   relative_humidity_2m    - ratio of the actual water vapour in the air to the
#                             maximum the air can hold at that temperature (in %)
#   dew_point_2m            - temperature at which the air becomes saturated with moisture
#   vapour_pressure_deficit - difference between the amount of moisture in the air
#                             and how much it can hold when saturated
temperature = [
    "apparent_temperature",
    "temperature_2m",
    "temperature_120m",
    "relative_humidity_2m",
    "dew_point_2m",
    "vapour_pressure_deficit",
]

# Rain and snow
rain_snow = [
    "precipitation_probability",
    "precipitation",
    "rain",
    "showers",
    "snowfall",
    "snow_depth",
]

# Atmospheric pressure
pressure = [
    "surface_pressure",
    "pressure_msl",
]

# Evapotranspiration
water_balance = [
    "evapotranspiration",
    "et0_fao_evapotranspiration",
]


## Loading the data

In [0]:
url = "https://historical-forecast-api.open-meteo.com/v1/forecast"

# Maps each feature-group name to the Spark DataFrame downloaded for it.
dfs = {}

# One API request is made per feature group. The 'hourly' section of each
# response is loaded into a pandas DataFrame and then converted into a Spark
# DataFrame.
features = {
    'wind': wind,
    'cloud_cover': cloud_cover,
    'temperature': temperature,
    'rain_snow': rain_snow,
    'pressure': pressure,
    'water_balance': water_balance
}

for key, value in features.items():
    params = {
        # Lisbon coordinates
        "latitude": 38.716885,
        "longitude": -9.140233,
        # Two years of hourly historical data (2023-05-01 to 2025-05-01)
        "start_date": "2023-05-01",
        "end_date": "2025-05-01",
        "hourly": value
    }

    # requests has no default timeout; without one a stalled connection
    # would block this cell indefinitely.
    response = requests.get(url, params=params, timeout=60)
    # Fail right here on an HTTP error instead of later with an unrelated
    # KeyError when the payload is parsed.
    response.raise_for_status()
    data = response.json()
    if 'hourly' not in data:
        raise ValueError(
            f"Response for feature group '{key}' has no 'hourly' section: {data}"
        )

    pdf = pd.DataFrame(data['hourly'])
    # `spark` is not created in the visible cells; it is assumed to be the
    # SparkSession supplied by the notebook runtime.
    dfs[key] = spark.createDataFrame(pdf)

In [0]:
# Inspect the column names of one group (wind) as an example
dfs["wind"].columns


Out[4]: ['time',
 'wind_speed_10m',
 'wind_speed_120m',
 'wind_direction_10m',
 'wind_direction_120m',
 'wind_gusts_10m']

In [0]:
def _outer_join_on_time(left, right):
    """Outer-join two DataFrames on their shared 'time' column."""
    return left.join(right, on="time", how="outer")


# All per-group DataFrames, in the order they were stored in the dictionary
dfs_list = list(dfs.values())

# Fold the list into a single DataFrame: reduce applies the join pairwise
# from left to right, so the joins do not have to be written out one by one.
df_merged = reduce(_outer_join_on_time, dfs_list)


## Check for missing values

In [0]:
# Count the null entries of every column, one feature group at a time.
# Note: `sum` here is the Spark aggregate imported from pyspark.sql.functions,
# not the Python builtin.
for group_feat, group_df in dfs.items():
    null_counts = [
        sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
        for name in group_df.columns
    ]
    group_df.select(null_counts).show()


+----+--------------+---------------+------------------+-------------------+--------------+
|time|wind_speed_10m|wind_speed_120m|wind_direction_10m|wind_direction_120m|wind_gusts_10m|
+----+--------------+---------------+------------------+-------------------+--------------+
|   0|             0|              0|                 0|                  0|             0|
+----+--------------+---------------+------------------+-------------------+--------------+

+----+-----------+---------------+---------------+----------------+----------+
|time|cloud_cover|cloud_cover_low|cloud_cover_mid|cloud_cover_high|visibility|
+----+-----------+---------------+---------------+----------------+----------+
|   0|          0|              0|              0|               0|         0|
+----+-----------+---------------+---------------+----------------+----------+

+----+--------------------+--------------+----------------+--------------------+------------+-----------------------+
|time|apparent_temperatur

In [0]:
# List the columns of the merged DataFrame
df_merged.columns

Out[33]: ['time',
 'wind_speed_10m',
 'wind_speed_120m',
 'wind_direction_10m',
 'wind_direction_120m',
 'wind_gusts_10m',
 'cloud_cover',
 'cloud_cover_low',
 'cloud_cover_mid',
 'cloud_cover_high',
 'visibility',
 'apparent_temperature',
 'temperature_2m',
 'temperature_120m',
 'relative_humidity_2m',
 'dew_point_2m',
 'vapour_pressure_deficit',
 'precipitation_probability',
 'precipitation',
 'rain',
 'showers',
 'snowfall',
 'snow_depth',
 'surface_pressure',
 'pressure_msl',
 'evapotranspiration',
 'et0_fao_evapotranspiration']

In [0]:
# Drop the only column with missing values (it contains nothing but missing values)
df_merged = df_merged.drop("precipitation_probability")

## Querying the data with SQL

In [0]:
# Register the DataFrame as a temporary view so that it can be queried with SQL
df_merged.createOrReplaceTempView("df_merged")

In [0]:
%sql
-- Preview the rows of the df_merged view
SELECT *
FROM df_merged

time,wind_speed_10m,wind_speed_120m,wind_direction_10m,wind_direction_120m,wind_gusts_10m,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,visibility,apparent_temperature,temperature_2m,temperature_120m,relative_humidity_2m,dew_point_2m,vapour_pressure_deficit,precipitation,rain,showers,snowfall,snow_depth,surface_pressure,pressure_msl,evapotranspiration,et0_fao_evapotranspiration
2023-05-01T00:00,6.2,21.4,21,7,18.0,0,0,0,0,24140.0,16.3,16.6,16.2,73,11.8,0.51,0.0,0.0,0.0,0.0,0.0,1014.5,1018.3,0.01,0.01
2023-05-01T01:00,4.5,19.8,14,2,13.0,0,0,0,0,24140.0,16.3,16.5,16.4,73,11.6,0.51,0.0,0.0,0.0,0.0,0.0,1014.2,1018.0,0.01,0.0
2023-05-01T02:00,3.0,18.8,346,355,10.4,0,0,0,0,24140.0,16.1,16.3,16.7,69,10.6,0.58,0.0,0.0,0.0,0.0,0.0,1013.7,1017.5,0.01,0.0
2023-05-01T03:00,3.2,18.8,360,357,6.8,100,0,0,100,24140.0,15.7,16.1,16.7,67,10.0,0.61,0.0,0.0,0.0,0.0,0.0,1013.7,1017.5,0.01,0.0
2023-05-01T04:00,3.0,17.7,346,355,5.8,42,0,0,42,24140.0,15.6,16.1,16.7,66,9.7,0.62,0.0,0.0,0.0,0.0,0.0,1014.0,1017.8,0.01,0.0
2023-05-01T05:00,3.6,19.4,6,360,8.3,96,0,0,96,24140.0,15.4,16.0,16.6,66,9.7,0.62,0.0,0.0,0.0,0.0,0.0,1014.0,1017.8,0.01,0.0
2023-05-01T06:00,3.4,16.7,32,6,9.0,29,0,0,29,24140.0,15.3,15.8,16.6,68,9.9,0.58,0.0,0.0,0.0,0.0,0.0,1013.9,1017.7,0.01,0.03
2023-05-01T07:00,3.3,14.5,41,14,7.2,0,0,0,0,24140.0,16.4,16.6,16.6,70,11.1,0.57,0.0,0.0,0.0,0.0,0.0,1014.0,1017.8,0.01,0.05
2023-05-01T08:00,5.0,11.7,21,18,14.4,0,0,0,0,24140.0,18.5,19.0,16.9,59,10.8,0.9,0.0,0.0,0.0,0.0,0.0,1014.4,1018.2,0.04,0.17
2023-05-01T09:00,7.0,9.2,21,21,19.4,0,0,0,0,24140.0,20.7,21.5,18.7,50,10.6,1.28,0.0,0.0,0.0,0.0,0.0,1014.5,1018.3,0.09,0.32


In [0]:
%sql
-- Correlations between pairs of wind features.
-- NOTE(review): the direction columns look like angles in degrees, which wrap
-- around, so correlations involving them should be read with care (confirm).
SELECT
    corr(wind_speed_10m,     wind_direction_10m)  AS correlation_speed10_direction10,
    corr(wind_speed_10m,     wind_speed_120m)     AS correlation_speed10_speed120,
    corr(wind_direction_10m, wind_direction_120m) AS correlation_direction10_direction120,
    corr(wind_speed_10m,     wind_gusts_10m)      AS correlation_speed10_gusts10,
    corr(wind_direction_10m, wind_gusts_10m)      AS correlation_direction10_gusts10
FROM df_merged;

correlation_speed10_direction10,correlation_speed10_speed120,correlation_direction10_direction120,correlation_speed10_gusts10,correlation_direction10_gusts10
0.2253539517260512,0.8335900366635401,0.8971379164623488,0.9654900879984376,0.2838115525230414


## Splitting the timestamp into year, month, and hour

In [0]:
from pyspark.sql.functions import to_timestamp

# The 'time' column holds strings such as 2023-05-01T00:00; parse them into a
# proper timestamp column so that date parts can be extracted from it.
# NOTE(review): no time zone is given here; confirm which zone the values are read in.
parsed_time = to_timestamp(col("time"), "yyyy-MM-dd'T'HH:mm")
df_merged = df_merged.withColumn("time_ts", parsed_time)


In [0]:
from pyspark.sql.functions import year, month, hour

# New column name -> function that extracts that part from the timestamp.
# The dictionary order determines the order in which the columns are appended.
calendar_parts = {"year": year, "month": month, "hour": hour}

for part_name, extract_part in calendar_parts.items():
    df_merged = df_merged.withColumn(part_name, extract_part("time_ts"))


In [0]:
# List the columns again to confirm that time_ts, year, month and hour were added
df_merged.columns

Out[52]: ['time',
 'wind_speed_10m',
 'wind_speed_120m',
 'wind_direction_10m',
 'wind_direction_120m',
 'wind_gusts_10m',
 'cloud_cover',
 'cloud_cover_low',
 'cloud_cover_mid',
 'cloud_cover_high',
 'visibility',
 'apparent_temperature',
 'temperature_2m',
 'temperature_120m',
 'relative_humidity_2m',
 'dew_point_2m',
 'vapour_pressure_deficit',
 'precipitation',
 'rain',
 'showers',
 'snowfall',
 'snow_depth',
 'surface_pressure',
 'pressure_msl',
 'evapotranspiration',
 'et0_fao_evapotranspiration',
 'time_ts',
 'year',
 'month',
 'hour']

In [0]:
# Number of rows for every (year, month, hour) combination
rows_per_slot = df_merged.groupBy("year", "month", "hour").count()
rows_per_slot.show()


+----+-----+----+-----+
|year|month|hour|count|
+----+-----+----+-----+
|2023|    5|  18|   31|
|2024|    3|  19|   31|
|2025|    3|  19|   31|
|2024|    2|  10|   29|
|2025|    3|   9|   31|
|2024|    1|  13|   31|
|2024|   11|  13|   30|
|2025|    2|  19|   28|
|2024|   11|  20|   30|
|2023|    9|   4|   30|
|2023|    8|  10|   31|
|2025|    2|  12|   28|
|2025|    5|   5|    1|
|2024|    3|  15|   31|
|2023|   12|   9|   31|
|2024|    7|   0|   31|
|2024|   11|   1|   30|
|2025|    3|  10|   31|
|2023|    7|   4|   31|
|2025|    1|   7|   31|
+----+-----+----+-----+
only showing top 20 rows

