# Initialize Spark

In [1]:
import os

from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col, max, to_date

from delta.tables import DeltaTable

# Spark configuration: S3A access to the MinIO endpoint, driver/executor
# memory, and the Delta Lake log store, SQL extension and catalog.
conf = (
    SparkConf()
    .setAppName("Spark minIO Test")
    .set("spark.hadoop.fs.s3a.endpoint", "http://192.168.86.192:9000")
    # Credentials come from the environment. Indexing os.environ fails
    # immediately with a KeyError when a variable is missing, instead of
    # silently configuring the literal string "None" as a credential.
    .set("spark.hadoop.fs.s3a.access.key", os.environ['MINIO_ROOT_USER'])
    .set("spark.hadoop.fs.s3a.secret.key", os.environ['MINIO_ROOT_PASSWORD'])
    # Passed as a lowercase string, the usual spelling for Hadoop booleans.
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.driver.memory", "8g")
    .set("spark.executor.memory", "8g")
    .set("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Use the class-level getOrCreate so that re-running this cell reuses the
# active context instead of trying to construct a second one.
sc = SparkContext.getOrCreate(conf=conf)

# Obtain the session through the builder, handing it the same configuration.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

21/08/25 20:33:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/08/25 20:33:19 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Read data from the bronze table

In [7]:
# Weather stations is a small dimension, so we simply regenerate it every time.
bronze_reader = spark.read.format('delta')
df = bronze_reader.load('s3a://bronze-knmi/weather_stations')

# Preview the first five bronze rows.
df.show(5)

+--------------------+---------+--------+--------+---------------+--------------------+
|weather_station_code|longitude|latitude|altitude|weather_station|       load_datetime|
+--------------------+---------+--------+--------+---------------+--------------------+
|                 283|    6.657|  52.069|   29.10|         Hupsel|2021-08-24 19:47:...|
|                 340|    4.342|  51.449|   19.20|    Woensdrecht|2021-08-24 19:47:...|
|                 277|    6.200|  53.413|    2.90|     Lauwersoog|2021-08-24 19:47:...|
|                 375|    5.707|  51.659|   22.00|         Volkel|2021-08-24 19:47:...|
|                 312|    3.622|  51.768|    0.00|  Oosterschelde|2021-08-24 19:47:...|
+--------------------+---------+--------+--------+---------------+--------------------+
only showing top 5 rows



### Transform and rename the columns

In [9]:
# Expose the bronze DataFrame to Spark SQL under a temporary view name.
df.createOrReplaceTempView("bronze_weather_stations")

# Casts the coordinates and altitude to DECIMAL(11,8) and derives a validity
# window per station: valid_from is the load date, valid_to is the load date of
# the next row for the same station, or 9999-12-31 when there is no next row.
silver_query = """
                select weather_station_code
                     , cast(longitude as DECIMAL(11,8)) as longitude
                     , cast(latitude as DECIMAL(11,8)) as latitude
                     , cast(altitude as DECIMAL(11,8)) as altitude
                     , weather_station
                     , to_date(load_datetime) as valid_from
                     , coalesce(lead(to_date(load_datetime)) OVER (PARTITION BY weather_station_code ORDER BY load_datetime), to_date('9999-12-31')) as valid_to
                from bronze_weather_stations
                order by weather_station_code, valid_from
            """

# Run the query and collapse the result to a single partition.
silver_result = spark.sql(silver_query)
silver_df = silver_result.coalesce(1)

silver_df.show()

+--------------------+----------+-----------+-----------+------------------+----------+----------+
|weather_station_code| longitude|   latitude|   altitude|   weather_station|valid_from|  valid_to|
+--------------------+----------+-----------+-----------+------------------+----------+----------+
|                 209|4.51800000|52.46500000|       0E-8|            IJmond|2021-08-24|9999-12-31|
|                 210|4.43000000|52.17100000|-0.20000000|     Valkenburg Zh|2021-08-24|9999-12-31|
|                 215|4.43700000|52.14100000|-1.10000000|       Voorschoten|2021-08-24|9999-12-31|
|                 225|4.55500000|52.46300000| 4.40000000|          IJmuiden|2021-08-24|9999-12-31|
|                 235|4.78100000|52.92800000| 1.20000000|           De Kooy|2021-08-24|9999-12-31|
|                 240|4.79000000|52.31800000|-3.30000000|          Schiphol|2021-08-24|9999-12-31|
|                 242|4.92100000|53.24100000|10.80000000|          Vlieland|2021-08-24|9999-12-31|
|         

In [10]:
# Register the table name silver_knmi_daggegevens over the Delta data at the
# given location, unless a table with that name already exists.
# NOTE(review): this targets the daggegevens location while the other cells in
# this notebook read and write weather_stations — confirm this cell belongs here.
create_table_sql = """
create table if not exists silver_knmi_daggegevens
using delta 
location 's3a://silver-knmi/daggegevens'
"""
spark.sql(create_table_sql)

++
||
++
++



In [10]:
# Print the column names and data types of the silver_knmi_daggegevens table.
describe_sql = """
DESCRIBE silver_knmi_daggegevens
"""
table_description = spark.sql(describe_sql)
table_description.show()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|weather_station_code|   string|       |
|                date|     date|       |
|vector_mean_wind_...|      int|       |
|vector_mean_winds...|   double|       |
|daily_mean_windsp...|   double|       |
|max_windspeed_in_...|   double|       |
|max_windspeed_hou...|   string|       |
|min_windspeed_in_...|   double|       |
|min_windspeed_hou...|   string|       |
|max_windgust_in_m...|   double|       |
|max_windgust_hour...|   string|       |
|daily_mean_temper...|   double|       |
| minimum_temperature|   double|       |
|minimum_temperatu...|   string|       |
| maximum_temperature|   double|       |
|maximum_temperatu...|   string|       |
|max_temp_at_10cm_...|   double|       |
|max_temp_at_10cm_...|   string|       |
|sunshine_duration...|   double|       |
|percentage_of_max...|   string|       |
+--------------------+---------+-------+
only showing top

### Write to the silver table

In [11]:
# Location of the silver weather-stations Delta table (single definition so the
# existence check, the initial write and the merge all use the same path).
silver_weather_stations_path = 's3a://silver-knmi/weather_stations'

if not DeltaTable.isDeltaTable(spark, silver_weather_stations_path):
    # First run: nothing to merge into yet, so write the whole DataFrame.
    print("Not a delta table, write the full df")
    silver_df.write.format("delta").save(silver_weather_stations_path)
else:
    print("Merging the data")
    silver_knmi_daggegevens_table = DeltaTable.forPath(spark, silver_weather_stations_path)

    # Match on station code plus valid_from. silver_df has no `date` column
    # (its date columns are valid_from / valid_to), so the previous condition
    # on `date` could not be resolved against either side of the merge.
    # NOTE(review): this merge only inserts unmatched rows; rows already in the
    # table keep their stored valid_to. Confirm whether existing rows should
    # also be updated when a newer version of a station arrives.
    (
        silver_knmi_daggegevens_table.alias("current_data")
        .merge(
            silver_df.alias("new_data"),
            """
            current_data.weather_station_code = new_data.weather_station_code
            and
            current_data.valid_from = new_data.valid_from
            """,
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

Merging the data


NameError: name 'silver_df' is not defined