In [2]:
import os
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col

def _required_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. ``SparkConf.set``
            stringifies its value, so a missing variable read with
            ``os.getenv`` would otherwise be sent to MinIO as the literal
            credential "None".
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            "Environment variable {} must be set before starting Spark".format(name)
        )
    return value


# Spark configuration: S3A access to the MinIO endpoint plus the Delta Lake
# SQL extension, catalog and log store.
conf = (
    SparkConf()
    .setAppName("Spark minIO Test")
    .set("spark.hadoop.fs.s3a.endpoint", "http://192.168.86.192:9000")
    .set("spark.hadoop.fs.s3a.access.key", _required_env('MINIO_ROOT_USER'))
    .set("spark.hadoop.fs.s3a.secret.key", _required_env('MINIO_ROOT_PASSWORD'))
    # Passed as the string Hadoop expects instead of relying on str(True).
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.driver.memory", "8g")
    .set("spark.executor.memory", "8g")
    .set("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# getOrCreate is a classmethod: calling it on the class reuses the active
# context when this cell is re-run, whereas SparkContext(conf=conf) raises if
# a context already exists in this kernel.
sc = SparkContext.getOrCreate(conf=conf)

# The builder attaches to the active SparkContext created above, so the
# session inherits every setting in `conf`.
spark = SparkSession.builder.getOrCreate()

21/08/24 18:00:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/08/24 18:00:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/08/24 18:00:11 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [11]:
# Read every object under the landing prefix into an RDD of raw text lines.
weather_stations_file = sc.textFile('s3a://landing-knmi/weather_stations/*')

In [12]:
# Peek at the first 50 raw lines to see how the file's '#'-prefixed header is laid out.
weather_stations_file.take(50)

['# Opmerking: door stationsverplaatsingen en veranderingen in waarneemmethodieken zijn deze tijdreeksen van uurwaarden mogelijk inhomogeen! Dat betekent dat deze reeks van gemeten waarden niet geschikt is voor trendanalyse. Voor studies naar klimaatverandering verwijzen we naar de gehomogeniseerde dagreeksen <http://www.knmi.nl/nederland-nu/klimatologie/daggegevens> of de Centraal Nederland Temperatuur <http://www.knmi.nl/kennis-en-datacentrum/achtergrond/centraal-nederland-temperatuur-cnt>.',
 '# ',
 '# SOURCE: ROYAL NETHERLANDS METEOROLOGICAL INSTITUTE (KNMI)',
 '# Comment: These time series are inhomogeneous because of station relocations and changes in observation techniques. As a result these series are not suitable for trend analysis. For climate change studies we refer to the homogenized series of daily data <http://www.knmi.nl/nederland-nu/klimatologie/daggegevens> or the Central Netherlands Temperature <http://www.knmi.nl/kennis-en-datacentrum/achtergrond/centraal-nederland-t

In [104]:
import re
# Keep only the station-table lines from the comment header: the column-name
# row ('# STN ...') and the rows that start with a three-digit station code.
# The regex is a raw string so that \d is not treated as an (invalid) string
# escape, and only the leading '# ' marker is removed (count=1) so any later
# '# ' inside a line is left untouched.
weather_stations = (
    weather_stations_file
    .filter(lambda line: line.startswith('# STN         ') or re.match(r'# \d\d\d', line))
    .map(lambda line: line.replace('# ', '', 1))
)

In [105]:
from pyspark.sql.functions import trim
# Preview the first five records with surrounding whitespace stripped.
# The result is not assigned, so weather_stations itself is unchanged.
weather_stations.map(lambda x: x.strip()).take(5)

['STN         LON(east)   LAT(north)  ALT(m)      NAME',
 '209         4.518       52.465      0.00        IJmond',
 '210         4.430       52.171      -0.20       Valkenburg Zh',
 '215         4.437       52.141      -1.10       Voorschoten',
 '225         4.555       52.463      4.40        IJmuiden']

In [106]:
# Turn the fixed-width layout into ';'-delimited records: trim each line, then
# collapse every run of two or more whitespace characters into one ';'.
# Single spaces are kept, so multi-word station names stay intact.
# The pattern is a raw string so that \s is not treated as an (invalid)
# string escape.
# NOTE: this rebinds weather_stations; re-running the cell should be a no-op
# because delimited lines no longer contain whitespace runs.
weather_stations = weather_stations.map(
    lambda line: re.sub(r'\s\s+', ';', line.strip())
)

In [107]:
# Inspect the first 20 ';'-delimited records (header row included).
weather_stations.take(20)

['STN;LON(east);LAT(north);ALT(m);NAME',
 '209;4.518;52.465;0.00;IJmond',
 '210;4.430;52.171;-0.20;Valkenburg Zh',
 '215;4.437;52.141;-1.10;Voorschoten',
 '225;4.555;52.463;4.40;IJmuiden',
 '235;4.781;52.928;1.20;De Kooy',
 '240;4.790;52.318;-3.30;Schiphol',
 '242;4.921;53.241;10.80;Vlieland',
 '248;5.174;52.634;0.80;Wijdenes',
 '249;4.979;52.644;-2.40;Berkhout',
 '251;5.346;53.392;0.70;Hoorn Terschelling',
 '257;4.603;52.506;8.50;Wijk aan Zee',
 '258;5.401;52.649;7.30;Houtribdijk',
 '260;5.180;52.100;1.90;De Bilt',
 '265;5.274;52.130;13.90;Soesterberg',
 '267;5.384;52.898;-1.30;Stavoren',
 '269;5.520;52.458;-3.70;Lelystad',
 '270;5.752;53.224;1.20;Leeuwarden',
 '273;5.888;52.703;-3.30;Marknesse',
 '275;5.873;52.056;48.20;Deelen']

In [108]:
# Preview: break each delimited record into its list of field values.
weather_stations.map(
    lambda record: record.split(';')
).take(5)

[['STN', 'LON(east)', 'LAT(north)', 'ALT(m)', 'NAME'],
 ['209', '4.518', '52.465', '0.00', 'IJmond'],
 ['210', '4.430', '52.171', '-0.20', 'Valkenburg Zh'],
 ['215', '4.437', '52.141', '-1.10', 'Voorschoten'],
 ['225', '4.555', '52.463', '4.40', 'IJmuiden']]

In [103]:
# Split each record on ';' and convert to a DataFrame without a schema;
# the columns get Spark's default names (_1 .. _5), as the output shows.
weather_stations.map(lambda x: x.split(';')).toDF().show()

+---+-----+------+-----+------------------+
| _1|   _2|    _3|   _4|                _5|
+---+-----+------+-----+------------------+
|209|4.518|52.465| 0.00|            IJmond|
|210|4.430|52.171|-0.20|     Valkenburg Zh|
|215|4.437|52.141|-1.10|       Voorschoten|
|225|4.555|52.463| 4.40|          IJmuiden|
|235|4.781|52.928| 1.20|           De Kooy|
|240|4.790|52.318|-3.30|          Schiphol|
|242|4.921|53.241|10.80|          Vlieland|
|248|5.174|52.634| 0.80|          Wijdenes|
|249|4.979|52.644|-2.40|          Berkhout|
|251|5.346|53.392| 0.70|Hoorn Terschelling|
|257|4.603|52.506| 8.50|      Wijk aan Zee|
|258|5.401|52.649| 7.30|       Houtribdijk|
|260|5.180|52.100| 1.90|           De Bilt|
|265|5.274|52.130|13.90|       Soesterberg|
|267|5.384|52.898|-1.30|          Stavoren|
|269|5.520|52.458|-3.70|          Lelystad|
|270|5.752|53.224| 1.20|        Leeuwarden|
|273|5.888|52.703|-3.30|         Marknesse|
|275|5.873|52.056|48.20|            Deelen|
|277|6.200|53.413| 2.90|        

In [117]:
# The first split record is the column-name row; keep it as the header list.
header = (
    weather_stations
    .map(lambda record: record.split(';'))
    .first()
)
header

['STN', 'LON(east)', 'LAT(north)', 'ALT(m)', 'NAME']

In [118]:
# All split records except those equal to the header row.
data = (
    weather_stations
    .map(lambda record: record.split(';'))
    .filter(lambda fields: fields != header)
)

In [121]:
# Build the DataFrame, naming the columns after the header fields.
df = data.toDF(schema=header)
df.show()

+---+---------+----------+------+------------------+
|STN|LON(east)|LAT(north)|ALT(m)|              NAME|
+---+---------+----------+------+------------------+
|209|    4.518|    52.465|  0.00|            IJmond|
|210|    4.430|    52.171| -0.20|     Valkenburg Zh|
|215|    4.437|    52.141| -1.10|       Voorschoten|
|225|    4.555|    52.463|  4.40|          IJmuiden|
|235|    4.781|    52.928|  1.20|           De Kooy|
|240|    4.790|    52.318| -3.30|          Schiphol|
|242|    4.921|    53.241| 10.80|          Vlieland|
|248|    5.174|    52.634|  0.80|          Wijdenes|
|249|    4.979|    52.644| -2.40|          Berkhout|
|251|    5.346|    53.392|  0.70|Hoorn Terschelling|
|257|    4.603|    52.506|  8.50|      Wijk aan Zee|
|258|    5.401|    52.649|  7.30|       Houtribdijk|
|260|    5.180|    52.100|  1.90|           De Bilt|
|265|    5.274|    52.130| 13.90|       Soesterberg|
|267|    5.384|    52.898| -1.30|          Stavoren|
|269|    5.520|    52.458| -3.70|          Lel

In [123]:
# Rename the source columns to descriptive snake_case names, keeping the
# original column order.
weather_stations_df = df.select(
    *[
        col(source_name).alias(target_name)
        for source_name, target_name in (
            ('STN', 'weather_station_code'),
            ('LON(east)', 'longitude'),
            ('LAT(north)', 'latitude'),
            ('ALT(m)', 'altitude'),
            ('NAME', 'weather_station'),
        )
    ]
)
weather_stations_df.show()

+--------------------+---------+--------+--------+------------------+
|weather_station_code|longitude|latitude|altitude|   weather_station|
+--------------------+---------+--------+--------+------------------+
|                 209|    4.518|  52.465|    0.00|            IJmond|
|                 210|    4.430|  52.171|   -0.20|     Valkenburg Zh|
|                 215|    4.437|  52.141|   -1.10|       Voorschoten|
|                 225|    4.555|  52.463|    4.40|          IJmuiden|
|                 235|    4.781|  52.928|    1.20|           De Kooy|
|                 240|    4.790|  52.318|   -3.30|          Schiphol|
|                 242|    4.921|  53.241|   10.80|          Vlieland|
|                 248|    5.174|  52.634|    0.80|          Wijdenes|
|                 249|    4.979|  52.644|   -2.40|          Berkhout|
|                 251|    5.346|  53.392|    0.70|Hoorn Terschelling|
|                 257|    4.603|  52.506|    8.50|      Wijk aan Zee|
|                 25

In [124]:
from pyspark.sql.functions import current_timestamp
# Stamp every row with a load timestamp column.
weather_stations_df = weather_stations_df.withColumn(
    'load_datetime',
    current_timestamp(),
)

In [125]:
# Display the renamed columns together with the new load_datetime column.
weather_stations_df.show()

+--------------------+---------+--------+--------+------------------+--------------------+
|weather_station_code|longitude|latitude|altitude|   weather_station|       load_datetime|
+--------------------+---------+--------+--------+------------------+--------------------+
|                 209|    4.518|  52.465|    0.00|            IJmond|2021-08-24 19:46:...|
|                 210|    4.430|  52.171|   -0.20|     Valkenburg Zh|2021-08-24 19:46:...|
|                 215|    4.437|  52.141|   -1.10|       Voorschoten|2021-08-24 19:46:...|
|                 225|    4.555|  52.463|    4.40|          IJmuiden|2021-08-24 19:46:...|
|                 235|    4.781|  52.928|    1.20|           De Kooy|2021-08-24 19:46:...|
|                 240|    4.790|  52.318|   -3.30|          Schiphol|2021-08-24 19:46:...|
|                 242|    4.921|  53.241|   10.80|          Vlieland|2021-08-24 19:46:...|
|                 248|    5.174|  52.634|    0.80|          Wijdenes|2021-08-24 19:46:...|

In [128]:
from delta.tables import DeltaTable
# Initial load of the bronze weather-stations Delta table.
# The data is only written when the target path is not yet a Delta table;
# an existing table is left untouched.
if not DeltaTable.isDeltaTable(spark, 's3a://bronze-knmi/weather_stations'):
    print("Not a delta table, write the full df")
    (
        weather_stations_df
        .dropDuplicates()
        .coalesce(1)  # write a single output file
        .write.format("delta")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .save('s3a://bronze-knmi/weather_stations')
    )
else:
    # Say so explicitly instead of a silent `pass`, so the reader can tell
    # from the cell output that nothing was written.
    print("Delta table already exists, skipping the initial load")

# Register the table on every run, not only after the first write: when the
# Delta files already exist but this session's catalog has no entry for them,
# later queries against bronze_knmi_weather_stations would otherwise fail.
# `if not exists` makes this a no-op when the table is already registered.
spark.sql("""
        create table if not exists bronze_knmi_weather_stations
        using delta 
        location 's3a://bronze-knmi/weather_stations'
        """)

In [131]:
# Query the registered table to show ten of its rows.
spark.sql("""
select * from bronze_knmi_weather_stations limit 10
""").show()

+--------------------+---------+--------+--------+-------------------+--------------------+
|weather_station_code|longitude|latitude|altitude|    weather_station|       load_datetime|
+--------------------+---------+--------+--------+-------------------+--------------------+
|                 283|    6.657|  52.069|   29.10|             Hupsel|2021-08-24 19:47:...|
|                 340|    4.342|  51.449|   19.20|        Woensdrecht|2021-08-24 19:47:...|
|                 277|    6.200|  53.413|    2.90|         Lauwersoog|2021-08-24 19:47:...|
|                 375|    5.707|  51.659|   22.00|             Volkel|2021-08-24 19:47:...|
|                 312|    3.622|  51.768|    0.00|      Oosterschelde|2021-08-24 19:47:...|
|                 273|    5.888|  52.703|   -3.30|          Marknesse|2021-08-24 19:47:...|
|                 348|    4.926|  51.970|   -0.70|        Cabauw Mast|2021-08-24 19:47:...|
|                 343|    4.313|  51.893|    3.50|Rotterdam Geulhaven|2021-08-24