# Initialize Spark

In [2]:
import os
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col

# MinIO credentials come from the environment. Fail fast when they are missing:
# SparkConf.set() stringifies its value, so an unset variable would otherwise be
# sent to the S3A connector as the literal text "None".
minio_access_key = os.getenv('MINIO_ROOT_USER')
minio_secret_key = os.getenv('MINIO_ROOT_PASSWORD')
if not minio_access_key or not minio_secret_key:
    raise RuntimeError(
        "MINIO_ROOT_USER and MINIO_ROOT_PASSWORD must be set in the environment"
    )

# Spark configuration: S3A access to the MinIO endpoint plus the Delta Lake
# SQL extension, catalog and single-driver log store.
conf = (
    SparkConf()
    .setAppName("Spark minIO Test")
    .set("spark.hadoop.fs.s3a.endpoint", "http://192.168.86.192:9000")
    .set("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .set("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    # Configuration values are strings; spell the boolean the way Hadoop documents it.
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.driver.memory", "8g")
    .set("spark.executor.memory", "8g")
    .set("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Go through the builder so that re-running this cell reuses the live session
# instead of trying to construct a second SparkContext (which raises ValueError).
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

21/08/22 21:32:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Determine the last load datetime of the silver table

In [2]:
# Import under an alias so Python's builtin max() is not shadowed for the rest of the notebook.
from pyspark.sql.functions import max as spark_max

# Newest load timestamp already present in the silver table.
# The aggregate always yields exactly one row; its value is None when the table has no rows.
last_load_datetime = (
    spark.read.format('delta')
    .load('s3a://silver-knmi/daggegevens')
    .select(spark_max("load_datetime").alias("last_load_datetime"))
    .first()["last_load_datetime"]
)
print(last_load_datetime)

21/08/22 21:00:56 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
[Stage 5:====>                                                    (1 + 12) / 13]

2021-08-20 07:55:59.056056


                                                                                

### Read data from the bronze table

In [3]:
# Select the bronze rows that were loaded after the newest load already in silver.
bronze_df = spark.read.format('delta').load('s3a://bronze-knmi/daggegevens')
if last_load_datetime is None:
    # No previous silver load: comparing against NULL would silently drop
    # every row, so take the complete bronze table instead.
    df = bronze_df
else:
    df = bronze_df.filter(col("load_datetime") > last_load_datetime)

# Quick look at which measurement dates are in this increment.
df.select("YYYYMMDD").distinct().show()

                                                                                

+--------+
|YYYYMMDD|
+--------+
+--------+



### Transform and rename the columns

In [4]:
from pyspark.sql.functions import to_date, expr

# Map the raw KNMI column codes to descriptive silver column names.
# Columns divided by 10 are stored in bronze in tenths of the target unit;
# the others are passed through unchanged.
# The alias spellings (e.g. "sumshine", "precipitaion", "maximim") are kept
# exactly as they are because they are the column names of the existing silver table.
silver_df = df.select(
    col('STN').alias('weather_station_code'),
    to_date(col('YYYYMMDD'), 'yyyyMMdd').alias('date'),
    # Wind
    col('DDVEC').cast("integer").alias("vector_mean_wind_direction_in_degrees"),
    expr('FHVEC / 10').alias("vector_mean_windspeed_in_meters_per_second"),
    expr('FG / 10').alias("daily_mean_windspeed_in_meters_per_second"),
    expr('FHX / 10').alias("max_windspeed_in_meters_per_second"),
    col('FHXH').alias("max_windspeed_hour_of_day"),
    expr('FHN / 10').alias("min_windspeed_in_meters_per_second"),
    col('FHNH').alias("min_windspeed_hour_of_day"),
    expr('FXX / 10').alias("max_windgust_in_meters_per_second"),
    col('FXXH').alias("max_windgust_hour_of_day"),
    # Temperature
    expr('TG / 10').alias("daily_mean_temperature"),
    expr('TN / 10').alias("minimum_temperature"),
    col('TNH').alias("minimum_temperature_hour_of_day"),
    expr('TX / 10').alias("maximum_temperature"),
    col('TXH').alias("maximum_temperature_hour_of_day"),
    # NOTE(review): KNMI documents T10N as the *minimum* temperature at 10 cm;
    # the "max_" alias is kept for schema compatibility - confirm and rename with a migration.
    expr('T10N / 10').alias("max_temp_at_10cm_above_ground"),
    col('T10NH').alias("max_temp_at_10cm_above_ground_hour_of_day"),
    # Sunshine and radiation
    expr('SQ / 10').alias("sunshine_duration_in_hours"),
    col('SP').alias("percentage_of_max_sumshine_duration"),
    col('Q').alias("global_radiation_in_J_per_cm2"),
    # Precipitation
    expr('DR / 10').alias("precipitation_duration_in_hours"),
    expr('RH / 10').alias("daily_precipitaion_amount_in_mm"),
    expr('RHX / 10').alias("max_hourly_precipitaion_amount_in_mm"),
    col('RHXH').alias("max_hourly_precipitaion_hour_of_day"),
    # Air pressure
    expr('PG / 10').alias("mean_sealevel_pressure_in_hPa"),
    expr('PX / 10').alias("max_sealevel_pressure_in_hPa"),
    col('PXH').alias("max_sealevel_pressure_hour_of_day"),
    expr('PN / 10').alias("min_sealevel_pressure_in_hPa"),
    col('PNH').alias("min_sealevel_pressure_hour_of_day"),
    # Visibility (class code converted to metres)
    expr('VVN * 100').alias("minimum_visibility_in_m"),
    col('VVNH').alias("minimum_visibility_hour_of_day"),
    expr('VVX * 100 + 100').alias("maximum_visibility_in_m"),
    col('VVXH').alias("maximum_visibility_hour_of_day"),
    # Cloud cover and humidity
    col('NG').alias("mean_daily_cloud_cover_in_octants"),
    col('UG').alias("mean_humidity"),
    col('UX').alias("maximim_humidity"),
    col('UXH').alias("maximum_humidity_hour_of_day"),
    # UN is a percentage like UG and UX, so it must not be divided by 10.
    # The cast keeps the column's previous double type. Rows written to silver
    # before this fix still hold the value divided by 10 and need a backfill.
    col('UN').cast("double").alias("minimum_humidity"),
    col('UNH').alias("minimum_humidity_hour_of_day"),
    # Evapotranspiration
    expr('EV24 / 10').alias("potential_evapotranspiration_in_mm"),
    col("load_datetime"),
).coalesce(16)


In [4]:
from delta.tables import DeltaTable

# Register the Delta data at the silver location under the table name
# silver_knmi_daggegevens, unless a table with that name already exists.
create_silver_table_sql = """
create table if not exists silver_knmi_daggegevens
using delta 
location 's3a://silver-knmi/daggegevens'
"""
create_result = spark.sql(create_silver_table_sql)
create_result.show()

21/08/22 21:36:04 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties

++
||
++
++



                                                                                

In [6]:
# List the tables known to the session catalog to confirm the registration.
tables_df = spark.sql("""
show tables
""")
tables_df.show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|silver_knmi_dagge...|      false|
+--------+--------------------+-----------+



### Write to the silver table

In [14]:
# Write the transformed increment to the silver location: merge into the
# Delta table when one exists there, otherwise create it from the full frame.
if DeltaTable.isDeltaTable(spark, 's3a://silver-knmi/daggegevens'):
    print("Merging the data")
    silver_knmi_daggegevens_table = DeltaTable.forPath(spark, 's3a://silver-knmi/daggegevens')

    # Two rows describe the same observation when station and date both match.
    same_station_and_date = """
        current_data.weather_station_code = new_data.weather_station_code
        and
        current_data.date = new_data.date
        """
    merge_builder = silver_knmi_daggegevens_table.alias("current_data").merge(
        silver_df.alias("new_data"),
        same_station_and_date,
    )
    # Insert-only merge: rows that already match in silver are left untouched.
    merge_builder.whenNotMatchedInsertAll().execute()
else:
    print("Not a delta table, write the full df")
    silver_df.write.format("delta").save('s3a://silver-knmi/daggegevens')

Merging the data


[Stage 16:=====>                                              (144 + 16) / 1377]

KeyboardInterrupt: 

In [15]:
# Read the silver table back for a manual check of what was written.
testdf = spark.read.load('s3a://silver-knmi/daggegevens', format='delta')



In [19]:
# Expose the silver data as the temporary view "test" so it can be queried with SQL.
testdf.createOrReplaceTempView(name='test')

In [None]:
# Inspect the key columns of the silver data through the "test" temp view.
# The original statement was cut off (misspelled column, no FROM clause) and
# could not be analysed by Spark.
spark.sql("select date, weather_station_code from test").show()