# Initialize Spark

In [1]:
import os
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Spark configuration for this job: S3A access to the MinIO endpoint plus the
# Delta Lake log store, SQL extension and catalog.
conf = (
    SparkConf()
    .setAppName("knmi daily weather to bronze")
    .set("spark.hadoop.fs.s3a.endpoint", "http://192.168.86.192:9000")
    # os.environ[...] raises KeyError when a credential variable is missing;
    # os.getenv would hand back None, which is not a usable credential.
    .set("spark.hadoop.fs.s3a.access.key", os.environ['MINIO_ROOT_USER'])
    .set("spark.hadoop.fs.s3a.secret.key", os.environ['MINIO_ROOT_PASSWORD'])
    .set("spark.hadoop.fs.s3a.path.style.access", True)
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.driver.memory", "8g")
    .set("spark.executor.memory", "8g")
    .set("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")
    .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# getOrCreate is a classmethod: calling it on the class reuses a running
# context, so this cell can be re-executed. Calling the SparkContext constructor
# directly fails as soon as a context already exists in the kernel.
sc = SparkContext.getOrCreate(conf=conf)

# Obtain the session through the builder instead of instantiating SparkSession by hand.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

22/12/20 21:20:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/20 21:20:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Column names of one KNMI daily record, in the order the schema declares them.
knmi_columns = [
    "STN", "YYYYMMDD",
    "DDVEC", "FHVEC", "FG", "FHX", "FHXH", "FHN", "FHNH", "FXX", "FXXH",
    "TG", "TN", "TNH", "TX", "TXH", "T10N", "T10NH",
    "SQ", "SP", "Q",
    "DR", "RH", "RHX", "RHXH",
    "PG", "PX", "PXH", "PN", "PNH",
    "VVN", "VVNH", "VVX", "VVXH",
    "NG",
    "UG", "UX", "UXH", "UN", "UNH",
    "EV24",
]

# Every column is declared as a nullable string.
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in knmi_columns]
)

In [19]:
# Read all landed files with the explicit schema; the files carry no header row
# and lines starting with '#' are treated as comments.
landing_path = 's3a://landing-knmi/daggegevens/*'
df = spark.read.csv(
    path=landing_path,
    schema=schema,
    header=False,
    comment='#',
)



In [20]:
from pyspark.sql.functions import current_timestamp

# Add a load timestamp column to every row, then preview the first three rows.
df = df.withColumn(colName='load_datetime', col=current_timestamp())
df.show(n=3)

+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--------------------+
|  STN|YYYYMMDD|DDVEC|FHVEC|   FG|  FHX| FHXH|  FHN| FHNH|  FXX| FXXH|   TG|   TN|  TNH|   TX|  TXH| T10N|T10NH|   SQ|   SP|    Q|   DR|   RH|  RHX| RHXH|   PG|   PX|  PXH|   PN|  PNH|  VVN| VVNH|  VVX| VVXH|   NG|   UG|   UX|  UXH|   UN|  UNH| EV24|       load_datetime|
+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+--------------------+
|  209|20210807|  202|   74|   78|  120|    1|   60|   11|  150|    1|     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |

In [21]:
# Register the de-duplicated rows as a temp view so they can be queried with SQL.
daggegevens_distinct = df.dropDuplicates()
daggegevens_distinct.createOrReplaceTempView('daggegevens')

In [24]:
# Kept for reference — presumably the initial full load that created the bronze table (TODO confirm):
#df.dropDuplicates().coalesce(4).write.format("delta").mode("overwrite").option("mergeSchema", "true").save('s3a://bronze-knmi/daggegevens')

# De-duplicate on the merge key (STN, YYYYMMDD) rather than on all columns.
# An insert-only merge inserts every unmatched source row, so two source rows
# that share a key but differ in any other column would both land in the target.
# NOTE: which of several rows for one key is kept is not defined.
df_deduplicated = df.dropDuplicates(['STN', 'YYYYMMDD'])

In [7]:
# Import only the name that is used instead of a wildcard import.
from delta.tables import DeltaTable

# Handle to the existing bronze Delta table; forPath requires the table to exist at this location.
deltaTable = DeltaTable.forPath(spark, "s3a://bronze-knmi/daggegevens")

In [25]:
# Insert-only merge: source rows whose (STN, YYYYMMDD) is not yet present in the
# bronze table are inserted; rows that match an existing key are left untouched.
merge_condition = 'knmi_bronze_daggegevens.STN = new_data.STN AND knmi_bronze_daggegevens.YYYYMMDD = new_data.YYYYMMDD'

merge_builder = deltaTable.alias('knmi_bronze_daggegevens').merge(
    df_deduplicated.alias('new_data'),
    merge_condition,
)
merge_builder.whenNotMatchedInsertAll().execute()

                                                                                

In [23]:
# Spot check: list the stations present in the temp view for one day, ordered by station.
inspection_query = '''

select STN, YYYYMMDD, load_datetime from daggegevens where YYYYMMDD = '20220101' order by STN

'''
spark.sql(inspection_query).show(100)

[Stage 38:>                                                       (0 + 16) / 16][Stage 38:===>                                                    (1 + 15) / 16]

+-----+--------+--------------------+
|  STN|YYYYMMDD|       load_datetime|
+-----+--------+--------------------+
|  209|20220101|2022-12-20 22:03:...|
|  215|20220101|2022-12-20 22:03:...|
|  225|20220101|2022-12-20 22:03:...|
|  235|20220101|2022-12-20 22:03:...|
|  240|20220101|2022-12-20 22:03:...|
|  242|20220101|2022-12-20 22:03:...|
|  248|20220101|2022-12-20 22:03:...|
|  249|20220101|2022-12-20 22:03:...|
|  251|20220101|2022-12-20 22:03:...|
|  257|20220101|2022-12-20 22:03:...|
|  258|20220101|2022-12-20 22:03:...|
|  260|20220101|2022-12-20 22:03:...|
|  267|20220101|2022-12-20 22:03:...|
|  269|20220101|2022-12-20 22:03:...|
|  270|20220101|2022-12-20 22:03:...|
|  273|20220101|2022-12-20 22:03:...|
|  275|20220101|2022-12-20 22:03:...|
|  277|20220101|2022-12-20 22:03:...|
|  278|20220101|2022-12-20 22:03:...|
|  279|20220101|2022-12-20 22:03:...|
|  280|20220101|2022-12-20 22:03:...|
|  283|20220101|2022-12-20 22:03:...|
|  285|20220101|2022-12-20 22:03:...|
|  286|20220

                                                                                