# Initiate spark

In [18]:
import os
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession 
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

conf = (
    SparkConf()
    .setAppName("Spark minIO Test")
    .set("spark.hadoop.fs.s3a.endpoint", "http://192.168.86.192:9000")
    .set("spark.hadoop.fs.s3a.access.key", os.getenv('MINIO_ROOT_USER'))
    .set("spark.hadoop.fs.s3a.secret.key", os.getenv('MINIO_ROOT_PASSWORD'))
    .set("spark.hadoop.fs.s3a.path.style.access", True)
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.driver.memory", "8g")
)
#sc = SparkContext(conf=conf).getOrCreate()
#sqlContext = SQLContext(sc)
spark = SparkSession(sc).builder.getOrCreate()

In [19]:
schema = (StructType([
    StructField("STN", StringType(), True),
    StructField("YYYYMMDD", StringType(), True),
    StructField("DDVEC", StringType(), True),
    StructField("FHVEC", StringType(), True),
    StructField("FG", StringType(), True),
    StructField("FHX", StringType(), True),
    StructField("FHXH", StringType(), True),
    StructField("FHN", StringType(), True),
    StructField("FHNH", StringType(), True),
    StructField("FXX", StringType(), True),
    StructField("FXXH", StringType(), True),
    StructField("TG", StringType(), True),
    StructField("TN", StringType(), True),
    StructField("TNH", StringType(), True),
    StructField("TX", StringType(), True),
    StructField("TXH", StringType(), True),
    StructField("T10N", StringType(), True),
    StructField("T10NH", StringType(), True),
    StructField("SQ", StringType(), True),
    StructField("SP", StringType(), True),
    StructField("Q", StringType(), True),
    StructField("DR", StringType(), True),
    StructField("RH", StringType(), True),
    StructField("RHX", StringType(), True),
    StructField("RHXH", StringType(), True),
    StructField("PG", StringType(), True),
    StructField("PX", StringType(), True),
    StructField("PXH", StringType(), True),
    StructField("PN", StringType(), True),
    StructField("PNH", StringType(), True),
    StructField("VVN", StringType(), True),
    StructField("VVNH", StringType(), True),
    StructField("VVX", StringType(), True),
    StructField("VVXH", StringType(), True),
    StructField("NG", StringType(), True),
    StructField("UG", StringType(), True),
    StructField("UX", StringType(), True),
    StructField("UXH", StringType(), True),
    StructField("UN", StringType(), True),
    StructField("UNH", StringType(), True),
    StructField("EV24", StringType(), True)
])
)

In [20]:
print(os.getenv('MINIO_ROOT_USER'))

minio


In [34]:
df = spark.read.csv('s3a://landing-knmi/daggegevens/2021*.csv', header=False, comment='#', schema=schema)

In [35]:
df.show(10)

+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|  STN|YYYYMMDD|DDVEC|FHVEC|   FG|  FHX| FHXH|  FHN| FHNH|  FXX| FXXH|   TG|   TN|  TNH|   TX|  TXH| T10N|T10NH|   SQ|   SP|    Q|   DR|   RH|  RHX| RHXH|   PG|   PX|  PXH|   PN|  PNH|  VVN| VVNH|  VVX| VVXH|   NG|   UG|   UX|  UXH|   UN|  UNH| EV24|
+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|  209|20210105|   60|   84|   84|  100|    1|   70|   23|  140|    7|     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |   

In [36]:
df.createOrReplaceTempView('daggegevens')

In [37]:
query = """
    select left(YYYYMMDD,4), count(*)
    from daggegevens
    group by left(YYYYMMDD,4)
    order by left(YYYYMMDD,4) desc
    limit 10

"""

sqlContext.sql(query).show()

+-----------------+--------+
|left(YYYYMMDD, 4)|count(1)|
+-----------------+--------+
|             2021|   10481|
+-----------------+--------+



In [41]:
#df.coalesce(1).write.partitionBy("YYYYMMDD").mode("overwrite").format("parquet").save('s3a://test-bucket/test-knmi-dag-perdag2')
df.coalesce(1).write.partitionBy("YYYYMMDD").format("delta").mode("overwrite").save('s3a://test-delta/knmidelta')

                                                                                

In [10]:
df.show()

+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|  STN|YYYYMMDD|DDVEC|FHVEC|   FG|  FHX| FHXH|  FHN| FHNH|  FXX| FXXH|   TG|   TN|  TNH|   TX|  TXH| T10N|T10NH|   SQ|   SP|    Q|   DR|   RH|  RHX| RHXH|   PG|   PX|  PXH|   PN|  PNH|  VVN| VVNH|  VVX| VVXH|   NG|   UG|   UX|  UXH|   UN|  UNH| EV24|
+-----+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|  209|20051206|  237|   34|   51|   80|   24|   30|   11|  110|   24|     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |     |   

In [25]:
from delta.tables import DeltaTable

In [26]:
test_knmi = DeltaTable.forPath(spark, 's3a://test-bucket/test-knmi-dag-delta')

In [28]:
test_knmi.create()

<delta.tables.DeltaTableBuilder at 0x7f4d10520790>

In [43]:
spark.sql(f"""
DROP TABLE IF EXISTS test_knmi
""")

spark.sql(f"""
CREATE TABLE test_knmi
USING DELTA
LOCATION "s3a://test-bucket/test-knmi-dag-delta"
""")

DataFrame[]

In [44]:
spark.sql("select YYYYMMDD, count(*) from test_knmi group by YYYYMMDD order by YYYYMMDD desc limit 10").show()

+--------+--------+
|YYYYMMDD|count(1)|
+--------+--------+
|20210807|      47|
|20210806|     141|
|20210805|      94|
|20210804|      94|
|20210803|      94|
|20210802|      94|
|20210801|      94|
|20210731|      94|
|20210730|      94|
|20210729|      94|
+--------+--------+

