In [2]:
%matplotlib inline

import findspark
findspark.init()

import numpy as np
import builtins
import math
import datetime as dt
import holidays
import geohash
import os.path as osp
import matplotlib.pylab as plt
import seaborn as sns

from pyspark import keyword_only
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
# Build (or reuse) the Spark session for this notebook: local master using
# all available cores ("local[*]") and a 32g driver memory setting.
sess = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "32g")
    .getOrCreate()
)

In [4]:
# Load the raw sensor readings (already joined with the date, geohash and DWD
# structs, per the file name) and report their schema and total row count.
df = sess.read.parquet(
    './stgt_sensors_with_date_geo_dwd.parquet'
)

df.printSchema()
print(
    df.count()
)

root
 |-- sensor_id: integer (nullable = true)
 |-- sensor_type: string (nullable = true)
 |-- location: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- P1: double (nullable = true)
 |-- durP1: double (nullable = true)
 |-- ratioP1: double (nullable = true)
 |-- P2: double (nullable = true)
 |-- durP2: double (nullable = true)
 |-- ratioP2: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- altitude: double (nullable = true)
 |-- pressure_sealevel: double (nullable = true)
 |-- datedim: struct (nullable = true)
 |    |-- year: integer (nullable = true)
 |    |-- month: integer (nullable = true)
 |    |-- day: integer (nullable = true)
 |    |-- day_of_week: integer (nullable = true)
 |    |-- weekend: integer (nullable = true)
 |    |-- holiday: integer (nullable = true)
 |    |-- day_cat: 

In [7]:
# Number of SDS011 readings that actually carry a P1 value.
(
    df.where("sensor_type = 'SDS011'")
      .where("P1 is not null")
      .count()
)

199583741

In [10]:
# Aggregate every non-PPD42NS reading per (geohash, year, month, day, time_bin)
# and persist the result as parquet.
#
# NOTE: `max` and `avg` below are the pyspark.sql.functions versions brought in
# by the star import at the top of the notebook, not the Python builtins.

# Grouping keys: one output row per location hash and time bin.
bin_keys = [
    "geohash.hash",
    "datedim.year",
    "datedim.month",
    "datedim.day",
    "datedim.time_bin",
]

# (aggregate function, source column, output column), in output order.
bin_aggregates = [
    (max, "datedim.weekend", "weekend"),
    (max, "datedim.holiday", "holiday"),
    (max, "datedim.day_of_week", "day_of_week"),
    (avg, "datedim.day_num", "day_num"),
    (avg, "datedim.time_num", "time_num"),
    (avg, "temperature", "temperature"),
    (avg, "dwddim.TT_TU", "dwd_temperature"),
    (avg, "humidity", "humidity"),
    (avg, "dwddim.R1", "dwd_precipitation_height"),
    (max, "dwddim.RS_IND", "dwd_precipitation_indicator"),
    (avg, "pressure", "pressure"),
    (avg, "dwddim.F", "dwd_wind_speed"),
    (avg, "dwddim.D", "dwd_wind_direction"),
    (avg, "dwddim.V_N", "dwd_clouds"),
    (avg, "dwddim.SD_SO", "dwd_sun"),
    (avg, "P1", "P1"),
    (avg, "durP1", "durP1"),
    (avg, "ratioP1", "ratioP1"),
    (avg, "P2", "P2"),
    (avg, "durP2", "durP2"),
    (avg, "ratioP2", "ratioP2"),
]

(
    df.where("sensor_type != 'PPD42NS'")
      .groupBy(*bin_keys)
      .agg(*[fn(source).alias(target) for fn, source, target in bin_aggregates])
      .write
      .mode("overwrite")
      .parquet("./stgt_sensors_with_date_geo_dwd_aggregated.parquet")
)

In [11]:
# Read the per-(hash, time bin) aggregates back from disk and report their
# schema and row count.
agg_df = sess.read.parquet(
    "./stgt_sensors_with_date_geo_dwd_aggregated.parquet/"
)

agg_df.printSchema()
print(
    agg_df.count()
)

root
 |-- hash: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- time_bin: integer (nullable = true)
 |-- weekend: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- day_num: double (nullable = true)
 |-- time_num: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- dwd_temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- dwd_precipitation_height: double (nullable = true)
 |-- dwd_precipitation_indicator: long (nullable = true)
 |-- pressure: double (nullable = true)
 |-- dwd_wind_speed: double (nullable = true)
 |-- dwd_wind_direction: double (nullable = true)
 |-- dwd_clouds: double (nullable = true)
 |-- dwd_sun: double (nullable = true)
 |-- P1: double (nullable = true)
 |-- durP1: double (nullable = true)
 |-- ratioP1: double (nullable = true)
 |-- P2: double (nullable = true)


In [13]:
# Preview ten aggregated rows as a pandas frame, after filling nulls with NaN.
(
    agg_df.na.fill(np.nan)
          .limit(10)
          .toPandas()
)

Unnamed: 0,hash,year,month,day,time_bin,weekend,holiday,day_of_week,day_num,time_num,...,dwd_wind_speed,dwd_wind_direction,dwd_clouds,dwd_sun,P1,durP1,ratioP1,P2,durP2,ratioP2
0,u0wmr3z80y9e,2017,10,23,91,0,0,0,0.136161,0.953125,...,1.5,230.0,7.0,,9.888333,,,8.390556,,
1,u0wmr3z80y9e,2017,10,5,72,0,0,3,0.536458,0.755208,...,3.8,240.0,8.0,0.0,10.201333,,,7.841,,
2,u0wmr3z80y9e,2017,10,5,87,0,0,3,0.55878,0.911458,...,3.5,280.0,4.0,,69.945165,,,40.930549,,
3,u0wmr3z80y9e,2017,10,16,35,0,0,0,0.052827,0.369792,...,0.4,100.0,0.0,60.0,17.678571,,,11.736154,,
4,u0wmr3z80y9e,2017,10,26,84,0,0,3,0.554315,0.880208,...,0.9,230.0,7.0,,19.349474,,,13.293509,,
5,u0wmr3z80y9e,2017,10,30,90,0,0,0,0.134673,0.942708,...,1.5,210.0,2.0,,92.963956,,,48.198791,,
6,u0wmr3z80y9e,2017,12,7,24,0,0,3,0.46503,0.255208,...,1.3,90.0,2.0,0.0,74.432198,,,39.344176,,
7,u0wmr3z80y9e,2017,11,5,27,1,0,6,0.898065,0.286458,...,1.0,130.0,8.0,0.0,100.775444,,,58.488222,,
8,u0wmr3z80y9e,2017,11,5,61,1,0,6,0.948661,0.640625,...,7.3,260.0,8.0,0.0,23.882198,,,15.01,,
9,u0wmr3z80y9e,2017,11,7,82,0,0,1,0.265625,0.859375,...,3.6,340.0,8.0,0.0,36.61956,,,23.053626,,


In [14]:
# Collapse the per-time-bin aggregates into one row per location hash, where
# every measurement column becomes a chronologically ordered list, and persist
# the result as parquet.
#
# Why not `orderBy(...).groupBy(...).agg(collect_list(...))`?
#   * The order of elements produced by `collect_list` depends on the order in
#     which rows reach the aggregation after the shuffle, so a preceding
#     `orderBy` does not guarantee chronologically ordered lists.
#   * Independent `collect_list` calls each skip their own nulls, so the lists
#     of one hash could end up with different lengths / misaligned positions.
# Instead, a single struct per row is collected (keeping all columns of a row
# together), the array of structs is sorted on its leading time-key fields,
# and each measurement is then projected out of the sorted array.

# Sort keys. agg_df was produced by grouping on (hash, year, month, day,
# time_bin), so within one hash these four fields identify a row and fully
# determine the ordering.
series_time_keys = ["year", "month", "day", "time_bin"]

# Columns turned into per-hash lists, in output order.
series_columns = [
    "weekend",
    "holiday",
    "day_of_week",
    "day_num",
    "time_num",
    "temperature",
    "dwd_temperature",
    "humidity",
    "dwd_precipitation_height",
    "pressure",
    "dwd_wind_speed",
    "dwd_wind_direction",
    "dwd_clouds",
    "dwd_sun",
    "P1",
    "P2",
]

# NOTE: `min` is pyspark.sql.functions.min (star import), not the builtin.
# Nulls are replaced with NaN first (as in the original cell) so that missing
# measurements appear as NaN entries in the lists.
(
    agg_df.na.fill(np.nan)
          .groupBy("hash")
          .agg(min("year").alias("start_year"),
               # Structs sort field by field, so placing the time keys first
               # yields chronological order.
               sort_array(
                   collect_list(struct(*(series_time_keys + series_columns)))
               ).alias("samples"))
          # Selecting a field of an array<struct> yields an array of that
          # field, preserving the sorted element order.
          .select("hash",
                  "start_year",
                  *[col("samples." + name).alias(name) for name in series_columns])
          .write
          .mode("overwrite")
          .parquet("./stgt_sensors_with_date_geo_dwd_series.parquet/")
)

In [15]:
# Read the per-hash series back from disk and report schema and row count.
test_df = sess.read.parquet(
    "./stgt_sensors_with_date_geo_dwd_series.parquet/"
)

test_df.printSchema()
print(
    test_df.count()
)

root
 |-- hash: string (nullable = true)
 |-- start_year: integer (nullable = true)
 |-- weekend: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- holiday: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- day_of_week: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- day_num: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- time_num: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- temperature: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- dwd_temperature: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- humidity: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- dwd_precipitation_height: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- pressure: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- dwd_wind_spe

In [16]:
# Attach the lengths of three of the collected series and keep only hashes
# whose P1 series is longer than 1e4 entries.
test_df = (
    test_df
    .withColumn("N_P1", size("P1"))
    .withColumn("N_temp", size("temperature"))
    .withColumn("N_dwd_temperature", size("dwd_temperature"))
    # "%d" formats the float 1e4 as the integer literal 10000.
    .where("N_P1 > %d" % 1e4)
)

In [19]:
# Bring ten of the retained series (ordered by start year) into pandas for
# inspection.
pd_df = (
    test_df.orderBy("start_year")
           .limit(10)
           .toPandas()
)
pd_df

Unnamed: 0,hash,start_year,weekend,holiday,day_of_week,day_num,time_num,temperature,dwd_temperature,humidity,...,pressure,dwd_wind_speed,dwd_wind_direction,dwd_clouds,dwd_sun,P1,P2,N_P1,N_temp,N_dwd_temperature
0,u0wt91nph3t9,2016,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, ...","[0.8370535714285708, 0.838541666666669, 0.8400...","[0.859375, 0.8697916666666695, 0.8802083333333...","[21.999999999999982, 21.59285714285708, 18.139...","[8.600000000000012, 8.59999999999994, 7.299999...","[55.80000000000003, 49.042857142857436, 50.839...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[0.7000000000000008, 0.6999999999999945, 1.199...","[160.0, 160.0, 150.0, 150.0, 150.0, 150.0, 170...","[0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, nan, nan, nan, nan, nan, nan, nan, ...","[21.20000000000002, 25.753333333333497, 35.071...","[8.635000000000003, 13.836666666666623, 18.938...",40597,40597,40597
1,u0wx71c4gb5s,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[0.6421130952380976, 0.6436011904761962, 0.645...","[0.49479166666666746, 0.505208333333335, 0.515...","[8.799999999999999, 7.487499999999991, 6.72499...","[6.299999999999985, 6.900000000000047, 6.90000...","[61.3999999999999, 67.31249999999994, 71.05000...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[0.5999999999999988, 0.6999999999999953, 0.699...","[250.0, 60.0, 60.0, 60.0, 60.0, 90.0, 90.0, 90...","[7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, ...","[5.0, 12.0, 12.0, 12.0, 12.0, 2.0, 2.0, 2.0, 2...","[31.265000000000047, 27.178666666666533, 10.17...","[19.321666666666605, 18.013999999999967, 8.400...",14559,14559,14559
2,u0wt7c1ysp7g,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, ...","[0.41889880952381, 0.4203869047619035, 0.42187...","[0.9322916666666677, 0.9427083333333348, 0.953...","[15.250000000000009, 11.377777777777784, 6.319...","[3.0, 3.0, 3.0, 2.100000000000005, 2.100000000...","[45.816666666666684, 52.71111111111114, 65.900...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1.2999999999999987, 1.3000000000000045, 1.300...","[100.0, 100.0, 100.0, 160.0, 160.0, 160.0, 160...","[0.0, 0.0, 0.0, 3.0, 3.0, 3.0, 3.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[29.142857142857114, 138.89733333333314, 155.8...","[23.90285714285711, 69.13666666666668, 87.3494...",29414,29414,29414
3,u0wt2r7hy32m,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.11086309523809502, 0.11235119047618992, 0.1...","[0.7760416666666677, 0.7864583333333288, 0.796...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[13.20000000000003, 13.199999999999884, 12.899...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1.3000000000000012, 1.2999999999999956, 1.399...","[210.0, 210.0, 170.0, 170.0, 170.0, 170.0, 170...","[7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[47.554999999999964, 31.473749999999793, 26.11...","[26.477500000000077, 17.70500000000004, 14.729...",13599,13599,13599
4,u0wt2vf2reen,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[0.359375, 0.360863095238099, 0.36235119047619...","[0.515625, 0.5260416666666639, 0.5364583333333...","[8.800000000000004, 9.481818181818191, 10.2749...","[7.400000000000004, 7.4000000000000545, 7.4000...","[64.09999999999997, 62.11818181818212, 59.6333...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[0.4000000000000002, 0.4000000000000031, 0.400...","[140.0, 140.0, 140.0, 90.0, 90.0, 90.0, 90.0, ...","[8.0, 8.0, 8.0, 6.0, 6.0, 6.0, 6.0, 4.0, 4.0, ...","[0.0, 0.0, 0.0, 8.0, 8.0, 8.0, 8.0, 60.0, 60.0...","[102.17000000000004, 77.09066666666722, 73.599...","[39.17999999999999, 34.61133333333335, 33.5008...",10097,10097,10097
5,u0wt3pg0rnpp,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...","[0.10788690476190499, 0.109375, 0.110863095238...","[0.7552083333333316, 0.765625, 0.7760416666666...","[5.600000000000004, 5.6666666666666865, 5.6874...","[-0.19999999999999948, -0.20000000000000115, -...","[54.65000000000007, 54.78333333333331, 54.0124...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[3.1999999999999917, 3.2000000000000184, 3.200...","[170.0, 170.0, 170.0, 170.0, 150.0, 150.0, 150...","[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[45.639999999999965, 46.829999999999906, 50.03...","[18.69000000000003, 18.987499999999955, 20.339...",11564,11564,11564
6,u0wt5kwvne50,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[0.6540178571428558, 0.6555059523809531, 0.656...","[0.578125, 0.5885416666666662, 0.5989583333333...","[nan, 8.783333333333339, 9.022222222222211, 8....","[3.8000000000000065, 4.400000000000003, 4.4000...","[nan, 51.64999999999998, 51.26666666666646, 51...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[2.9000000000000017, 1.8999999999999915, 1.899...","[110.0, 120.0, 120.0, 120.0, 120.0, 100.0, 100...","[8.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, ...","[0.0, 38.0, 38.0, 38.0, 38.0, 56.0, 56.0, 56.0...","[38.559999999999995, 41.058, 38.43624999999996...","[19.44000000000001, 20.218999999999962, 19.578...",33542,33542,33542
7,u0wtdcbqk5py,2016,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...","[0.7924107142857115, 0.7938988095238037, 0.795...","[0.546875, 0.557291666666665, 0.56770833333333...","[nan, 9.099999999999996, 4.840000000000021, 2....","[0.5999999999999988, 0.5999999999999971, 0.599...","[nan, 32.35000000000001, 42.24000000000008, 49...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1.0999999999999968, 1.1000000000000045, 1.100...","[100.0, 100.0, 100.0, 100.0, 60.0, 60.0, 60.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60....","[17.564000000000004, 17.706363636363687, 17.93...","[13.641999999999962, 14.494545454545523, 13.96...",10357,10357,10357
8,u0wmtd2zuf7q,2016,"[1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 5, 5, 2, 2, 2, 3, 3, 5, 5, 5, 5, 6, 6, 6, ...","[0.8072916666666661, 0.8087797619047612, 0.817...","[0.6510416666666665, 0.6614583333333336, 0.723...","[26.600000000000016, 26.600000000000016, 26.39...","[15.700000000000019, 15.700000000000019, 14.79...","[44.799999999999976, 44.700000000000024, 44.90...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[3.4000000000000035, 3.4000000000000035, 3.599...","[220.0, 220.0, 220.0, 40.0, 40.0, 30.0, 350.0,...","[8.0, 8.0, 6.0, 2.0, 3.0, 5.0, 8.0, 8.0, 8.0, ...","[0.0, 0.0, 1.0, 53.0, 59.0, 48.0, 0.0, 0.0, 0....","[7.430000000000005, 5.449999999999998, 4.81999...","[5.979999999999999, 5.069999999999996, 3.37000...",34434,34434,34434
9,u0wtt09vtdqf,2016,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, ...","[0.985863095238095, 0.9873511904761907, 0.9888...","[0.9010416666666664, 0.9114583333333317, 0.921...","[nan, 15.144444444444405, 15.149999999999991, ...","[14.400000000000004, 14.400000000000034, 14.30...","[nan, 94.5333333333334, 95.19999999999993, 94....",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[2.3, 2.300000000000006, 2.5, 2.5, 2.5, 2.5, 2...","[310.0, 310.0, 290.0, 290.0, 290.0, 290.0, 280...","[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[62.01999999999999, 60.03571428571418, 68.1672...","[34.22000000000001, 33.874285714285726, 36.789...",16199,16199,16199
