In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import findspark
findspark.init()

import collections
import numpy as np
import pandas as pd
import datetime as dt
import os.path as osp
import matplotlib.pylab as plt
import seaborn as sns
import sklearn.preprocessing as prep
import sklearn.feature_selection as sfsel
import sklearn.tree as stree
import sklearn.ensemble as sensemble

from pyspark import keyword_only
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Create (or reuse) the notebook's Spark session: local master using "local[*]"
# and the driver memory setting fixed at "32g".
sess = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "32g")
    .getOrCreate()
)

In [5]:
# Load the Parquet dataset into a Spark DataFrame, then show its schema and
# print how many rows it holds.
test_df = sess.read.format("parquet").load(
    "./stgt_sensors_with_date_geo_dwd_series.parquet/"
)
test_df.printSchema()
print(test_df.count())

root
 |-- hash: string (nullable = true)
 |-- start_year: integer (nullable = true)
 |-- weekend: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- holiday: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- day_of_week: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- day_num: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- time_num: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- temperature: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- dwd_temperature: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- humidity: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- dwd_precipitation_height: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- pressure: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- dwd_wind_spe

In [8]:
# Distinct string lengths occurring in the `hash` column, returned as a
# one-column ("l") pandas frame.
test_df.select(length("hash").alias("l")).distinct().toPandas()

Unnamed: 0,l
0,12


In [19]:
# Column names of the loaded frame, in schema order.
test_df.schema.fieldNames()

['hash',
 'start_year',
 'weekend',
 'holiday',
 'day_of_week',
 'day_num',
 'time_num',
 'temperature',
 'dwd_temperature',
 'humidity',
 'dwd_precipitation_height',
 'pressure',
 'dwd_wind_speed',
 'dwd_wind_direction',
 'dwd_clouds',
 'dwd_sun',
 'P1',
 'P2']

In [36]:
# Preview the data grouped by the first five characters of `hash`: keep the
# largest `start_year` per group and gather each row's array column into a
# list of arrays.
#
# Notes:
# - Spark's `substring` is 1-based; the previous start position of 0 only
#   happened to behave like 1, so the position is now spelled out explicitly.
# - `max` is the Spark aggregate pulled in by the star import of
#   pyspark.sql.functions, not the Python builtin.
# - `day_of_week` and `time_num` are intentionally left out, exactly as before.
# - `limit(1)` is applied without an ordering, so which group is shown is not
#   guaranteed to be the same on every run.
(
    test_df
    .withColumn("hash_bin", substring("hash", 1, 5))
    .groupBy("hash_bin")
    .agg(
        max('start_year').alias("start_year"),
        # One collect_list per series column, each keeping its original name.
        *[
            collect_list(series_name).alias(series_name)
            for series_name in (
                'weekend',
                'holiday',
                'day_num',
                'temperature',
                'dwd_temperature',
                'humidity',
                'dwd_precipitation_height',
                'pressure',
                'dwd_wind_speed',
                'dwd_wind_direction',
                'dwd_clouds',
                'dwd_sun',
                'P1',
                'P2',
            )
        ]
    )
    .limit(1)
    .toPandas()
)

Unnamed: 0,hash_bin,start_year,weekend,holiday,day_num,temperature,dwd_temperature,humidity,dwd_precipitation_height,pressure,dwd_wind_speed,dwd_wind_direction,dwd_clouds,dwd_sun,P1,P2
0,u0wsj,2017,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0.6495535714285714, 0.6510416666666669, 0.65...","[[25.166666666666668, 25.8, 26.183333333333337...","[[21.899999999999995, 21.900000000000002, 21.9...","[[40.800000000000004, 40.15, 38.5, 39.31666666...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[nan, nan, nan, nan, nan, nan, nan, nan, nan,...","[[3.7000000000000006, 3.6999999999999997, 3.69...","[[260.0, 260.0, 260.0, 260.0, 250.0, 250.0, 25...","[[7.0, 7.0, 7.0, 7.0, 6.0, 6.0, 6.0, 6.0, 3.0,...","[[37.0, 37.0, 37.0, 37.0, 47.0, 47.0, 47.0, 47...","[[17.776666666666667, 26.673333333333336, 11.6...","[[10.623333333333335, 12.594999999999999, 4.87..."


In [33]:
# Peek at a bounded number of rows only. Converting the whole frame would pull
# every array-valued column of every row onto the driver and dump it all into
# the notebook output; limiting first keeps the preview cheap.
test_df.limit(10).toPandas()

Unnamed: 0,hash,start_year,weekend,holiday,day_of_week,day_num,time_num,temperature,dwd_temperature,humidity,dwd_precipitation_height,pressure,dwd_wind_speed,dwd_wind_direction,dwd_clouds,dwd_sun,P1,P2
0,u0wsyx2s1j08,2017,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[0.28199404761904756, 0.28348214285714285, 0.2...","[0.9739583333333335, 0.984375, 0.9947916666666...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[8.200000000000001, 8.200000000000001, 8.20000...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[4.3999999999999995, 4.3999999999999995, 4.399...","[100.0, 100.0, 100.0, 130.0, 130.0, 130.0, 130...","[7.0, 7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 3.0, 3.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[27.560833333333335, 32.675000000000004, 43.64...","[18.695833333333336, 20.548571428571428, 23.77..."
1,u0wt2ryn72xj,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[0.671875, 0.673363095238089, 0.67485119047619...","[0.703125, 0.7135416666666732, 0.7239583333333...","[22.5, 22.699999999999832, 22.819230769230742,...","[20.299999999999972, 20.2000000000002, 20.2000...","[60.86666666666662, 60.73000000000012, 59.6576...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1.0999999999999992, 1.1000000000000132, 1.100...","[170.0, 150.0, 150.0, 150.0, 150.0, 110.0, 110...","[8.0, 8.0, 8.0, 8.0, 8.0, 6.0, 6.0, 6.0, 6.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 20.0, 20.0, 20.0, 20...","[9.380000000000003, 6.920476190476227, 7.26222...","[6.405000000000006, 5.202380952380942, 5.06888..."
2,u0wt3q71zdcj,2016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, ...","[0.6436011904761901, 0.6450892857142856, 0.646...","[0.5052083333333336, 0.515625, 0.5260416666666...","[5.699999999999998, 5.649999999999996, 5.80555...","[3.5, 3.5, 3.5, 3.5, 3.7999999999999763, 3.799...","[44.799999999999976, 44.56250000000021, 44.766...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[3.0999999999999988, 3.1000000000000143, 3.099...","[130.0, 130.0, 130.0, 130.0, 110.0, 110.0, 110...","[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 7.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 38.0,...","[nan, 37.55111111111111, 39.41631578947387, 39...","[nan, 17.054444444444496, 17.901052631578988, ..."
3,u0wt5xnbyfmh,2017,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[0.3117559523809524, 0.31324404761904756, 0.31...","[0.18229166666666666, 0.19270833333333334, 0.2...","[20.0, 14.484615384615381, 5.8066666666666675,...","[3.5, 3.5, 3.5, 3.1999999999999993, 3.2, 3.2, ...","[40.199999999999996, 43.14615384615385, 73.26,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[1.4000000000000001, 1.4, 1.3999999999999997, ...","[190.0, 190.0, 190.0, 160.0, 160.0, 160.0, 160...","[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.8883333333333336, 11.128461538461538, 14.90...","[2.686666666666667, 9.886923076923077, 12.826,..."
4,u0wkx07nkxj7,2017,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.10193452380952382, 0.10342261904761905, 0.1...","[0.7135416666666667, 0.7239583333333333, 0.734...","[28.028571428571432, 27.000000000000004, 22.25...","[14.0, 14.0, 14.0, 14.0, 13.5, 13.5, 13.5, 13....","[36.25714285714286, 38.68749999999999, 40.3333...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[4.6000000000000005, 4.6000000000000005, 4.600...","[340.0, 340.0, 340.0, 340.0, 350.0, 350.0, 350...","[2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 7.0, ...","[9.0, 9.0, 9.0, 9.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, 4.785, 5.58153846153...","[nan, nan, nan, nan, nan, 3.66, 4.863076923076..."
5,u0wmj205mnxm,2017,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[0.6555059523809524, 0.6569940476190476, 0.658...","[0.5885416666666666, 0.5989583333333334, 0.609...","[35.15, 35.06666666666667, 35.166666666666664,...","[28.400000000000002, 28.400000000000002, 28.40...","[31.849999999999998, 32.650000000000006, 32.6,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[4.6000000000000005, 4.6000000000000005, 4.600...","[260.0, 260.0, 260.0, 260.0, 250.0, 250.0, 250...","[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[60.0, 60.0, 60.0, 60.0, 56.0, 56.0, 56.0, 56....","[5.3133333333333335, 6.086666666666667, 5.7716...","[4.405, 4.416666666666667, 4.4433333333333325,..."
6,u0wmp82r05e0,2017,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...","[0.953125, 0.954613095238095, 0.95610119047619...","[0.671875, 0.6822916666666666, 0.6927083333333...","[9.816666666666666, 9.286666666666665, 8.94375...","[7.3, 7.299999999999998, 7.299999999999998, 7....","[53.449999999999996, 55.16, 56.506249999999994...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[3.1999999999999997, 3.2000000000000006, 3.200...","[220.0, 220.0, 220.0, 220.0, 230.0, 230.0, 230...","[7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, ...","[23.0, 23.0, 23.0, 23.0, 2.0, 2.0, 2.0, 2.0, 0...","[12.038333333333334, 12.121999999999998, 12.71...","[10.186666666666667, 10.136666666666665, 10.35..."
7,u0wmr07cegr1,2017,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[0.5691964285714287, 0.5706845238095238, 0.572...","[0.984375, 0.9947916666666665, 0.0052083333333...","[16.88, 16.958333333333332, 16.642857142857146...","[11.299999999999999, 11.299999999999999, 11.79...","[62.315, 61.905, 62.36714285714286, 61.8016666...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[94930.19, 94943.46500000001, 94974.7585714285...","[0.7999999999999999, 0.7999999999999999, 1.400...","[300.0, 300.0, 310.0, 310.0, 310.0, 310.0, 250...","[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[3.16, 3.31, 2.974285714285714, 3.650000000000...","[2.9333333333333336, 2.953333333333333, 2.6857..."
8,u0wqx3z9bc9d,2017,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0, ...","[0.9486607142857143, 0.9501488095238094, 0.951...","[0.640625, 0.6510416666666666, 0.6614583333333...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[25.8, 25.8, 25.8, 26.0, 26.0, 26.0, 26.0, 26....","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[3.9, 3.9, 3.9, 3.9, 3.9, 3.9, 3.8999999999999...","[80.0, 80.0, 80.0, 90.0, 90.0, 90.0, 90.0, 80....","[3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, ...","[60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60....","[4.6933333333333325, 4.895, 4.278333333333333,...","[1.5116666666666667, 1.6733333333333331, 1.884..."
9,u0wt17ksq4hs,2017,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...","[0.7938988095238095, 0.7953869047619048, 0.796...","[0.5572916666666666, 0.5677083333333334, 0.578...","[24.700000000000003, 24.599999999999998, 22.41...","[19.1, 19.099999999999998, 19.099999999999998,...","[44.1, 45.199999999999996, 49.633333333333326,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[3.9, 3.9, 3.9, 5.3999999999999995, 5.39999999...","[310.0, 310.0, 310.0, 300.0, 300.0, 300.0, 300...","[7.0, 7.0, 7.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...","[42.0, 42.0, 42.0, 44.0, 44.0, 44.0, 44.0, 38....","[5.835, 6.333333333333333, 4.421666666666667, ...","[2.135, 1.92, 1.8399999999999999, 1.7433333333..."
