# NOAA : Calcule des pluviométries précédentes

Dataset :
- le resultat précédent

Exercice :
- Créer pour chaque ligne des colonnes contenant la (moyenne,somme,max) des 5, 10, 20 jours précédants

(PAYS, Date, avg_prcp, sum_rain) ajouter (avg_rain_last_5_days, avg_rain_last_10_days, sum_rain_last_5_days, max_rain_last_5_days)




Aide : https://kevinvecmanis.io/pyspark/data%20science/python/2019/06/02/SPX-Analysis-With-PySpark.html

In [2]:
#!hdfs dfs -ls /demo/noaa/noaa-raw-data

In [12]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType # Spark Date type
from pyspark.sql.window import Window
import shutil

In [4]:
DATASET_FOLDER = '/stagiaire/430-Benoit/noaa/daily_rain_by_country_ISO3'
SPARK_MASTER = 'spark://localhost:7077'
APP_NAME = 'NOAA Window - 430'

noaa_csv_path = DATASET_FOLDER 
output = '/stagiaire/votre_dossier/noaa/daily_rain_by_country_window'

# Create Spark session
spark = SparkSession.builder.master(SPARK_MASTER).appName(APP_NAME).getOrCreate()


In [10]:
df_full = spark.read.format('csv').option('header',True).option('multiLine', True).load(noaa_csv_path)\
.toDF('COUNTRY', 'country_ISO3', 'date', 'avg_rain', 'sum_rain', 'max_rain', 'cnt'
)
df_full.show(3)

+-------+------------+----------+------------------+--------+--------+---+
|COUNTRY|country_ISO3|      date|          avg_rain|sum_rain|max_rain|cnt|
+-------+------------+----------+------------------+--------+--------+---+
|     BE|         BEL|1980-05-19|               0.0|     0.0|       0|  1|
|     FR|         FRA|1980-05-19|14.136363636363637|  1244.0|     216| 88|
|     RI|         SRB|1980-05-19|45.333333333333336|   136.0|     132|  3|
+-------+------------+----------+------------------+--------+--------+---+
only showing top 3 rows



In [14]:
##############################################################################
###### Cast Type
print('Cast string to float')
df_full = df_full.withColumn("avg_rain", df_full["avg_rain"].cast(FloatType()))\
    .withColumn("sum_rain", df_full["sum_rain"].cast(FloatType())) \
    .withColumn("max_rain", df_full["max_rain"].cast(FloatType()))
print('Cast date to timestamp')
df_full = df_full.withColumn("date_with_time", F.to_timestamp(df_full.date, 'yyyy-MM-dd'))

print('Helper function to compute number of second in a day')
days = lambda d:d*24*60*60

Cast string to float
Cast date to timestamp
Helper function to compute number of second in a day


In [15]:

#############################################################################
##### Compute new features
# Thanks to  https://kevinvecmanis.io/pyspark/data%20science/python/2019/06/02/SPX-Analysis-With-PySpark.html
print('Define a window to make computation')
windowSpec = Window.partitionBy(['country_ISO3']).orderBy(F.col("date_with_time").cast("long"))

df_rolling = df_full
print('Compute moving average and sum')
for d in [5,10,20,30,60]:
    # Moving Sum of rain
    df_rolling = df_rolling.withColumn('last_'+str(d)+'_days_sum', F.sum("avg_rain").over(windowSpec.rangeBetween(-days(d), 0)))
    # Moving average of rain
    df_rolling = df_rolling.withColumn('last_'+str(d)+'_days_avg', F.avg("avg_rain").over(windowSpec.rangeBetween(-days(d), 0)))
    # Moving max of rain
    df_rolling = df_rolling.withColumn('last_'+str(d)+'_days_max', F.max("max_rain").over(windowSpec.rangeBetween(-days(d), 0)))

Define a window to make computation
Compute moving average and sum


In [19]:
df_rolling.createOrReplaceTempView("rain")
spark.sql("SELECT country_ISO3, date, avg_rain, last_30_days_avg FROM rain WHERE date IN ('2020-01-03', '2020-01-02')").show()


+------------+----------+---------+------------------+
|country_ISO3|      date| avg_rain|  last_30_days_avg|
+------------+----------+---------+------------------+
|         FRA|2020-01-02| 4.175676|30.452314675815643|
|         FRA|2020-01-03|2.3972602|30.486351931287395|
|         SRB|2020-01-02|      0.0|16.408602299228793|
|         SRB|2020-01-03|      0.0| 14.47311842826105|
|         TZA|2020-01-02|      6.5| 102.7915515322839|
|         TZA|2020-01-03|      0.0| 99.57649785087955|
+------------+----------+---------+------------------+



In [21]:

#############################################################################
##### Compute new features
# Thanks to  https://kevinvecmanis.io/pyspark/data%20science/python/2019/06/02/SPX-Analysis-With-PySpark.html
print('Define a window to make computation')
windowSpec = Window.orderBy(F.col("date_with_time").cast("long"))

df_rolling2 = df_full
print('Compute moving average and sum')
df_rolling2 = df_rolling.withColumn('last_30_days_avg', F.avg("avg_rain").over(windowSpec.rangeBetween(-days(d), 0))).withColumn('last_50_days_avg', F.avg("avg_rain").over(windowSpec.rangeBetween(-days(50), 0)))
df_rolling2.createOrReplaceTempView("rain2")
spark.sql("SELECT country_ISO3, date, avg_rain, last_30_days_avg, last_50_days_avg FROM rain2 WHERE date IN ('2020-01-03', '2020-01-02')").show()

Define a window to make computation
Compute moving average and sum
+------------+----------+---------+------------------+
|country_ISO3|      date| avg_rain|  last_30_days_avg|
+------------+----------+---------+------------------+
|         FRA|2020-01-02| 4.175676|50.278461182026696|
|         SRB|2020-01-02|      0.0|50.278461182026696|
|         TZA|2020-01-02|      6.5|50.278461182026696|
|         FRA|2020-01-03|2.3972602| 49.67558221473377|
|         SRB|2020-01-03|      0.0| 49.67558221473377|
|         TZA|2020-01-03|      0.0| 49.67558221473377|
+------------+----------+---------+------------------+



In [22]:
spark.sql("SELECT country_ISO3, date, avg_rain, last_30_days_avg, last_50_days_avg FROM rain2 WHERE date IN ('2020-01-03', '2020-01-02')").explain()

== Physical Plan ==
*(4) Project [country_ISO3#207, date#208, avg_rain#280, last_30_days_avg#820, last_50_days_avg#846]
+- *(4) Filter date#208 IN (2020-01-03,2020-01-02)
   +- Window [avg(cast(avg_rain#280 as double)) windowspecdefinition(_w0#847L ASC NULLS FIRST, specifiedwindowframe(RangeFrame, -4320000, currentrow$())) AS last_50_days_avg#846], [_w0#847L ASC NULLS FIRST]
      +- *(3) Sort [_w0#847L ASC NULLS FIRST], false, 0
         +- *(3) Project [country_ISO3#207, date#208, avg_rain#280, last_30_days_avg#820, cast(date_with_time#304 as bigint) AS _w0#847L]
            +- Window [avg(cast(avg_rain#280 as double)) windowspecdefinition(_w0#821L ASC NULLS FIRST, specifiedwindowframe(RangeFrame, -5184000, currentrow$())) AS last_30_days_avg#820], [_w0#821L ASC NULLS FIRST]
               +- *(2) Sort [_w0#821L ASC NULLS FIRST], false, 0
                  +- Exchange SinglePartition, true, [id=#784]
                     +- *(1) Project [TZA#193 AS country_ISO3#207, 1980-05-20#194 AS

In [16]:
df_rolling.show(3)

+-------+------------+----------+--------+--------+--------+---+-------------------+---------------+-----------------+---------------+----------------+-----------------+----------------+----------------+-----------------+----------------+----------------+-----------------+----------------+----------------+-----------------+----------------+
|COUNTRY|country_ISO3|      date|avg_rain|sum_rain|max_rain|cnt|     date_with_time|last_5_days_sum|  last_5_days_avg|last_5_days_max|last_10_days_sum| last_10_days_avg|last_10_days_max|last_20_days_sum| last_20_days_avg|last_20_days_max|last_30_days_sum| last_30_days_avg|last_30_days_max|last_60_days_sum| last_60_days_avg|last_60_days_max|
+-------+------------+----------+--------+--------+--------+---+-------------------+---------------+-----------------+---------------+----------------+-----------------+----------------+----------------+-----------------+----------------+----------------+-----------------+----------------+----------------+-------

In [None]:
df_rolling.sql

In [17]:
df_rolling.explain()

== Physical Plan ==
*(17) Project [COUNTRY#206, country_ISO3#207, date#208, avg_rain#280, sum_rain#288, max_rain#296, cnt#212, date_with_time#304, last_5_days_sum#314, last_5_days_avg#326, last_5_days_max#339, last_10_days_sum#353, last_10_days_avg#368, last_10_days_max#384, last_20_days_sum#401, last_20_days_avg#419, last_20_days_max#438, last_30_days_sum#458, last_30_days_avg#479, last_30_days_max#501, last_60_days_sum#524, last_60_days_avg#548, last_60_days_max#573]
+- Window [max(max_rain#296) windowspecdefinition(country_ISO3#207, _w0#574L ASC NULLS FIRST, specifiedwindowframe(RangeFrame, -5184000, currentrow$())) AS last_60_days_max#573], [country_ISO3#207], [_w0#574L ASC NULLS FIRST]
   +- *(16) Sort [country_ISO3#207 ASC NULLS FIRST, _w0#574L ASC NULLS FIRST], false, 0
      +- *(16) Project [COUNTRY#206, country_ISO3#207, date#208, avg_rain#280, sum_rain#288, max_rain#296, cnt#212, date_with_time#304, last_5_days_sum#314, last_5_days_avg#326, last_5_days_max#339, last_10_days_

In [None]:
sqlDF.write.csv(output)
print(sqlDF.columns)
sc.stop()