In [None]:
import $ivy.`org.apache.spark::spark-sql:2.4.5` 
import $ivy.`sh.almond::almond-spark:0.4.0`

import org.apache.spark.sql.{NotebookSparkSession, SparkSession}
import org.apache.spark.sql.{functions => func, _}
import org.apache.spark.sql.types._

// Spark session for this notebook, obtained through NotebookSparkSession's builder.
// Settings applied: master "local[4]", 64 shuffle partitions, and
// spark.sql.join.preferSortMergeJoin turned off.
val spark = {
  val sessionBuilder = NotebookSparkSession.builder().master("local[4]")

  sessionBuilder
    .config("spark.sql.shuffle.partitions", 64)
    .config("spark.sql.join.preferSortMergeJoin", false)
    .getOrCreate()
}

import spark.implicits._

import org.slf4j.LoggerFactory
import org.apache.log4j.{Level, Logger}

// Raise the log4j root logger threshold to ERROR so lower-severity messages
// are not emitted into the notebook output.
Logger.getRootLogger.setLevel(Level.ERROR)

In [None]:
import org.apache.spark.sql.expressions.Window

// Daily observations, read from the parquet dataset under ../data/dayParquet/.
val data = spark.read.parquet("../data/dayParquet/")

// Station catalogue: a ';'-delimited CSV whose three columns are renamed
// positionally to provincia / indicativo / ubicacion.
// NOTE(review): no `header` option is set, so a header row (if the file has
// one) would be read as a regular record — confirm the file has none.
val stations = {
  val stationColumns = Seq("provincia", "indicativo", "ubicacion")

  spark.read
    .option("delimiter", ";")
    .csv("../data/aemetID.csv")
    .toDF(stationColumns: _*)
}

// Per-station, per-year window with rows ordered by date. The "año" column
// does not exist in `data` yet; it is added by the pipeline that uses this window.
val window = Window.partitionBy("indicativo", "año").orderBy("fecha")

// Heat-wave episodes per station and year.
//
// Approach (gaps-and-islands): keep only days with tmax >= 40, number them by
// date within each (station, year), and subtract that row number from the date.
// Consecutive dates then share the same resulting value ("id"), so grouping by
// it yields one row per run of consecutive hot days.
// Assumes at most one row per station and day and that `fecha` is usable as a
// date by date_sub — TODO confirm against the parquet schema.
val results = {
  // Days at or above the 40-degree threshold, tagged with their calendar year.
  val hotDays = data
    .where($"tmax".isNotNull && $"tmax" >= 40)
    .withColumn("año", func.year($"fecha"))

  // Row number within the window, then the run identifier derived from it.
  val numbered = hotDays.withColumn("n_fila", func.row_number().over(window))
  val withRunId = numbered.withColumn("id", func.expr("date_sub(fecha, n_fila)"))

  // One row per run: length in days plus tmax statistics. The unaliased
  // aggregates keep Spark's generated names ("avg(tmax)", "max(tmax)",
  // "min(tmax)"), which the select below refers to.
  val episodes = withRunId
    .groupBy($"indicativo", $"año", $"id")
    .agg(
      func.count($"id").alias("dias"),
      func.avg($"tmax"),
      func.max($"tmax"),
      func.min($"tmax")
    )
    // Keeps runs strictly longer than three days (i.e. four or more).
    // NOTE(review): if "at least three days" was intended this should be >= 3.
    .where($"dias" > 3)

  val roundedAvg = func.round($"avg(tmax)", 2).alias("avg(tmax)")

  // Attach station metadata and keep only the reporting columns.
  episodes
    .join(stations, "indicativo")
    .select($"ubicacion", $"provincia", $"año", $"dias", roundedAvg, $"max(tmax)", $"min(tmax)")
}

In [None]:
// Per-province, per-year summary of the detected episodes: number of episodes,
// mean duration, and mean / max / min of the per-episode tmax statistics.
// NOTE(review): "avg(tmax)" here is the unweighted mean of the per-episode
// means (already rounded to 2 decimals), not a mean over individual days.
val resultsSave = results
    .groupBy($"provincia", $"año")
    .agg(
        func.count($"provincia").alias("nº de olas de calor"),
        func.avg($"dias").alias("duracion media"),
        func.avg($"avg(tmax)").alias("avg(tmax)"),
        func.max($"max(tmax)").alias("max(tmax)"),
        func.min($"min(tmax)").alias("min(tmax)")
    )
    
// Persist the summary as parquet, partitioned by year, replacing any previous
// output. Columns are first renamed to plain identifiers — presumably because
// the display names (spaces, parentheses, "º") are not accepted as parquet
// column names; verify if the names are changed.
Seq(
    "nº de olas de calor" -> "nOlasCalor",
    "duracion media" -> "duracionMedia",
    "avg(tmax)" -> "avgTmax",
    "max(tmax)" -> "maxTmax",
    "min(tmax)" -> "minTmax"
).foldLeft(resultsSave) { case (df, (oldName, newName)) =>
    df.withColumnRenamed(oldName, newName)
}
    .write
    .format("parquet")
    .partitionBy("año")
    .mode("overwrite")
    .save("../data/resultadoOlasCalor/")

In [None]:
// Print the detected episodes with human-readable column headers, ordered by
// station location and year.
results
    .select(
        $"ubicacion",
        $"provincia",
        $"año",
        $"dias".alias("duracion (dias)"),
        $"avg(tmax)".alias("temperatura media"),
        $"max(tmax)".alias("temperatura maxima"),
        $"min(tmax)".alias("temperatura minima")
    )
    .sort($"ubicacion", $"año")
    .show()

In [None]:
// Print the number of detected episodes per year, in chronological order.
results
    .groupBy($"año")
    .agg(func.count($"año").alias("nº olas de calor"))
    .sort($"año")
    .show()

Para observar los resultados de forma gráfica en el mapa, acceda al notebook `Olas de calor - Representación en mapa`.