In [None]:
import $ivy.`org.apache.spark::spark-sql:2.4.5` 
import $ivy.`sh.almond::almond-spark:0.4.0`

import org.apache.spark.sql.{NotebookSparkSession, SparkSession}
import org.apache.spark.sql.{functions => func, _}
import org.apache.spark.sql.types._

// Notebook-aware Spark session running locally on 8 threads, with sort-merge join
// preference switched off and 64 shuffle partitions.
val spark = {
    val sessionBuilder = NotebookSparkSession.builder()

    sessionBuilder
        .config("spark.sql.join.preferSortMergeJoin", false)
        .config("spark.sql.shuffle.partitions", 64)
        .master("local[8]")
        .getOrCreate()
}

import spark.implicits._

import org.slf4j.LoggerFactory
import org.apache.log4j.{Level, Logger}

// Raise the root log4j logger threshold to ERROR so lower-severity log output is suppressed.
Logger.getRootLogger().setLevel(Level.ERROR)

### Transformación de JSON multilínea a JSON Lines

Para ello he ejecutado en el CMD el siguiente comando: `FOR %a IN (../data/*.json) DO jq . -c "%a" > "../JSONLine/%a"`. Con él se recorre la carpeta donde se encuentran los ficheros JSON multilínea y cada fichero, ya transformado a JSON Lines (un objeto por línea), se guarda en la carpeta JSONLine.

### Leer y pasar datos mensuales a Parquet

In [None]:
// Schema of the monthly JSON records. Every field is declared as a nullable string;
// the field order below is the order of the resulting StructType.
val schema = StructType(
    Seq(
        "fecha", "indicativo", "nombre", "provincia", "altitud",
        "tm_mes", "tm_max", "tm_min", "ta_max", "ta_min", "ts_min", "ti_max",
        "nt_30", "nt_00",
        "p_mes", "p_max", "np_001", "np_010", "np_100", "np_300",
        "hr", "e",
        "n_llu", "n_nie", "n_gra", "n_tor", "n_fog", "n_des", "n_nub", "n_cub",
        "inso", "p_sol", "glo", "evap",
        "w_rec", "w_racha", "nw_55", "nw_91", "w_med",
        "q_med", "q_max", "q_min", "q_mar",
        "ts_10", "ts_20", "ts_50",
        "nv_0050", "nv_0100", "nv_1000"
    ).map(fieldName => StructField(fieldName, StringType, true))
)

In [None]:
// Monthly observations: read the JSON Lines files with the string-only schema and keep
// a subset of columns, casting each one to its target type.
val allData = {
    // Ordered (column, target type) table; None keeps the column as the raw string.
    val projection: Seq[(String, Option[DataType])] = Seq(
        "fecha"      -> Some(DateType),
        "indicativo" -> None,
        "p_max"      -> None,
        "glo"        -> Some(DoubleType),
        "hr"         -> Some(DoubleType),
        "nw_55"      -> Some(IntegerType),
        "tm_min"     -> Some(DoubleType),
        "ta_max"     -> None,
        "ts_min"     -> Some(DoubleType),
        "nt_30"      -> Some(IntegerType),
        "n_des"      -> Some(IntegerType),
        "w_racha"    -> None,
        "np_100"     -> Some(IntegerType),
        "nw_91"      -> Some(IntegerType),
        "np_001"     -> Some(IntegerType),
        "ta_min"     -> None,
        "w_rec"      -> Some(IntegerType),
        "e"          -> Some(DoubleType),
        "np_300"     -> Some(IntegerType),
        "p_mes"      -> Some(DoubleType),
        "w_med"      -> Some(DoubleType),
        "nt_00"      -> Some(IntegerType),
        "ti_max"     -> Some(DoubleType),
        "tm_mes"     -> Some(DoubleType),
        "tm_max"     -> Some(DoubleType),
        "np_010"     -> Some(IntegerType)
    )

    val selectedColumns = projection.map {
        case (name, Some(targetType)) => func.col(name).cast(targetType)
        case (name, None)             => func.col(name)
    }

    spark.read
        .schema(schema)
        .json("D:/TFGAlvaroSanchez/data/monthJSONLine/*.json")
        .select(selectedColumns: _*)
}

In [None]:
// Persist the monthly data as Parquet, one directory per station (indicativo),
// replacing any previous output at the target path.
allData.write
    .partitionBy("indicativo")
    .mode("overwrite")
    .parquet("D:/TFGAlvaroSanchez/data/monthParquet/")

### Leer y pasar datos diarios a Parquet

In [None]:
// Schema of the daily JSON records. Every field is declared as a nullable string;
// the field order below is the order of the resulting StructType.
val schema = StructType(
    Seq(
        "fecha", "indicativo", "nombre", "provincia", "altitud",
        "tmed", "prec",
        "tmin", "horatmin", "tmax", "horatmax",
        "dir", "velmedia", "racha", "horaracha",
        "sol",
        "presMax", "horaPresMax", "presMin", "horaPresMin"
    ).map(fieldName => StructField(fieldName, StringType, true))
)

In [None]:
// Daily observations: read the JSON Lines files with the string-only schema and cast the
// measurement columns in place. Columns not listed below stay as strings. Values that
// cannot be cast become null (standard Spark cast behaviour).
val allData = {
    // Columns written with a decimal comma: swap "," for "." before casting to double.
    val decimalCommaColumns = Seq(
        "tmed", "prec", "tmin", "tmax", "velmedia", "racha", "sol", "presMax", "presMin"
    )
    // Columns cast directly to integer. "horaPresMin" is now spelled exactly as in the
    // schema: the original read `horaPresmin`, which only resolved because Spark's
    // analysis is case-insensitive by default (spark.sql.caseSensitive = false).
    val integerColumns = Seq("altitud", "dir", "horaPresMax", "horaPresMin")

    val raw = spark.read
        .schema(schema)
        .json("D:/TFGAlvaroSanchez/data/dayJSONLine/*.json")

    val withDate = raw.withColumn("fecha", $"fecha".cast(DateType))

    // withColumn on an existing name replaces it in place, so the column order of the
    // result is the schema order regardless of the order of these replacements.
    val withIntegers = integerColumns.foldLeft(withDate) { (df, name) =>
        df.withColumn(name, func.col(name).cast(IntegerType))
    }

    decimalCommaColumns.foldLeft(withIntegers) { (df, name) =>
        df.withColumn(name, func.regexp_replace(func.col(name), ",", ".").cast(DoubleType))
    }
}

In [None]:
// Persist the daily data as Parquet, one directory per station (indicativo),
// replacing any previous output at the target path.
allData.write
    .partitionBy("indicativo")
    .mode("overwrite")
    .parquet("D:/TFGAlvaroSanchez/data/dayParquet/")

### Año de la temperatura máxima promedio en cada mes

In [None]:
import $ivy.`org.plotly-scala::plotly-almond:0.7.0`
import plotly._, plotly.element._, plotly.layout._, plotly.Almond._

// Restrict the pretty-printer's default output height (defaultHeight = 3) so that
// values echoed in output cells do not force scrolling.

repl.pprinter() = repl.pprinter().copy(defaultHeight = 3)

In [None]:
import org.apache.spark.sql.expressions.Window

// Ranks, within each calendar month, the dates by their average ta_max (highest first).
val window = Window.partitionBy("mes").orderBy($"ta_max".desc)

// NOTE(review): na.drop() without arguments discards every row that has a null in ANY
// column, not only in fecha / ta_max -- confirm this is the intended filter.
val data = spark.read.parquet("D:/TFGAlvaroSanchez/data/monthParquet/*").na.drop()

// For every month of the year, the year(s) whose average ta_max is the highest.
val dataWindow = {
    // ta_max is stored as text; keep only what precedes the first "(" and parse it.
    val taMaxValue = func.split($"ta_max", "\\(").getItem(0).cast(DoubleType)

    // Average ta_max over all rows that share the same fecha.
    val averagePerDate = data
        .withColumn("ta_max", taMaxValue)
        .groupBy($"fecha")
        .agg(func.avg($"ta_max").alias("ta_max"))

    averagePerDate
        .select(
            func.month($"fecha").alias("mes"),
            func.year($"fecha").alias("año"),
            $"ta_max"
        )
        .withColumn("dense_rank", func.dense_rank().over(window))
        .where($"dense_rank" === 1)
}

In [None]:
// Number of rows of dataWindow per year, i.e. in how many months each year is ranked
// first. Computed with a single aggregation; the original ran one filter + count Spark
// action per year (13 actions over the same windowed plan).
val monthsWonByYear: Map[Int, Long] = {
    val perYearRows = dataWindow.groupBy($"año").count().collect()
    // Rows whose year is null (if any) do not match the pattern and are skipped.
    perYearRows.collect { case Row(year: Int, months: Long) => year -> months }.toMap
}

/**
 * Number of rows of `dataWindow` whose `año` equals `year`.
 *
 * @param year calendar year to look up
 * @return the row count for that year, or 0 when the year does not appear
 */
def countByYear(year : Int) : Long = monthsWonByYear.getOrElse(year, 0L)

val year2010 = countByYear(2010)
val year2011 = countByYear(2011)
val year2012 = countByYear(2012)
val year2013 = countByYear(2013)
val year2014 = countByYear(2014)
val year2015 = countByYear(2015)
val year2016 = countByYear(2016)
val year2017 = countByYear(2017)
val year2018 = countByYear(2018)
val year2019 = countByYear(2019)
val year2020 = countByYear(2020)
val year2021 = countByYear(2021)
val year2022 = countByYear(2022)

In [None]:
// Print the top-ranked year(s) of every month together with the average ta_max rounded
// to two decimals. (A narrower select of mes / año / temperatura maxima was left
// disabled by the author, so all columns are shown.)
dataWindow
    .withColumn("temperatura maxima", func.round($"ta_max", 2))
    .orderBy($"mes")
    .show()

// Bar chart: x = year, y = number of months in which that year is ranked first.
val dataToPlot = {
    val years = Seq(2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022)
    val monthsPerYear = Seq(
        year2010, year2011, year2012, year2013, year2014, year2015, year2016,
        year2017, year2018, year2019, year2020, year2021, year2022
    )
    // Translucent orange bars with a darker outline.
    val barStyle = Marker(
        color = Color.RGB(255, 174, 0),
        opacity = 0.6,
        line = Line(
            color = Color.RGB(189, 129, 0),
            width = 1.5
        )
    )

    Seq(Bar(years, monthsPerYear, marker = barStyle))
}

plot(dataToPlot)

### Olas de calor

In [None]:
import org.apache.spark.sql.expressions.Window

// Daily observations written earlier as Parquet.
val data = spark.read.parquet("D:/TFGAlvaroSanchez/data/dayParquet/")

// Station catalogue: semicolon-separated CSV whose three columns are named here.
val stations = spark.read
    .option("delimiter", ";")
    .csv("D:/TFGAlvaroSanchez/data/aemetID.csv")
    .toDF("provincia", "indicativo", "ubicacion")

// Numbers the hot days of each station and year in chronological order.
val window = Window.partitionBy($"indicativo", $"año").orderBy($"fecha")

// One row per run of consecutive days with tmax >= 40 that lasts more than 3 days.
val results = {
    val hotDays = data
        .where($"tmax".isNotNull && $"tmax" >= 40)
        .withColumn("año", func.year($"fecha"))

    // Consecutive dates minus consecutive row numbers yield the same date, so `id` is
    // constant inside a run of consecutive hot days and changes when a day is skipped.
    val runs = hotDays
        .withColumn("n_fila", func.row_number().over(window))
        .withColumn("id", func.expr("date_sub(fecha, n_fila)"))
        .groupBy($"indicativo", $"año", $"id")
        .agg(
            func.count($"id").alias("dias"),
            func.avg($"tmax"),
            func.max($"tmax"),
            func.min($"tmax")
        )

    runs
        .where($"dias" > 3) // strictly more than 3 days, i.e. runs of 4 days or longer
        .join(stations, "indicativo")
        .select(
            $"ubicacion",
            $"provincia",
            $"año",
            $"dias",
            func.round($"avg(tmax)", 2).alias("avg(tmax)"),
            $"max(tmax)",
            $"min(tmax)"
        )
}

In [None]:
// Heat-wave summary per province and year: number of waves, mean duration, mean of the
// per-wave average tmax, and the overall maximum / minimum tmax.
val resultsSave = results
    .groupBy($"provincia", $"año")
    .agg(
        func.count($"provincia").alias("nº de olas de calor"),
        func.avg($"dias").alias("duracion media"),
        func.avg($"avg(tmax)").alias("avg(tmax)"),
        func.max($"max(tmax)").alias("max(tmax)"),
        func.min($"min(tmax)").alias("min(tmax)")
    )

// Rename the descriptive column names to plain identifiers before writing
// (presumably because Parquet does not accept spaces or parentheses in column names --
// TODO confirm), then store the summary partitioned by province.
Seq(
    "nº de olas de calor" -> "nOlasCalor",
    "duracion media"      -> "duracionMedia",
    "avg(tmax)"           -> "avgTmax",
    "max(tmax)"           -> "maxTmax",
    "min(tmax)"           -> "minTmax"
).foldLeft(resultsSave) { case (df, (from, to)) => df.withColumnRenamed(from, to) }
    .write
    .partitionBy("provincia")
    .mode("overwrite")
    .parquet("D:/TFGAlvaroSanchez/data/resultadoOlasCalor/")

In [None]:
// Display every heat wave with reader-friendly column titles, ordered by location and year.
Seq(
    "dias"      -> "duracion (dias)",
    "avg(tmax)" -> "temperatura media",
    "max(tmax)" -> "temperatura maxima",
    "min(tmax)" -> "temperatura minima"
).foldLeft(results) { case (df, (from, to)) => df.withColumnRenamed(from, to) }
    .orderBy($"ubicacion", $"año")
    .show()

In [None]:
// Display the number of heat waves detected in each year, in chronological order.
results
    .groupBy($"año")
    .agg(
        func.count($"año").alias("nº olas de calor")
    )
    .sort($"año")
    .show()

Para observar los resultados de forma gráfica en el mapa, acceda al notebook `RepresentacionOlasCalor`.