Realizamos los imports necesarios e iniciamos una SparkSession

In [None]:
import $ivy.`org.apache.spark::spark-sql:2.4.5` 
import $ivy.`sh.almond::almond-spark:0.4.0`

import org.apache.spark.sql.{NotebookSparkSession, SparkSession}
import org.apache.spark.sql.{functions => func, _}
import org.apache.spark.sql.types._

// Notebook-aware Spark session running locally on all available cores.
// The join setting below is passed to Spark as-is; presumably it makes the
// planner stop preferring sort-merge joins -- confirm against the Spark docs.
val spark = NotebookSparkSession
      .builder()
      .master("local[*]")
      .config("spark.sql.join.preferSortMergeJoin", false)
      // Left disabled on purpose: .config("spark.sql.shuffle.partitions", 64)
      .getOrCreate()

import spark.implicits._

import org.slf4j.LoggerFactory
import org.apache.log4j.{Level, Logger}

// Raise the log4j root logger threshold to ERROR so only errors are printed
// in the notebook output.
Logger.getRootLogger().setLevel(Level.ERROR)

In [None]:
// Load every JSON file in the data folder; "multiline" lets a single JSON
// document span several lines.
val data: DataFrame = spark.read
    .option("multiline", true)
    .json("D:/TFGAlvaroSanchez/data2/*.json")

In [None]:
// Preview the first 20 rows (the default for show()).
data.show(20)

### Obtencion de la fecha y del valor maximo de temperatura

Con lo siguiente cambiamos el tipo de datos de la columna ta_max

val data1 = data.withColumn("ta_max", $"ta_max".substr(0,4).cast(IntegerType))

In [None]:
import org.apache.spark.sql.functions.{min, max, desc}

// Date and value of the highest maximum temperature.
// ta_max is described in this notebook as "temperature(day)". Taking the text
// before "(" keeps the whole temperature whatever its length; a fixed
// substr(0,4) would turn a short value such as "9.8(12)" into "9.8(", which
// casts to null.
// NOTE(review): IntegerType is kept from the original; if the readings carry
// decimals they are truncated -- confirm whether DoubleType is wanted.
data.withColumn("ta_max", func.split($"ta_max", "\\(")(0).cast(IntegerType))
    // Sort while the column still has its original name, then rename for display.
    .orderBy($"ta_max".desc)
    .select($"fecha", $"ta_max" as "temperatura maxima")
    .show(1)

### Obtención del mes, año, id de estación y provincia donde ocurren las temperaturas más altas en 2021

In [None]:
// Only the files whose name ends in "2021.json".
val data2021 : DataFrame = spark.read
    .option("multiline", true)
    .json("D:/TFGAlvaroSanchez/data2/*2021.json")

In [None]:
// Print the inferred schema as a tree.
println(data2021.schema.treeString)

In [None]:
// Minimum temperature, station id and date for station 2916A.
data2021
    .where($"indicativo" === "2916A")
    .select($"ta_min", $"indicativo", $"fecha")
    .show()

Usamos `.toDF` para darle nombre a las columnas

In [None]:
// Station catalogue: a semicolon-separated CSV whose three columns are
// renamed to (provincia, indicativo, ubicacion).
val ids : DataFrame = spark.read
    .option("delimiter", ";")
    .csv("D:/TFGAlvaroSanchez/data2/aemetID.csv")
    .toDF(Seq("provincia", "indicativo", "ubicacion"): _*)

In [None]:
// Preview the catalogue rows, then print its schema.
ids.show(20)
println(ids.schema.treeString)

In [None]:
// Attach province and location to each 2021 record via the station id.
val joindata : DataFrame = data2021.join(ids, Seq("indicativo"))

In [None]:
// Month, year, station id and province ordered by maximum temperature (highest first).
// ta_max is described in this notebook as "temperature(day)". Splitting on "("
// keeps the whole temperature regardless of its length; a fixed substr(0,4)
// would turn short values such as "9.8(12)" into "9.8(", which casts to null.
// NOTE(review): IntegerType is kept from the original; decimals, if present,
// are truncated -- confirm whether DoubleType is wanted.
joindata.withColumn("ta_max", func.split($"ta_max", "\\(")(0).cast(IntegerType))
    .select($"fecha".substr(6,2) as "mes", $"fecha".substr(0,4) as "año", $"ta_max" as "temperatura maxima", $"indicativo", $"provincia")
    // Rows whose month field is "13" are dropped (presumably a yearly summary -- confirm).
    .filter(!($"mes".equalTo("13")))
    // Order by the column that is actually visible after the select.
    .orderBy($"temperatura maxima".desc)
    .show()

### Obtención del mes, año y provincia donde se encuentra la temperatura más baja durante el verano (junio, julio y agosto)

In [None]:
// Check the resulting column types after casting fecha to a date and the
// first four characters of ta_min to an integer.
joindata
    .withColumn("fecha", $"fecha".cast(DateType))
    .withColumn("ta_min", $"ta_min".substr(0,4).cast(IntegerType))
    .printSchema()

Nos quedamos únicamente con la temperatura, ya que este valor viene seguido del día en que ocurrió, con el siguiente formato: temperatura(dia)

In [None]:
import org.apache.spark.sql.functions.split

// Show only the part of ta_min that precedes the opening parenthesis.
joindata
    .select(split($"ta_min", "\\(").getItem(0).as("val"))
    .show()

In [None]:
// Lowest minimum temperatures recorded in June, July and August.
joindata
    .select(
        func.month($"fecha".cast(DateType)).as("mes"),
        func.year($"fecha".cast(DateType)).as("año"),
        // Text before "(" is the temperature; the rest is the day.
        split($"ta_min", "\\(").getItem(0).cast(IntegerType).as("temperatura minima"),
        $"provincia")
    // Months 6 to 8 inclusive.
    .where($"mes".between(6, 8))
    .orderBy($"temperatura minima".asc)
    .show()

### Numero de meses con temperaturas maximas >30º por provincia

In [None]:
// Count, per province, the records whose maximum temperature exceeds 30.
joindata
    .select(
        $"provincia",
        split($"ta_max", "\\(").getItem(0).cast(IntegerType).as("ta_max"),
        $"fecha".cast(DateType).as("fecha"))
    // Keep only rows whose fecha could be cast to a date.
    .where($"fecha".isNotNull && $"ta_max" > 30)
    .groupBy("provincia")
    .count()
    .show()

Relacion que tiene el calor con los meses de verano

In [None]:
// Correlation between the maximum temperature and the month number,
// computed over the rows whose maximum exceeds 30 and whose fecha is a date.
joindata
    .withColumn("fecha", $"fecha".cast(DateType))
    .withColumn("ta_max", split($"ta_max", "\\(").getItem(0).cast(IntegerType))
    .where($"fecha".isNotNull && $"ta_max" > 30)
    .withColumn("mes", func.month($"fecha"))
    .stat
    .corr("ta_max", "mes")

### Uso de graficos mostrando la precipitacion mensual por meses y años en Vitigudino(Salamanca)

In [None]:
// All files for station 2916A.
val vitData : DataFrame = spark.read
    .option("multiline", true)
    .json("D:/TFGAlvaroSanchez/data2/2916A*.json")

Obtenemos los datos que queremos mostrar

In [None]:
// Date and monthly precipitation for the plot, oldest first. Rows whose fecha
// is not a date or whose precipitation is negative or null are discarded.
// NOTE(review): p_mes is cast to IntegerType here but to DoubleType in later
// cells of this notebook -- confirm whether truncation is intended.
val vitDataRequired : DataFrame = vitData
    .select(
        $"fecha".cast(DateType).as("fecha"),
        $"p_mes".cast(IntegerType).as("p_mes"))
    .where($"fecha".isNotNull && $"p_mes" >= 0)
    .sort($"fecha")

In [None]:
// Preview the first 20 rows of the plot data.
vitDataRequired.show(20)

In [None]:
// Bring both plot axes to the driver as plain Scala sequences.
val vitDataRequired_f : Seq[String] =
    vitDataRequired.select("fecha").as[String].collect().toSeq
val vitDataRequired_p : Seq[Int] =
    vitDataRequired.select("p_mes").as[Int].collect().toSeq

Realizamos los imports necesarios para las graficas

In [None]:
import $ivy.`org.plotly-scala::plotly-almond:0.7.0`
import plotly._, plotly.element._, plotly.layout._, plotly.Almond._

// Restrict the output height to avoid scrolling in output cells: replace the
// REPL pretty-printer with a copy whose defaultHeight is 3.
repl.pprinter() = repl.pprinter().copy(defaultHeight = 3)

In [None]:
// Single filled scatter trace: dates as the first series, precipitation as the second.
val trace = Seq(
    Scatter(
        vitDataRequired_f,
        vitDataRequired_p,
        fill = Fill.ToZeroY,
        marker = Marker(color = Color.RGBA(55, 128, 191, 0.6))
    )
)

// Paper and plot area share the same background colour.
val layout = {
    val background = Color.RGBA(245, 246, 249, 1)
    Layout(
        title = "Precipitacion mensual (mm)",
        paper_bgcolor = background,
        plot_bgcolor = background
    )
}

plot(trace, layout)

### Leer los datos de manera casteada mediante un schema

In [None]:
// Explicit read schema. Only the nullable integer field "glo" is active;
// the remaining field definitions are kept commented out.
val schema = StructType(
                Array(
                    /*StructField("fecha", DateType, true),
                    StructField("indicativo", StringType, true),
                    StructField("p_max", StringType, true),*/
                    StructField("glo", IntegerType, true),
                    /*StructField("hr", IntegerType, true),
                    StructField("nw_55", IntegerType, true),
                    StructField("tm_min", DoubleType, true),
                    StructField("ta_max", StringType, true),
                    StructField("ts_min", DoubleType, true),
                    StructField("nt_30", IntegerType, true),
                    StructField("n_des", IntegerType, true),
                    StructField("w_racha", StringType, true),
                    StructField("np_100", IntegerType, true)*/
                )
            )

In [None]:
// Read one station file using the explicit schema instead of inference.
val dt: DataFrame = spark.read
    .schema(schema)
    .options(Map("multiline" -> "true", "dateFormat" -> "yyyy-MM"))
    .json("D:/TFGAlvaroSanchez/data2/0201D(Barcelona)-2021.json")

In [None]:
// Preview the first 20 rows read with the explicit schema.
dt.show(20)

In [None]:
// Read one station file and cast every column to its working type in a
// single select. Columns left uncast keep the type inferred by the reader.
val dt: DataFrame = {
    // Local cast helpers, scoped to this definition.
    def asDouble(name: String): Column = func.col(name).cast(DoubleType)
    def asInt(name: String): Column = func.col(name).cast(IntegerType)

    // Same columns, in the same order, as the output frame.
    val typedColumns: Seq[Column] = Seq(
        func.to_date($"fecha").alias("fecha"),
        $"indicativo",
        $"p_max",
        asDouble("glo"),
        asDouble("hr"),
        asInt("nw_55"),
        asDouble("tm_min"),
        $"ta_max",
        asDouble("ts_min"),
        asInt("nt_30"),
        asInt("n_des"),
        $"w_racha",
        asInt("np_100"),
        asInt("nw_91"),
        asInt("np_001"),
        $"ta_min",
        asInt("w_rec"),
        asDouble("e"),
        asInt("np_300"),
        asDouble("p_mes"),
        asDouble("w_med"),
        asInt("nt_00"),
        asDouble("ti_max"),
        asDouble("tm_mes"),
        asDouble("tm_max"),
        asInt("np_010")
    )

    spark.read
        .option("multiline", "true")
        .json("D:/TFGAlvaroSanchez/data2/0201D(Barcelona)-2021.json")
        .select(typedColumns: _*)
}

In [None]:
// Print the schema after the casts.
println(dt.schema.treeString)

In [None]:
// Preview the first 20 typed rows.
dt.show(20)

### Temperatura media en españa

In [None]:
// Read every station file and cast each column to its working type.
// Columns left uncast keep the type inferred by the reader.
val data: DataFrame = {
    // Local cast helpers, scoped to this definition.
    def asDouble(name: String): Column = func.col(name).cast(DoubleType)
    def asInt(name: String): Column = func.col(name).cast(IntegerType)

    // Same columns, in the same order, as the output frame.
    val typedColumns: Seq[Column] = Seq(
        $"fecha".cast(DateType),
        $"indicativo",
        $"p_max",
        asDouble("glo"),
        asDouble("hr"),
        asInt("nw_55"),
        asDouble("tm_min"),
        $"ta_max",
        asDouble("ts_min"),
        asInt("nt_30"),
        asInt("n_des"),
        $"w_racha",
        asInt("np_100"),
        asInt("nw_91"),
        asInt("np_001"),
        $"ta_min",
        asInt("w_rec"),
        asDouble("e"),
        asInt("np_300"),
        asDouble("p_mes"),
        asDouble("w_med"),
        asInt("nt_00"),
        asDouble("ti_max"),
        asDouble("tm_mes"),
        asDouble("tm_max"),
        asInt("np_010")
    )

    spark.read
        .option("multiline", "true")
        .json("D:/TFGAlvaroSanchez/data2/*.json")
        .select(typedColumns: _*)
}

In [None]:
// Summary statistics of the monthly mean temperature, restricted to rows
// whose fecha is a valid date.
data
    .where($"fecha".isNotNull)
    .select($"tm_mes".as("temperatura media"))
    .describe()
    .show()

### Fecha y estaciones con temp >35º (Utilizando expr)

In [None]:
// Read every station file and cast each column to its working type.
// Columns left uncast keep the type inferred by the reader.
val data: DataFrame = {
    // Local cast helpers, scoped to this definition.
    def asDouble(name: String): Column = func.col(name).cast(DoubleType)
    def asInt(name: String): Column = func.col(name).cast(IntegerType)

    // Same columns, in the same order, as the output frame.
    val typedColumns: Seq[Column] = Seq(
        $"fecha".cast(DateType),
        $"indicativo",
        $"p_max",
        asDouble("glo"),
        asDouble("hr"),
        asInt("nw_55"),
        asDouble("tm_min"),
        $"ta_max",
        asDouble("ts_min"),
        asInt("nt_30"),
        asInt("n_des"),
        $"w_racha",
        asInt("np_100"),
        asInt("nw_91"),
        asInt("np_001"),
        $"ta_min",
        asInt("w_rec"),
        asDouble("e"),
        asInt("np_300"),
        asDouble("p_mes"),
        asDouble("w_med"),
        asInt("nt_00"),
        asDouble("ti_max"),
        asDouble("tm_mes"),
        asDouble("tm_max"),
        asInt("np_010")
    )

    spark.read
        .option("multiline", "true")
        .json("D:/TFGAlvaroSanchez/data2/*.json")
        .select(typedColumns: _*)
}

// Attach province and location to every record via the station id.
val dataWithNameStation = data.join(ids, Seq("indicativo"))

In [None]:
// Location, year, month and day of every record whose maximum temperature exceeds 35.
// ta_max is described in this notebook as "temperature(day)".
dataWithNameStation
    // Temperature: the text before the opening parenthesis.
    .withColumn("t_max", func.split($"ta_max", "\\(")(0).cast(IntegerType))
    // Day: the digits between the parentheses. Taking a fixed two characters
    // after "(" would yield "5)" for a one-digit day and cast to null.
    // NOTE(review): SOURCE does not show whether days are zero-padded; this
    // form works either way.
    .withColumn("dia", func.regexp_extract($"ta_max", "\\((\\d+)\\)", 1).cast(IntegerType))
    // Boolean flag built from a SQL expression string.
    .withColumn("calor", func.expr("t_max > 35"))
    .filter($"calor" && $"fecha".isNotNull)
    .select($"ubicacion", func.year($"fecha").alias("año"), func.month($"fecha").alias("mes"), $"dia")
    .show(false)

### Mostrando gráficamente la precipitación mensual en diferentes estaciones meteorológicas

In [None]:
// Files whose name ends in "-2021.json".
val data2021 = spark.read
    .option("multiline", true)
    .json("D:/TFGAlvaroSanchez/data2/*-2021.json")

In [None]:
// Persist the 2021 data as table "indicativos", partitioned by station id.
// The default save mode fails when the table already exists, which makes this
// cell fail on a second run in the same session; Overwrite replaces the table
// so the cell can be re-run.
data2021.write
    .mode(SaveMode.Overwrite)
    .partitionBy("indicativo")
    .saveAsTable("indicativos")

### Media de todos los datos, agrupados por fecha y por indicativo

In [None]:
// Read the 2021 files and cast each column to its working type.
// Columns left uncast keep the type inferred by the reader.
val data2021: DataFrame = {
    // Local cast helpers, scoped to this definition.
    def asDouble(name: String): Column = func.col(name).cast(DoubleType)
    def asInt(name: String): Column = func.col(name).cast(IntegerType)

    // Same columns, in the same order, as the output frame.
    val typedColumns: Seq[Column] = Seq(
        $"fecha".cast(DateType),
        $"indicativo",
        $"p_max",
        asDouble("glo"),
        asDouble("hr"),
        asInt("nw_55"),
        asDouble("tm_min"),
        $"ta_max",
        asDouble("ts_min"),
        asInt("nt_30"),
        asInt("n_des"),
        $"w_racha",
        asInt("np_100"),
        asInt("nw_91"),
        asInt("np_001"),
        $"ta_min",
        asInt("w_rec"),
        asDouble("e"),
        asInt("np_300"),
        asDouble("p_mes"),
        asDouble("w_med"),
        asInt("nt_00"),
        asDouble("ti_max"),
        asDouble("tm_mes"),
        asDouble("tm_max"),
        asInt("np_010")
    )

    spark.read
        .option("multiline", "true")
        .json("D:/TFGAlvaroSanchez/data2/*-2021.json")
        .select(typedColumns: _*)
}

In [None]:
// Average of every numeric column per year, with one group of columns per
// station id (pivot on "indicativo").
val pivotedData2021 = data2021
    .groupBy(func.year($"fecha"))
    .pivot("indicativo")
    .avg()

// Year and mean monthly temperature for station 0201D, addressed by the
// column names Spark generated for the pivot.
pivotedData2021
    .select(func.col("year(fecha)"), func.col("0201D_avg(tm_mes)"))
    .show()

### Leer todos los datos

In [None]:
// Read the complete data set (folder "data") and cast each column to its
// working type. Columns left uncast keep the type inferred by the reader.
val all = {
    // Local cast helpers, scoped to this definition.
    def asDouble(name: String): Column = func.col(name).cast(DoubleType)
    def asInt(name: String): Column = func.col(name).cast(IntegerType)

    // Same columns, in the same order, as the output frame.
    val typedColumns: Seq[Column] = Seq(
        $"fecha".cast(DateType),
        $"indicativo",
        $"p_max",
        asDouble("glo"),
        asDouble("hr"),
        asInt("nw_55"),
        asDouble("tm_min"),
        $"ta_max",
        asDouble("ts_min"),
        asInt("nt_30"),
        asInt("n_des"),
        $"w_racha",
        asInt("np_100"),
        asInt("nw_91"),
        asInt("np_001"),
        $"ta_min",
        asInt("w_rec"),
        asDouble("e"),
        asInt("np_300"),
        asDouble("p_mes"),
        asDouble("w_med"),
        asInt("nt_00"),
        asDouble("ti_max"),
        asDouble("tm_mes"),
        asDouble("tm_max"),
        asInt("np_010")
    )

    spark.read
        .option("multiline", "true")
        .json("D:/TFGAlvaroSanchez/data/*.json")
        .select(typedColumns: _*)
}

In [None]:
// Records for station 2422.
all.where(func.col("indicativo").equalTo("2422")).show(20)