In [None]:
import $ivy.`org.apache.spark::spark-sql:2.4.5` 
import $ivy.`sh.almond::almond-spark:0.4.0`

import org.apache.spark.sql.{NotebookSparkSession, SparkSession}
import org.apache.spark.sql.{functions => func, _}
import org.apache.spark.sql.types._

// Notebook-aware Spark session running locally on 4 threads.
// Sort-merge joins are not preferred and shuffles use 64 partitions;
// both settings are passed straight through as Spark SQL configuration.
val spark = {
  val sessionBuilder = NotebookSparkSession
    .builder()
    .master("local[4]")
    .config("spark.sql.shuffle.partitions", 64)
    .config("spark.sql.join.preferSortMergeJoin", false)

  sessionBuilder.getOrCreate()
}

import spark.implicits._

import org.slf4j.LoggerFactory
import org.apache.log4j.{Level, Logger}

// Raise the root log4j logger threshold to ERROR for the rest of the notebook.
Logger.getRootLogger.setLevel(Level.ERROR)

### Transformación de JSON multilínea a JSON Lines

Para ello he ejecutado en el CMD el siguiente comando: `FOR %a IN (../data/*.json) DO jq . -c "%a" > "../JSONLine/%a"`. Este comando recorre la carpeta donde se encuentran los ficheros JSON multilínea y guarda cada uno, ya transformado a una línea por registro, en la carpeta JSONLine.

### Leer y pasar datos mensuales a Parquet

In [None]:
// Read schema for the monthly JSON Lines files.
// Every field is declared as a nullable string; numeric/date conversion is
// left to the later projection step. Field order below is the schema order.
val schema = {
  val monthlyFieldNames = Seq(
    "fecha", "indicativo", "nombre", "provincia", "altitud",
    "tm_mes", "tm_max", "tm_min",
    "ta_max", "ta_min", "ts_min", "ti_max",
    "nt_30", "nt_00",
    "p_mes", "p_max",
    "np_001", "np_010", "np_100", "np_300",
    "hr", "e",
    "n_llu", "n_nie", "n_gra", "n_tor", "n_fog", "n_des", "n_nub", "n_cub",
    "inso", "p_sol", "glo", "evap",
    "w_rec", "w_racha", "nw_55", "nw_91", "w_med",
    "q_med", "q_max", "q_min", "q_mar",
    "ts_10", "ts_20", "ts_50",
    "nv_0050", "nv_0100", "nv_1000"
  )

  StructType(monthlyFieldNames.map(StructField(_, StringType, nullable = true)))
}

In [None]:
// Load the monthly JSON Lines files with the explicit string schema and keep
// a subset of the columns, casting some of them to typed columns.
// The plan is listed in output-column order; `None` means "keep as string".
val allData = {
  val columnPlan: Seq[(String, Option[DataType])] = Seq(
    "fecha"      -> Some(DateType),
    "indicativo" -> None,
    "p_max"      -> None,
    "glo"        -> Some(DoubleType),
    "hr"         -> Some(DoubleType),
    "nw_55"      -> Some(IntegerType),
    "tm_min"     -> Some(DoubleType),
    "ta_max"     -> None,
    "ts_min"     -> Some(DoubleType),
    "nt_30"      -> Some(IntegerType),
    "n_des"      -> Some(IntegerType),
    "w_racha"    -> None,
    "np_100"     -> Some(IntegerType),
    "nw_91"      -> Some(IntegerType),
    "np_001"     -> Some(IntegerType),
    "ta_min"     -> None,
    "w_rec"      -> Some(IntegerType),
    "e"          -> Some(DoubleType),
    "np_300"     -> Some(IntegerType),
    "p_mes"      -> Some(DoubleType),
    "w_med"      -> Some(DoubleType),
    "nt_00"      -> Some(IntegerType),
    "ti_max"     -> Some(DoubleType),
    "tm_mes"     -> Some(DoubleType),
    "tm_max"     -> Some(DoubleType),
    "np_010"     -> Some(IntegerType)
  )

  val projection = columnPlan.map {
    case (name, Some(targetType)) => func.col(name).cast(targetType)
    case (name, None)             => func.col(name)
  }

  spark.read
    .schema(schema)
    .json("../data/monthJSONLine/*.json")
    .select(projection: _*)
}

In [None]:
// Persist the monthly data as Parquet, one directory per `indicativo` value,
// replacing whatever is already stored at the target path.
allData.write
  .mode(SaveMode.Overwrite)
  .partitionBy("indicativo")
  .parquet("../data/monthParquet/")

### Leer y pasar datos diarios a Parquet

In [None]:
// Read schema for the daily JSON Lines files.
// Every field is declared as a nullable string; typed conversion happens in
// the later transformation step. Field order below is the schema order.
val schema = {
  val dailyFieldNames = Seq(
    "fecha", "indicativo", "nombre", "provincia", "altitud",
    "tmed", "prec",
    "tmin", "horatmin",
    "tmax", "horatmax",
    "dir", "velmedia",
    "racha", "horaracha",
    "sol",
    "presMax", "horaPresMax",
    "presMin", "horaPresMin"
  )

  StructType(dailyFieldNames.map(StructField(_, StringType, nullable = true)))
}

In [None]:
// Load the daily JSON Lines files with the explicit string schema and convert
// the measurement columns to typed columns in place (column order and names
// are unchanged; columns not listed here stay as strings).
//
// Fix: the source column for "horaPresMin" was previously referenced as
// "horaPresmin". That only resolved because Spark's analyzer is
// case-insensitive by default; it now matches the schema's exact spelling so
// the cell also works with spark.sql.caseSensitive=true.
val allData = {
  // Columns whose text has ',' replaced by '.' before the cast to Double
  // (presumably the source uses a decimal comma — confirm against the data).
  val decimalColumns = Seq(
    "tmed", "prec", "tmin", "tmax", "velmedia", "racha", "sol", "presMax", "presMin"
  )
  // Columns cast directly to Integer.
  val integerColumns = Seq("altitud", "dir", "horaPresMax", "horaPresMin")

  val raw = spark.read
    .schema(schema)
    .json("../data/dayJSONLine/*.json")
    .withColumn("fecha", func.col("fecha").cast(DateType))

  val withDecimals = decimalColumns.foldLeft(raw) { (df, name) =>
    df.withColumn(name, func.regexp_replace(func.col(name), ",", ".").cast(DoubleType))
  }

  integerColumns.foldLeft(withDecimals) { (df, name) =>
    df.withColumn(name, func.col(name).cast(IntegerType))
  }
}

In [None]:
// Persist the daily data as Parquet, one directory per `indicativo` value,
// replacing whatever is already stored at the target path.
allData.write
  .mode(SaveMode.Overwrite)
  .partitionBy("indicativo")
  .parquet("../data/dayParquet/")