# Intro

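Reference for supplying an explicit schema when reading a CSV file as a DataFrame (the approach used for the weather data below): https://stackoverflow.com/questions/39926411/provide-schema-while-reading-csv-file-as-a-dataframe-in-scala-spark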

In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import java.time.LocalTime
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types._

// import spark.implicits._

import org.apache.spark.sql.SparkSession

// Create the SparkSession used by every cell of this notebook, or pick up
// the one that already exists.
val spark = SparkSession.builder()
  .appName("Spark SQL basic example")
  .config("spark.some.config.option", "some-value")
  .getOrCreate()

// Implicit conversions/encoders tied to the session created above.
import spark.sqlContext.implicits._

// Print the Scala version the kernel is running.
println(s"Scala language: ${util.Properties.versionString}")

// Cloud Storage bucket the BigQuery connector uses for its temporary
// export data; registered on the session configuration.
val bucket = "dataproc-temp-us-central1-1044206227610-i54vpwyj"
spark.conf.set("temporaryGcsBucket", bucket)

// Cell result: the Spark version (alternative: spark.sparkContext.version).
spark.version

Scala language: version 2.12.18


spark = org.apache.spark.sql.SparkSession@548aecd
bucket = dataproc-temp-us-central1-1044206227610-i54vpwyj


3.3.2

---

# Flights

In [30]:
// Folder holding the raw flight CSV files.
val filePath = "gs://dataset-flight/Flights/"

// Load the CSVs: the first line is a header and column types are inferred.
val df_flights_raw = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv(filePath)

[FL_DATE: timestamp, OP_CARRIER_AIRLINE_ID: int ... 11 more fields]

In [31]:
// Threshold applied to ARR_DELAY_NEW and NAS_DELAY when building the target
// (presumably minutes -- confirm against the dataset documentation).
val delayThreshold = 30

// Cleaned flights table. The pipeline is built in named stages; every
// intermediate frame stays local to this block.
val df_flights_clean = {
  // Columns converted to integers once missing values have been filled.
  val intColumns = Seq("NAS_DELAY", "WEATHER_DELAY", "CRS_ELAPSED_TIME", "ARR_DELAY_NEW")

  // Keep only rows that are neither cancelled nor diverted, drop the columns
  // that are no longer useful, and keep a single row per
  // (date, carrier, flight number, origin airport).
  val completedFlights = df_flights_raw
    .filter(col("CANCELLED") === 0 && col("DIVERTED") === 0)
    .drop("_c12", "CANCELLED", "DIVERTED")
    .dropDuplicates(Seq("FL_DATE", "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM", "ORIGIN_AIRPORT_ID"))

  // Date part of FL_DATE + CRS_DEP_TIME left-padded to 4 digits ("HHmm"),
  // parsed into one timestamp carrying hours and minutes.
  val flightTimestamp = to_timestamp(
    concat(
      split(col("FL_DATE"), " ").getItem(0),
      lit(" "),
      lpad(col("CRS_DEP_TIME"), 4, "0")
    ),
    "yyyy-MM-dd HHmm"
  )
  val withDateTime = completedFlights
    .withColumn("FL_DATETIME", flightTimestamp)
    .drop("FL_DATE", "CRS_DEP_TIME")

  // Missing NAS/weather delays become 0, then the numeric columns are cast to int.
  val delaysFilled = withDateTime.na.fill(0, Seq("NAS_DELAY", "WEATHER_DELAY"))
  val typed = intColumns.foldLeft(delaysFilled) { (frame, name) =>
    frame.withColumn(name, col(name).cast("int"))
  }

  // Discard rows whose arrival delay lies outside [-45, 5.3 * 60).
  val plausible = typed
    .filter(col("ARR_DELAY_NEW") >= -45)
    .filter(col("ARR_DELAY_NEW") < (5.3 * 60))

  // Target: 1 when the arrival delay reaches the threshold AND either the NAS
  // delay reaches the threshold or there is any weather delay; 0 otherwise.
  val nasOrWeather = col("NAS_DELAY") >= delayThreshold || col("WEATHER_DELAY") > 0
  val delayed = col("ARR_DELAY_NEW") >= delayThreshold && nasOrWeather
  plausible.withColumn("IS_DELAYED", when(delayed, 1).otherwise(0))
}

// Row count and a short preview of the cleaned table.
println(df_flights_clean.count())
df_flights_clean.show(3)

17920326
+---------------------+-----------------+-----------------+---------------+-------------+----------------+-------------+---------+-------------------+----------+
|OP_CARRIER_AIRLINE_ID|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|DEST_AIRPORT_ID|ARR_DELAY_NEW|CRS_ELAPSED_TIME|WEATHER_DELAY|NAS_DELAY|        FL_DATETIME|IS_DELAYED|
+---------------------+-----------------+-----------------+---------------+-------------+----------------+-------------+---------+-------------------+----------+
|                20366|             2543|            11298|          10728|            0|              70|            0|        0|2013-07-01 08:30:00|         0|
|                20366|             6046|            10693|          12266|            0|             121|            0|        0|2013-07-01 12:42:00|         0|
|                20436|              601|            15412|          11292|           42|             181|            0|       12|2013-07-01 18:04:00|         0|
+------------------

delayThreshold = 30
df_flights_clean = [OP_CARRIER_AIRLINE_ID: int, OP_CARRIER_FL_NUM: int ... 8 more fields]


[OP_CARRIER_AIRLINE_ID: int, OP_CARRIER_FL_NUM: int ... 8 more fields]

In [32]:
// Save the cleaned flights into the BigQuery table "silver.flights"
// through the "bigquery" data source.
df_flights_clean
  .write
  .format("bigquery")
  .option("table", "silver.flights")
  .save()

---

# Weather

In [2]:
// Folder holding the raw weather CSV files.
val weatherPath = "gs://dataset-flight/Weather/"

// Explicit schema for the weather CSVs: each measurement column is followed
// by its "...Flag" companion column.
//
// Columns that must be trimmed before they can be converted to integers
// (Visibility, RelativeHumidity, WindSpeed) are read as strings here; the
// cleaning step trims them and casts them to int. WindSpeed used to be
// declared as IntegerType, so the later trim had nothing to work on.
//
// NOTE(review): other IntegerType columns may carry the same padding and be
// read as null for the same reason -- check the raw files.
val weatherSchema = StructType(Array(
    StructField("WBAN", IntegerType, true),
    StructField("Date", IntegerType, true),
    StructField("Time", IntegerType, true),
    StructField("StationType", IntegerType, true),
    StructField("SkyCondition", StringType, true),
    StructField("SkyConditionFlag", StringType, true),
    StructField("Visibility", StringType, true), // strip spaces, then cast to int
    StructField("VisibilityFlag", StringType, true),
    StructField("WeatherType", StringType, true),
    StructField("WeatherTypeFlag", StringType, true),
    StructField("DryBulbFarenheit", IntegerType, true),
    StructField("DryBulbFarenheitFlag", StringType, true),
    StructField("DryBulbCelsius", FloatType, true),
    StructField("DryBulbCelsiusFlag", StringType, true),
    StructField("WetBulbFarenheit", IntegerType, true),
    StructField("WetBulbFarenheitFlag", StringType, true),
    StructField("WetBulbCelsius", FloatType, true),
    StructField("WetBulbCelsiusFlag", StringType, true),
    StructField("DewPointFarenheit", IntegerType, true),
    StructField("DewPointFarenheitFlag", StringType, true),
    StructField("DewPointCelsius", FloatType, true),
    StructField("DewPointCelsiusFlag", StringType, true),
    StructField("RelativeHumidity", StringType, true), // strip spaces, then cast to int
    StructField("RelativeHumidityFlag", StringType, true),
    StructField("WindSpeed", StringType, true), // strip spaces, then cast to int
    StructField("WindSpeedFlag", StringType, true),
    StructField("WindDirection", IntegerType, true),
    StructField("WindDirectionFlag", StringType, true),
    StructField("ValueForWindCharacter", IntegerType, true),
    StructField("ValueForWindCharacterFlag", StringType, true),
    StructField("StationPressure", FloatType, true),
    StructField("StationPressureFlag", StringType, true),
    StructField("PressureTendency", IntegerType, true),
    StructField("PressureTendencyFlag", StringType, true),
    StructField("PressureChange", IntegerType, true),
    StructField("PressureChangeFlag", StringType, true),
    StructField("SeaLevelPressure", FloatType, true),
    StructField("SeaLevelPressureFlag", StringType, true),
    StructField("RecordType", StringType, true),
    StructField("RecordTypeFlag", StringType, true),
    StructField("HourlyPrecip", FloatType, true),
    StructField("HourlyPrecipFlag", StringType, true),
    StructField("Altimeter", FloatType, true),
    StructField("AltimeterFlag", StringType, true),
))


// Read the weather CSVs with the explicit schema instead of inferring types.
val df_weather_raw = spark.read.format("csv")
    .option("header", "true")
    // The key was previously misspelled "delimited", which is not a CSV
    // option; "delimiter" is the recognized key for the field separator.
    .option("delimiter", ",")
    .schema(weatherSchema)
    // NOTE(review): "treatEmptyValuesAsNulls" does not appear among Spark's
    // built-in CSV options (it looks like a legacy spark-csv setting) and is
    // probably ignored -- confirm before relying on it.
    .option("treatEmptyValuesAsNulls","true")
    // NOTE(review): a null option value is unusual; kept as-is so that the
    // handling of empty fields is unchanged -- confirm the intent.
    .option("nullValue", null)
    .option("emptyValue", null)
    .load(weatherPath)

weatherPath = gs://dataset-flight/Weather/
weatherSchema = StructType(StructField(WBAN,IntegerType,true),StructField(Date,IntegerType,true),StructField(Time,IntegerType,true),StructField(StationType,IntegerType,true),StructField(SkyCondition,StringType,true),StructField(SkyConditionFlag,StringType,true),StructField(Visibility,StringType,true),StructField(VisibilityFlag,StringType,true),StructField(WeatherType,StringType,true),StructField(WeatherTypeFlag,StringType,true),StructField(DryBulbFarenheit,IntegerType,true),StructField(DryBulbFarenheitFlag,StringType,true),StructField(DryBulbCelsius,FloatType,true),StructField(DryBulbCelsiusFlag,StringType,true),StructField(WetBulbFarenheit,IntegerType,true),StructField(WetBulbFarenheitFlag,StringT...


StructType(StructField(WBAN,IntegerType,true),StructField(Date,IntegerType,true),StructField(Time,IntegerType,true),StructField(StationType,IntegerType,true),StructField(SkyCondition,StringType,true),StructField(SkyConditionFlag,StringType,true),StructField(Visibility,StringType,true),StructField(VisibilityFlag,StringType,true),StructField(WeatherType,StringType,true),StructField(WeatherTypeFlag,StringType,true),StructField(DryBulbFarenheit,IntegerType,true),StructField(DryBulbFarenheitFlag,StringType,true),StructField(DryBulbCelsius,FloatType,true),StructField(DryBulbCelsiusFlag,StringType,true),StructField(WetBulbFarenheit,IntegerType,true),StructField(WetBulbFarenheitFlag,StringT...

In [3]:
// Size of the raw weather frame: number of rows, then number of columns.
println(df_weather_raw.count())
println(df_weather_raw.columns.length)

32631312
44


In [4]:
// Cleaned weather table: every column whose name ends with "Flag" is
// removed, and Visibility, WindSpeed and RelativeHumidity are trimmed and
// cast to int.
val df_weather_clean = {
  val flagColumns = df_weather_raw.columns.filter(_.endsWith("Flag"))
  val withoutFlags = df_weather_raw.drop(flagColumns: _*)

  Seq("Visibility", "WindSpeed", "RelativeHumidity").foldLeft(withoutFlags) { (frame, name) =>
    frame.withColumn(name, trim(col(name)).cast("int"))
  }
}

// Size of the cleaned frame: number of rows, then number of columns.
println(df_weather_clean.count())
println(df_weather_clean.columns.length)

32631312
24


df_weather_clean = [WBAN: int, Date: int ... 22 more fields]


[WBAN: int, Date: int ... 22 more fields]

In [5]:
// Save the cleaned weather data into the BigQuery table "silver.weather"
// through the "bigquery" data source.
df_weather_clean
  .write
  .format("bigquery")
  .option("table", "silver.weather")
  .save()

In [11]:
// Preview 3 rows of the first half of the cleaned weather columns
// (station/date/time, sky, visibility and temperature fields).
df_weather_clean
  .select(
    "WBAN", "Date", "Time", "StationType",
    "SkyCondition", "Visibility", "WeatherType",
    "DryBulbFarenheit", "DryBulbCelsius",
    "WetBulbFarenheit", "WetBulbCelsius",
    "DewPointFarenheit", "DewPointCelsius"
  )
  .show(3)

+----+--------+----+-----------+------------+----------+-----------+----------------+--------------+----------------+--------------+-----------------+---------------+
|WBAN|    Date|Time|StationType|SkyCondition|Visibility|WeatherType|DryBulbFarenheit|DryBulbCelsius|WetBulbFarenheit|WetBulbCelsius|DewPointFarenheit|DewPointCelsius|
+----+--------+----+-----------+------------+----------+-----------+----------------+--------------+----------------+--------------+-----------------+---------------+
|3011|20120101|  15|          0|         CLR|        10|           |              23|          -5.0|              15|          -9.5|               -9|          -23.0|
|3011|20120101|  35|          0|         CLR|        10|           |              21|          -6.0|              14|         -10.2|               -9|          -23.0|
|3011|20120101|  55|          0|         CLR|        10|           |              21|          -6.0|              13|         -10.5|             null|          -25.0

In [12]:
// Preview 3 rows of the remaining cleaned weather columns
// (humidity, wind, pressure, record type, precipitation, altimeter).
df_weather_clean
  .select(
    "RelativeHumidity",
    "WindSpeed", "WindDirection", "ValueForWindCharacter",
    "StationPressure", "PressureTendency", "PressureChange", "SeaLevelPressure",
    "RecordType", "HourlyPrecip", "Altimeter"
  )
  .show(3)

+----------------+---------+-------------+---------------------+---------------+----------------+--------------+----------------+----------+------------+---------+
|RelativeHumidity|WindSpeed|WindDirection|ValueForWindCharacter|StationPressure|PressureTendency|PressureChange|SeaLevelPressure|RecordType|HourlyPrecip|Altimeter|
+----------------+---------+-------------+---------------------+---------------+----------------+--------------+----------------+----------+------------+---------+
|              24|     null|          120|                 null|           21.7|            null|          null|            null|        AA|        null|    30.43|
|              26|     null|          130|                 null|           21.7|            null|          null|            null|        AA|        null|    30.43|
|              21|     null|            0|                 null|          21.71|            null|          null|            null|        AA|        null|    30.44|
+---------------

---
# WBAN

In [13]:
// CSV file with the AirportID / WBAN / TimeZone columns.
val airportsPath = "gs://dataset-flight/wban_airport_timezone.csv"

// Load it with a header row and inferred column types.
val df_airports = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv(airportsPath)

// Quick look at the first rows.
df_airports.show(5)

+---------+-----+--------+
|AirportID| WBAN|TimeZone|
+---------+-----+--------+
|    10685|54831|      -6|
|    14871|24232|      -8|
|    10620|24033|      -7|
|    14747|24233|      -8|
|    11252|12834|      -5|
+---------+-----+--------+
only showing top 5 rows



airportsPath = gs://dataset-flight/wban_airport_timezone.csv
df_airports = [AirportID: int, WBAN: int ... 1 more field]


[AirportID: int, WBAN: int ... 1 more field]

In [14]:
// Print the column names, types and nullability of the airports frame.
df_airports.printSchema()

root
 |-- AirportID: integer (nullable = true)
 |-- WBAN: integer (nullable = true)
 |-- TimeZone: integer (nullable = true)

