# Intro

https://stackoverflow.com/questions/39926411/provide-schema-while-reading-csv-file-as-a-dataframe-in-scala-spark

In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import java.time.LocalTime
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types._

// import spark.implicits._

import org.apache.spark.sql.SparkSession

val spark = SparkSession
  .builder()
  .appName("Spark SQL basic example")
  .config("spark.some.config.option", "some-value")
  .getOrCreate()

import spark.sqlContext.implicits._

println("Scala language: "+util.Properties.versionString)

// spark.sparkContext.version
spark.version

Scala language: version 2.12.18


spark = org.apache.spark.sql.SparkSession@6bd8170


3.3.2

---

# Flights

In [3]:
val filePath = "gs://dataset-flight/Flights/"
val df_flights_raw = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv(filePath)

println(df_flights_raw.count())
df_flights_raw.show(3)

18286055
+-------------------+---------------------+-----------------+-----------------+---------------+------------+-------------+---------+--------+----------------+-------------+---------+----+
|            FL_DATE|OP_CARRIER_AIRLINE_ID|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|DEST_AIRPORT_ID|CRS_DEP_TIME|ARR_DELAY_NEW|CANCELLED|DIVERTED|CRS_ELAPSED_TIME|WEATHER_DELAY|NAS_DELAY|_c12|
+-------------------+---------------------+-----------------+-----------------+---------------+------------+-------------+---------+--------+----------------+-------------+---------+----+
|2013-07-01 00:00:00|                20363|             3407|            11433|          13342|        1040|          0.0|      0.0|     0.0|            79.0|         null|     null|null|
|2013-07-01 00:00:00|                20363|             3409|            11433|          12266|        1227|          0.0|      0.0|     0.0|           175.0|         null|     null|null|
|2013-07-01 00:00:00|                20363|        

filePath = gs://dataset-flight/Flights/
df_flights_raw = [FL_DATE: timestamp, OP_CARRIER_AIRLINE_ID: int ... 11 more fields]


[FL_DATE: timestamp, OP_CARRIER_AIRLINE_ID: int ... 11 more fields]

486133 -> 18286055 pour toutes les csv

preprocessing

In [4]:
val delayThreshold = 30

val df_flights_clean = df_flights_raw
    .drop("_c12") // 478k
    .where(col("CANCELLED") === 0).drop("CANCELLED")
    .where(col("DIVERTED") === 0).drop("DIVERTED") // 478k
    .na.fill(0, Seq("NAS_DELAY"))
    .withColumn("NAS_DELAY", col("NAS_DELAY").cast("int"))
    .dropDuplicates(Seq("FL_DATE", "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM", "ORIGIN_AIRPORT_ID")) // 437k
    .withColumn("CRS_ELAPSED_TIME", col("CRS_ELAPSED_TIME").cast("int")) // float -> int
    .withColumn("ARR_DELAY_NEW", col("ARR_DELAY_NEW").cast("int"))
    .where(!(col("ARR_DELAY_NEW") < -45)) // n'arrive jamais 45min en avance
    .where(col("ARR_DELAY_NEW") < (5.3 * 60)) // 437k
    .withColumn("IS_DELAYED", when(col("ARR_DELAY_NEW") >= delayThreshold and (col("NAS_DELAY") >= delayThreshold or col("WEATHER_DELAY") > 0), 1).otherwise(0))

println(df_flights_clean.count())

df_flights_clean.show(3)

17920326
+-------------------+---------------------+-----------------+-----------------+---------------+------------+-------------+----------------+-------------+---------+----------+
|            FL_DATE|OP_CARRIER_AIRLINE_ID|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|DEST_AIRPORT_ID|CRS_DEP_TIME|ARR_DELAY_NEW|CRS_ELAPSED_TIME|WEATHER_DELAY|NAS_DELAY|IS_DELAYED|
+-------------------+---------------------+-----------------+-----------------+---------------+------------+-------------+----------------+-------------+---------+----------+
|2013-03-01 00:00:00|                20363|             4114|            11433|          11423|        1625|            0|             116|         null|        0|         0|
|2013-03-01 00:00:00|                20363|             4255|            11042|          11433|        1015|            0|              65|         null|        0|         0|
|2013-03-01 00:00:00|                19790|             1878|            12217|          10397|         630|        

delayThreshold = 30
df_flights_clean = [FL_DATE: timestamp, OP_CARRIER_AIRLINE_ID: int ... 9 more fields]


[FL_DATE: timestamp, OP_CARRIER_AIRLINE_ID: int ... 9 more fields]

In [5]:
df_flights_clean.select("IS_DELAYED")
    .summary("count", "approx_count_distinct", "mean", "stddev", "min", "25%", "50%", "75%", "95%", "max").show()

+--------------------+--------------------+
|             summary|          IS_DELAYED|
+--------------------+--------------------+
|               count|            17920326|
|approx_count_dist...|                   2|
|                mean|0.030714340799380548|
|              stddev| 0.17254266640438876|
|                 min|                   0|
|                 25%|                   0|
|                 50%|                   0|
|                 75%|                   0|
|                 95%|                   0|
|                 max|                   1|
+--------------------+--------------------+



In [6]:
df_flights_raw.select("WEATHER_DELAY")
    .summary("count", "approx_count_distinct", "mean", "stddev", "min", "25%", "50%", "75%", "95%", "max").show()

+--------------------+-----------------+
|             summary|    WEATHER_DELAY|
+--------------------+-----------------+
|               count|          3524963|
|approx_count_dist...|              836|
|                mean|2.338359863635448|
|              stddev|17.56329940409019|
|                 min|              0.0|
|                 25%|              0.0|
|                 50%|              0.0|
|                 75%|              0.0|
|                 95%|              3.0|
|                 max|           1615.0|
+--------------------+-----------------+



In [7]:
df_flights_clean.printSchema()

root
 |-- FL_DATE: timestamp (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ARR_DELAY_NEW: integer (nullable = true)
 |-- CRS_ELAPSED_TIME: integer (nullable = true)
 |-- WEATHER_DELAY: double (nullable = true)
 |-- NAS_DELAY: integer (nullable = true)
 |-- IS_DELAYED: integer (nullable = false)



In [6]:
df_flights_clean.printSchema()

root
 |-- FL_DATE: timestamp (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- ARR_DELAY_NEW: integer (nullable = true)
 |-- CRS_ELAPSED_TIME: integer (nullable = true)
 |-- WEATHER_DELAY: double (nullable = true)
 |-- NAS_DELAY: integer (nullable = true)



nas_delay = la grande majorité à NULL sinon 0:

In [8]:
df_flights_raw.select("NAS_DELAY")
    .summary("count", "approx_count_distinct", "mean", "stddev", "min", "25%", "50%", "75%", "max").show()

+--------------------+------------------+
|             summary|         NAS_DELAY|
+--------------------+------------------+
|               count|           3524963|
|approx_count_dist...|               714|
|                mean|13.269710065041817|
|              stddev| 26.91482091789664|
|                 min|               0.0|
|                 25%|               0.0|
|                 50%|               3.0|
|                 75%|              17.0|
|                 max|            1439.0|
+--------------------+------------------+



---

une fois les vols filtrés (cancelled, diverted...), statistiques en heures:

In [8]:
val df_temp = df_flights_raw
    .drop("_c12") // 478k
    .where(col("CANCELLED") === 0).drop("CANCELLED")
    .where(col("DIVERTED") === 0).drop("DIVERTED") // 478k
    .where(col("NAS_DELAY").isNull || col("NAS_DELAY") === 0).drop("NAS_DELAY") // 437k
    .dropDuplicates(Seq("FL_DATE", "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM", "ORIGIN_AIRPORT_ID")) // 437k
    .withColumn("CRS_ELAPSED_TIME", col("CRS_ELAPSED_TIME").cast("int")) // float -> int
    .withColumn("ARR_DELAY_NEW", col("ARR_DELAY_NEW").cast("int"))
    .where(!(col("ARR_DELAY_NEW") < -45)) // n'arrive jamais 45min en avance

df_temp
    .select("CRS_DEP_TIME", "CRS_ELAPSED_TIME", "ARR_DELAY_NEW", "WEATHER_DELAY")
    .summary("mean", "stddev", "min", "25%", "50%", "75%", "90%", "95%", "max")
    .withColumn("CRS_DEP_TIME", round(col("CRS_DEP_TIME") / 60, 1))
    .withColumn("CRS_ELAPSED_TIME", round(col("CRS_ELAPSED_TIME") / 60, 1))
    .withColumn("ARR_DELAY_NEW", round(col("ARR_DELAY_NEW") / 60, 1))
    .withColumn("WEATHER_DELAY", round(col("WEATHER_DELAY") / 60, 1))
    .show()

+-------+------------+----------------+-------------+-------------+
|summary|CRS_DEP_TIME|CRS_ELAPSED_TIME|ARR_DELAY_NEW|WEATHER_DELAY|
+-------+------------+----------------+-------------+-------------+
|   mean|        22.0|             2.3|          0.1|          0.0|
| stddev|         7.7|             1.2|          0.4|          0.4|
|    min|         0.1|             0.4|          0.0|          0.0|
|    25%|        15.4|             1.4|          0.0|          0.0|
|    50%|        21.9|             2.0|          0.0|          0.0|
|    75%|        28.6|             2.8|          0.0|          0.0|
|    90%|        32.4|             3.9|          0.2|          0.0|
|    95%|        34.3|             4.8|          0.4|          0.2|
|    max|        39.3|            11.1|         24.2|         19.7|
+-------+------------+----------------+-------------+-------------+



df_temp = [FL_DATE: timestamp, OP_CARRIER_AIRLINE_ID: int ... 7 more fields]


[FL_DATE: timestamp, OP_CARRIER_AIRLINE_ID: int ... 7 more fields]

- 25% des vols sont programmés entre 00H00 et 15H00
- 25% des vols sont programmés entre 15H00 et 22H00
- 25% des vols sont programmés entre 22H00 et 04H00 du matin le lendemain

concernant la durée de vols:
- 5% des vols ont une durée prévisionelle de plus de 5h, le maximum étant une durée prévisionnelle de 11h.

pour ce qui est des retards:
- 75% des vols sont a l'heure (ARR_DELAY à zero)
- la moyenne des retard est de 6min (0.1h)
- 90% des vols ont moins de 12 min de retard

concernant la ponctualité au départ ou a l'arrivée: celles ci sont de l'ordre de 80%:
https://www.lefigaro.fr/voyages/conseils/ponctualite-des-avions-les-aeroports-francais-sont-toujours-a-la-traine-20230728

sachant que la durée des vols ne peut pas dépasser de plus de 3 ou 4h (pour les vols courts) et 5 ou 6h (pour les vols longs) pour des questions de réserve de carburant limitées, les ARR_DELAY_NEW de plus de 5h doivent forcément soit être du à des départs repoussés ou des valeurs erronées. Or nous ne connaissons pas de combien de temps le décollage a été repoussés, mais ils y en a forcément vu que des CRS_DEP_TIME dépassent les 24h ! __Supprimons les arrival delay et weather delay trop importants de respectivement plus de 2.4 et 2.2h. --> OUTLIERS __

In [11]:
df_temp
    .select("ARR_DELAY_NEW", "WEATHER_DELAY")
    .summary("90%", "95%", "97%", "99%", "99.5%", "99.95%", "max")
    .withColumn("ARR_DELAY_NEW", round(col("ARR_DELAY_NEW") / 60, 1))
    .withColumn("WEATHER_DELAY", round(col("WEATHER_DELAY") / 60, 1))
    .show()

+-------+-------------+-------------+
|summary|ARR_DELAY_NEW|WEATHER_DELAY|
+-------+-------------+-------------+
|    90%|          0.2|          0.0|
|    95%|          0.4|          0.2|
|    97%|          0.8|          0.4|
|    99%|          1.7|          1.5|
|  99.5%|          2.4|          2.2|
| 99.95%|          5.3|          4.4|
|    max|         24.2|         19.7|
+-------+-------------+-------------+



---

# Weather

In [3]:
val weatherPath = "gs://dataset-flight/Weather/201201hourly.txt"

val weatherSchema = StructType(Array(    
    StructField("WBAN", IntegerType, true),
    StructField("Date", IntegerType, true),
    StructField("Time", IntegerType, true),
    StructField("StationType", IntegerType, true),
    StructField("SkyCondition", StringType, true),
    StructField("SkyConditionFlag", StringType, true),
    StructField("Visibility", StringType, true), // enlever espace puis caster en int
    StructField("VisibilityFlag", StringType, true),
    StructField("WeatherType", StringType, true),
    StructField("WeatherTypeFlag", StringType, true),
    StructField("DryBulbFarenheit", IntegerType, true),
    StructField("DryBulbFarenheitFlag", StringType, true),
    StructField("DryBulbCelsius", FloatType, true),
    StructField("DryBulbCelsiusFlag", StringType, true),
    StructField("WetBulbFarenheit", IntegerType, true),
    StructField("WetBulbFarenheitFlag", StringType, true),
    StructField("WetBulbCelsius", FloatType, true),
    StructField("WetBulbCelsiusFlag", StringType, true),
    StructField("DewPointFarenheit", IntegerType, true),
    StructField("DewPointFarenheitFlag", StringType, true),
    StructField("DewPointCelsius", FloatType, true),
    StructField("DewPointCelsiusFlag", StringType, true),
    StructField("RelativeHumidity", StringType, true), // a convertir en int après space
    StructField("RelativeHumidityFlag", StringType, true),
    StructField("WindSpeed", IntegerType, true),
    StructField("WindSpeedFlag", StringType, true),
    StructField("WindDirection", IntegerType, true),
    StructField("WindDirectionFlag", StringType, true),
    StructField("ValueForWindCharacter", IntegerType, true),
    StructField("ValueForWindCharacterFlag", StringType, true),
    StructField("StationPressure", FloatType, true),
    StructField("StationPressureFlag", StringType, true),
    StructField("PressureTendency", IntegerType, true),
    StructField("PressureTendencyFlag", StringType, true),
    StructField("PressureChange", IntegerType, true),
    StructField("PressureChangeFlag", StringType, true),
    StructField("SeaLevelPressure", FloatType, true),
    StructField("SeaLevelPressureFlag", StringType, true),
    StructField("RecordType", StringType, true),
    StructField("RecordTypeFlag", StringType, true),
    StructField("HourlyPrecip", FloatType, true),
    StructField("HourlyPrecipFlag", StringType, true),
    StructField("Altimeter", FloatType, true),
    StructField("AltimeterFlag", StringType, true),
))


val df_weather_raw = spark.read.format("csv")
    .option("header", "true")
    .option("delimited", ",")
    .schema(weatherSchema)
    .option("treatEmptyValuesAsNulls","true")
    .option("nullValue", null)
    .option("emptyValue", null)
    .load(weatherPath)


println(df_weather_raw.count())
println(df_weather_raw.columns.size)
// df_weather_raw.printSchema()

4192912
44


weatherPath = gs://dataset-flight/Weather/201201hourly.txt
weatherSchema = StructType(StructField(WBAN,IntegerType,true),StructField(Date,IntegerType,true),StructField(Time,IntegerType,true),StructField(StationType,IntegerType,true),StructField(SkyCondition,StringType,true),StructField(SkyConditionFlag,StringType,true),StructField(Visibility,StringType,true),StructField(VisibilityFlag,StringType,true),StructField(WeatherType,StringType,true),StructField(WeatherTypeFlag,StringType,true),StructField(DryBulbFarenheit,IntegerType,true),StructField(DryBulbFarenheitFlag,StringType,true),StructField(DryBulbCelsius,FloatType,true),StructField(DryBulbCelsiusFlag,StringType,true),StructField(WetBulbFarenheit,IntegerType,true),StructField(WetBulbFaren...


StructType(StructField(WBAN,IntegerType,true),StructField(Date,IntegerType,true),StructField(Time,IntegerType,true),StructField(StationType,IntegerType,true),StructField(SkyCondition,StringType,true),StructField(SkyConditionFlag,StringType,true),StructField(Visibility,StringType,true),StructField(VisibilityFlag,StringType,true),StructField(WeatherType,StringType,true),StructField(WeatherTypeFlag,StringType,true),StructField(DryBulbFarenheit,IntegerType,true),StructField(DryBulbFarenheitFlag,StringType,true),StructField(DryBulbCelsius,FloatType,true),StructField(DryBulbCelsiusFlag,StringType,true),StructField(WetBulbFarenheit,IntegerType,true),StructField(WetBulbFaren...

In [None]:
    // .drop("_c12") // 478k
    // .where(col("CANCELLED") === 0).drop("CANCELLED")
    // .where(col("DIVERTED") === 0).drop("DIVERTED") // 478k
    // .where(col("NAS_DELAY").isNull || col("NAS_DELAY") === 0).drop("NAS_DELAY") // 437k
    // .dropDuplicates(Seq("FL_DATE", "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM", "ORIGIN_AIRPORT_ID")) // 437k
    // .withColumn("CRS_ELAPSED_TIME", col("CRS_ELAPSED_TIME").cast("int")) // float -> int
    // .withColumn("ARR_DELAY_NEW", col("ARR_DELAY_NEW").cast("int"))
    // .where(!(col("ARR_DELAY_NEW") < -45)) // n'arrive jamais 45min en avance
    // .where(col("ARR_DELAY_NEW") < (5.3 * 60)) // 437k
    // // .where(col("WEATHER_DELAY") < (4.4 * 60)) // supprimer trop de ligne bizarrement ??

In [4]:
val df_weather_clean = df_weather_raw
    .drop(df_weather_raw.columns.filter(colName => colName.endsWith("Flag")) : _*)
    .withColumn("Visibility", trim(col("Visibility")).cast("int"))
    .withColumn("RelativeHumidity", trim(col("RelativeHumidity")).cast("int"))


println(df_weather_clean.count())
println(df_weather_clean.columns.size)

4192912
24


df_weather_clean = [WBAN: int, Date: int ... 22 more fields]


[WBAN: int, Date: int ... 22 more fields]

In [62]:
println("Visibility: "+df_weather_clean.select("Visibility").distinct().take(10).mkString(", "))
println("RelativeHumidity: "+df_weather_clean.select("RelativeHumidity").distinct().take(10).mkString(", "))

// for (columnName <- df_weather_raw.columns) {
//   println("10 of the disctinct values for " + columnName + ": " + df_weather_raw.select(columnName).distinct().take(10).mkString(", "))
// }

Visibility: [12], [null], [1], [13], [6], [3], [20], [40], [5], [19]
RelativeHumidity: [31], [85], [65], [53], [78], [34], [81], [28], [76], [27]


In [6]:
for (columnName <- df_weather_clean.columns) {
  println("10 of the disctinct values for " + columnName + ": " + df_weather_clean.select(columnName).distinct().take(10).mkString(", "))
}

10 of the disctinct values for WBAN: [3749], [3918], [3997], [4935], [3179], [4929], [3704], [3761], [3089], [3098]
10 of the disctinct values for Date: [20120122], [20120127], [20120109], [20120103], [20120112], [20120123], [20120114], [20120108], [20120129], [20120131]
10 of the disctinct values for Time: [833], [1645], [1238], [2122], [148], [1959], [1829], [1342], [2142], [2235]
10 of the disctinct values for StationType: [12], [5], [15], [11], [0], [6]
10 of the disctinct values for SkyCondition: [BKN120], [BKN050], [BKN013 BKN023 OVC030], [FEW090 SCT120], [BKN025 BKN031 OVC037], [FEW014 BKN023 OVC110], [SCT028 BKN047 OVC070], [FEW040 BKN120], [FEW022 OVC030], [BKN001 BKN006 OVC015]
10 of the disctinct values for Visibility: [85], [12], [null], [1], [13], [6], [3], [20], [40], [5]
10 of the disctinct values for WeatherType: [DZ], [-RASN BR], [UP FZFG], [+RA FG], [-SHRA BR], [VCTS -FZRA BR], [-RA SQ], [TSDZ], [-SNPL], [-FZRA SNPL BR]
10 of the disctinct values for DryBulbFarenheit:

In [8]:
val nbRows = df_weather_clean.count()

for (columnName <- df_weather_clean.columns) {
    println("% nulls " + columnName + ": " + (df_weather_clean.filter(col(columnName).isNull || col(columnName) === "" || col(columnName) === " " || col(columnName) === "null").count() * 100.0 / nbRows)
}

nb nulls WBAN: 0
nb nulls Date: 0
nb nulls Time: 0
nb nulls StationType: 0
nb nulls SkyCondition: 0
nb nulls Visibility: 1831790
nb nulls WeatherType: 3728635
nb nulls DryBulbFarenheit: 138433
nb nulls DryBulbCelsius: 24007
nb nulls WetBulbFarenheit: 1895363
nb nulls WetBulbCelsius: 1895363
nb nulls DewPointFarenheit: 1933020
nb nulls DewPointCelsius: 1858712
nb nulls RelativeHumidity: 1895363
nb nulls WindSpeed: 3355773
nb nulls WindDirection: 1803373
nb nulls ValueForWindCharacter: 3728936
nb nulls StationPressure: 1833248
nb nulls PressureTendency: 3904451
nb nulls PressureChange: 3904451
nb nulls SeaLevelPressure: 3374492
nb nulls RecordType: 0
nb nulls HourlyPrecip: 4096684
nb nulls Altimeter: 1805797


In [10]:
val nbRows = df_weather_clean.count()

for (columnName <- df_weather_clean.columns) {
    println("% nulls " + columnName + ": " + (df_weather_clean.filter(col(columnName).isNull || col(columnName) === "" || col(columnName) === " " || col(columnName) === "null").count() * 100.0) / nbRows)
}

% nulls WBAN: 0.0
% nulls Date: 0.0
% nulls Time: 0.0
% nulls StationType: 0.0
% nulls SkyCondition: 0.0
% nulls Visibility: 43.68777594187524
% nulls WeatherType: 88.92709887543549
% nulls DryBulbFarenheit: 3.3015956452222226
% nulls DryBulbCelsius: 0.572561503794976
% nulls WetBulbFarenheit: 45.20397756976536
% nulls WetBulbCelsius: 45.20397756976536
% nulls DewPointFarenheit: 46.102088476934405
% nulls DewPointCelsius: 44.32985953437611
% nulls RelativeHumidity: 45.20397756976536
% nulls WindSpeed: 80.03442476255165
% nulls WindDirection: 43.01003693852864
% nulls ValueForWindCharacter: 88.93427765715093
% nulls StationPressure: 43.72254891111476
% nulls PressureTendency: 93.12027058998615
% nulls PressureChange: 93.12027058998615
% nulls SeaLevelPressure: 80.48086866597725
% nulls RecordType: 0.0
% nulls HourlyPrecip: 97.70498403019191
% nulls Altimeter: 43.06784878862232


nbRows = 4192912


4192912

---

# Backup

In [None]:
import spark.sqlContext.implicits._
import org.apache.spark.sql.functions._
val test = Seq(("2019-01-23"),("2019-06-24"),("2019-09-20")).toDF("date")

test.show()

import java.sql.Timestamp

val df = Seq(
    ("notebook",    Timestamp.valueOf("2019-01-29 12:00:00"), 2),
    ("notebook",    Timestamp.valueOf("2019-01-01 00:00:00"), 3),
    ("small_phone", Timestamp.valueOf("2019-01-15 23:00:00"), 4),
    ("small_phone", Timestamp.valueOf("2019-01-01 09:00:00"), 5)
).toDF("device", "purchase_time", "minutes").sort("device","purchase_time")

df.show()

df.withColumn("new", to_timestamp(col("purchase_time"))).show()

val testbis = test.withColumn("date", to_timestamp(col("date"), "yyyy-MM-dd")) // HH:mm:ss
testbis.show()

testbis.printSchema()

import org.apache.spark.sql.functions.{add_months, date_add, _}


testbis.select(col("date"), add_months(col("date"), 3).as("add_months")).show()

import spark.sqlContext.implicits._
import org.apache.spark.sql.functions._

testbis.add_months(lit(3)).show()


df.select(col("date"),
    add_months(col("date"),3).as("add_months"), // provide +ve value to add months
    add_months(col("date"),-3).as("sub_months"), //provide -ve value to subtract months
    date_add(col("date"),4).as("date_add"), // to add day
    date_sub(col("date"),4).as("date_sub") //to substract day
  ).show()

import org.apache.spark.sql.functions._

df_flights_clean
    .withColumn("FL_DATE_TIME", col("FL_DATE")  + (col("CRS_DEP_TIME") * 60))
// df = df.withColumn("timestamp", F.expr("from_unixtime(unix_timestamp(concat_ws(' ', date, time)) + (`additional_time(in mins)` * 60))"))

    .show()

spark.sql("select input_timestamp, " +
    "cast(input_timestamp as TIMESTAMP) + INTERVAL 2 hours as added_hours," +
    "cast(input_timestamp as TIMESTAMP) + INTERVAL 5 minutes as added_minutes," +
    "cast(input_timestamp as TIMESTAMP) + INTERVAL 55 seconds as added_seconds from AddTimeExample"
    )