# Task 2 DF

In [0]:
from pyspark.sql import SparkSession
from sparkmeasure import StageMetrics
from pyspark.sql import functions as F
from operator import add
import operator
from functools import reduce
from datetime import datetime
import pyspark.sql.types as types
from pyspark.sql.types import DateType, LongType
from pyspark.sql.functions import col, avg, min, max, round, count as _count


# Spark session used by the notebook (created once, or reused if it exists).
spark = (
    SparkSession.builder
    .appName("task2")
    .getOrCreate()
)

# Pieces used to build the input CSV paths passed to task_2_df.
dbfs_fileStore_prefix = "/FileStore/tables"
prefix = "ontimeperformance"
size = "small"

# Year passed to task_2_df to select flights.
year = 2000

In [0]:
def clean_column_names(df):
    """Return ``df`` with whitespace and dots stripped from every column name.

    Each name in ``df.columns`` has all whitespace (leading, trailing and
    internal) removed and every ``.`` deleted; the cleaned names are applied
    positionally through ``df.toDF``.
    """
    # str.split() with no argument discards all runs of whitespace, including
    # any at the ends, so joining the pieces removes whitespace everywhere.
    cleaned = [
        "".join(original.split()).replace(".", "")
        for original in df.columns
    ]
    return df.toDF(*cleaned)

In [0]:
def compute_delay(time_departed, time_scheduled):
    """Return the difference in minutes between two HHMM clock values.

    Both arguments are treated as 24-hour clock readings written as up to four
    digits (e.g. ``15`` means 00:15, ``1530`` means 15:30). The result is
    ``time_departed - time_scheduled`` in minutes, wrapped across midnight on
    the assumption that no flight is more than 12 hours late or early.

    Args:
        time_departed: Clock value the flight left at, or ``None``.
        time_scheduled: Clock value the flight was planned for, or ``None``.

    Returns:
        The signed delay in minutes in the range [-720, 720], or ``None`` when
        either input is ``None`` (a null passed in by Spark), so a missing
        time produces a null delay instead of failing the whole job.
    """
    # Null values reach a Python UDF as None; str(None) is not a clock value.
    if time_departed is None or time_scheduled is None:
        return None

    def to_minutes(clock_value):
        # Left-pad to four digits (e.g. 15 -> "0015"), then split HH / MM.
        digits = str(clock_value).zfill(4)
        return int(digits[:-2]) * 60 + int(digits[-2:])

    delay = to_minutes(time_departed) - to_minutes(time_scheduled)

    # More than 12 hours "late": the departure was really before midnight
    # relative to the schedule, so shift back by one day (1440 minutes).
    if delay > 720:
        return delay - 1440
    # More than 12 hours "early": the departure rolled over past midnight.
    if delay < -720:
        return delay + 1440
    return delay
    
def task_2_df(spark_session, flights_path, airlines_path, year):
    """Show and return per-airline delay statistics for US carriers in ``year``.

    Reads the flights and airlines CSV files (header row, inferred schema),
    cleans their column names, inner-joins them on ``carrier_code``, keeps rows
    whose ``country`` is "United States" and whose ``flight_date`` lies within
    ``year``, computes a ``delay`` column with ``compute_delay`` and aggregates
    the rows with a positive delay by airline ``name``.

    Args:
        spark_session: SparkSession used to read both CSV files.
        flights_path: Path of the flights CSV.
        airlines_path: Path of the airlines CSV.
        year: Year used to build the ``flight_date`` range filter.

    Returns:
        The aggregated DataFrame with columns ``name``, ``num_delays``,
        ``average_delay`` (rounded to 2 decimals), ``min_delay`` and
        ``max_delay``, ordered by ``name``. The same result is printed with
        ``show()``.
    """
    def read_csv(path):
        # Use the session handed in by the caller rather than a global one.
        return (
            spark_session.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(path)
        )

    flights = clean_column_names(read_csv(flights_path))
    airlines = clean_column_names(read_csv(airlines_path))
    joined = airlines.join(
        flights, airlines.carrier_code == flights.carrier_code, how="inner"
    )

    # Keep US airlines only.
    usa = joined.filter(F.col("country") == "United States")

    # Cast flight_date to DateType. The column must be kept (not dropped)
    # because the year filter below reads it.
    usa_dated = usa.withColumn("flight_date", F.col("flight_date").cast(DateType()))

    # Keep flights inside the requested year.
    usa_year = usa_dated.filter(
        F.col("flight_date").between(f"{year}-01-01", f"{year}-12-31")
    )

    # Rows with a missing time cannot yield a delay; drop them before the UDF
    # so the Python function never receives a null.
    with_times = usa_year.filter(
        F.col("scheduled_depature_time").isNotNull()
        & F.col("scheduled_arrival_time").isNotNull()
    )

    compute_delay_udf = F.udf(compute_delay, types.IntegerType())

    # NOTE(review): the delay is computed from two *scheduled* columns
    # (departure vs. arrival). compute_delay's parameters are named
    # (time_departed, time_scheduled), so an actual-departure column was
    # presumably intended here -- confirm against the flights schema.
    with_delay = with_times.withColumn(
        "delay",
        compute_delay_udf(
            F.col("scheduled_depature_time"), F.col("scheduled_arrival_time")
        ),
    )

    # Only flights that were actually delayed.
    delayed = with_delay.filter(F.col("delay") > 0)

    output_df = (
        delayed.groupBy("name")
        .agg(
            F.count("name").alias("num_delays"),
            F.round(F.avg("delay"), 2).alias("average_delay"),
            F.min("delay").alias("min_delay"),
            F.max("delay").alias("max_delay"),
        )
        .orderBy("name")
    )

    # show() returns None, so it must not be chained into the assignment
    # above; print the table, then hand the DataFrame back to the caller
    # (e.g. for writing it out with output_df.write).
    output_df.show()
    return output_df

In [0]:
# Run Task 2 on the flights/airlines CSV files selected by the settings above.
task_2_df(
    spark,
    f"{dbfs_fileStore_prefix}/{prefix}_flights_{size}.csv",
    f"{dbfs_fileStore_prefix}/{prefix}_airlines.csv",
    year,
)