In [0]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from sparkmeasure import StageMetrics
from pyspark.sql import functions as F
from operator import add
import operator
from pyspark.sql.functions import broadcast
from pyspark.sql.types import StringType
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType
from operator import add
import re

# Get (or create) the Spark session for this notebook; the application is named "task3".
spark = SparkSession.builder.appName("task3").getOrCreate()

# sparkMeasure stage-metrics collector bound to the session above.
stagemetrics = StageMetrics(spark)
# Directory prefix and file-name stem used to build the CSV paths passed to task_3_df.
dbfs_fileStore_prefix = "/FileStore/tables"
prefix = "ontimeperformance"
# NOTE(review): `size` is not referenced in the visible cells (each run cell
# hard-codes its dataset size in the path) — confirm before removing.
size = "small"

In [0]:
# https://stackoverflow.com/questions/55453101/pyspark-error-analysisexception-cannot-resolve-column-name
def clean_column_names(df):
  """Return ``df`` with every column renamed to a whitespace-free, dot-free name.

  All whitespace characters (leading, trailing and interior) and all ``.``
  characters are removed from each column name; column order is preserved.

  Args:
      df: Object exposing ``columns`` (iterable of str) and ``toDF(*names)``,
          e.g. a Spark DataFrame.

  Returns:
      Whatever ``df.toDF`` returns for the cleaned names.
  """
  # split() with no argument drops every run of whitespace, including the
  # leading/trailing ones, so joining the pieces removes all whitespace.
  cleaned = ["".join(raw.split()).replace('.', '') for raw in df.columns]
  return df.toDF(*cleaned)

In [0]:
def print_models_df(airlines):
    """Print one line per consecutive group of rows sharing the same first field.

    Each output line has the form ``<key> \\t [<a> <b>, <a> <b>, ...]`` where
    ``<key>`` is ``row[0]`` and every ``<a> <b>`` pair is ``row[1]`` and
    ``row[2]`` of one row in that group. When called from ``task_3_df`` the
    rows are (name, manufacturer, model_name, ...).

    Rows are grouped only while adjacent, so the input is expected to be
    ordered by ``row[0]``. Nothing is printed for an empty input.

    Args:
        airlines: Sequence of indexable rows whose first three fields are
            strings.
    """
    if len(airlines) == 0:
        return

    def _emit(key, models):
        # Same layout the original produced: key, tab padded by spaces, bracketed list.
        print(key + " \t " + "[" + ", ".join(models) + "]")

    current_key = airlines[0][0]
    models = []
    for row in airlines:
        if row[0] != current_key:
            _emit(current_key, models)
            current_key = row[0]
            models = []
        models.append(row[1] + " " + row[2])

    # Always flush the final group. The previous implementation skipped this
    # whenever an earlier group had been printed, dropping the last airline.
    _emit(current_key, models)

def _read_csv_with_header(spark_session, path):
    """Load a CSV file with a header row, letting Spark infer column types."""
    return (spark_session.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(path))


def task_3_df(spark_session, flights_path, airlines_path, aircrafts_path, country):
    """Print, per airline of ``country``, its top-ranked aircraft models by flight count.

    Pipeline:
      1. Load the flights, airlines and aircraft CSVs and clean their column names.
      2. Join airlines to flights on ``carrier_code``, keep airlines whose
         ``country`` equals ``country`` and count flights per
         (carrier_code, name, country, tail_number).
      3. Derive ``model_name`` from the aircraft ``model`` column with a regex,
         join on ``tail_number == tailnum`` and sum the counts per
         (name, manufacturer, model_name).
      4. Rank the models within each airline by summed count (descending),
         keep ranks below 6 and print them via ``print_models_df``.

    Args:
        spark_session: SparkSession used to read the CSV files.
        flights_path: Path of the flights CSV.
        airlines_path: Path of the airlines CSV.
        aircrafts_path: Path of the aircraft CSV.
        country: Value matched against the airlines ``country`` column.

    Returns:
        None. The result is printed to stdout.
    """
    # Read through the session that was passed in rather than a notebook
    # global, so the function works with whichever session the caller supplies.
    flights_df = clean_column_names(_read_csv_with_header(spark_session, flights_path))
    airlines_df = clean_column_names(_read_csv_with_header(spark_session, airlines_path))
    aircraft_df = clean_column_names(_read_csv_with_header(spark_session, aircrafts_path))

    # Join the Airline and Flight datasets; drop the airline-side carrier_code
    # so the remaining "carrier_code" column is unambiguous for the groupBy.
    airlines_count = (airlines_df.join(flights_df, airlines_df.carrier_code == flights_df.carrier_code)
                      .drop(airlines_df.carrier_code).filter(F.col("country") == country)
                      .groupBy("carrier_code", "name", "country", "tail_number").count())

    # Format the model name (raw string avoids the invalid "\d" escape warning).
    # NOTE(review): regexp_extract yields an empty string, not NULL, when the
    # pattern does not match, so the NULL filter below will not drop such rows
    # — confirm whether they should be excluded.
    aircraft_model = aircraft_df.withColumn(
        "model_name", F.regexp_extract(F.col("model"), r"[A-Z]*-?\d{1,3}", 0))

    # Join the airline count and aircraft datasets. The broadcast hint has to
    # wrap the DataFrame that actually takes part in the join; previously it
    # wrapped a throwaway copy used only to build the join condition, so the
    # hint never reached the query plan.
    airlines_count = (F.broadcast(airlines_count)
                      .join(aircraft_model, airlines_count.tail_number == aircraft_model.tailnum, how="inner")
                      .groupBy("name", "manufacturer", "model_name").sum("count"))

    # Filter for NULL
    airlines_count = airlines_count.filter(F.col("model_name").isNotNull()).filter(F.col("manufacturer").isNotNull())

    # Define window_spec to order by sum of count within each airline
    window_spec = Window.partitionBy(airlines_count["name"]).orderBy(airlines_count["sum(count)"].desc())

    # Rank the airlines' models; rank() gives tied models the same rank, so a
    # tie can yield more than five rows for one airline.
    airlines = (airlines_count.select("*", F.rank().over(window_spec).alias("aircraft_rank"))
                .filter(F.col("aircraft_rank") < 6).sort(["name", "aircraft_rank"]))

    # Print output
    print_models_df(airlines.collect())


In [0]:
# Run task 3 on the "small" flights file between stagemetrics.begin()/end().
stagemetrics.begin()
task_3_df(
    spark_session=spark,
    flights_path=f"{dbfs_fileStore_prefix}/{prefix}_flights_small.csv",
    airlines_path=f"{dbfs_fileStore_prefix}/{prefix}_airlines.csv",
    aircrafts_path=f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv",
    country="United States",
)
stagemetrics.end()

In [0]:
# Run task 3 on the "medium" flights file between stagemetrics.begin()/end().
stagemetrics.begin()
task_3_df(
    spark_session=spark,
    flights_path=f"{dbfs_fileStore_prefix}/{prefix}_flights_medium.csv",
    airlines_path=f"{dbfs_fileStore_prefix}/{prefix}_airlines.csv",
    aircrafts_path=f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv",
    country="United States",
)
stagemetrics.end()

In [0]:
# Run task 3 on the "large" flights file between stagemetrics.begin()/end().
stagemetrics.begin()
task_3_df(
    spark_session=spark,
    flights_path=f"{dbfs_fileStore_prefix}/{prefix}_flights_large.csv",
    airlines_path=f"{dbfs_fileStore_prefix}/{prefix}_airlines.csv",
    aircrafts_path=f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv",
    country="United States",
)
stagemetrics.end()

In [0]:
# Run task 3 on the "massive" flights file between stagemetrics.begin()/end().
stagemetrics.begin()
task_3_df(
    spark_session=spark,
    flights_path=f"{dbfs_fileStore_prefix}/{prefix}_flights_massive.csv",
    airlines_path=f"{dbfs_fileStore_prefix}/{prefix}_airlines.csv",
    aircrafts_path=f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv",
    country="United States",
)
stagemetrics.end()