In [0]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from sparkmeasure import StageMetrics
from pyspark.sql import functions as F
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import regexp_replace, col, broadcast
import pyspark.sql.types as types
from operator import add

# Reuse the active Spark session if one exists; otherwise create one named "task1".
spark = SparkSession.builder.appName("task1").getOrCreate()

# sparkmeasure StageMetrics object bound to this session; the run cells
# further down call begin()/end() on it around each task_1_df invocation.
stagemetrics = StageMetrics(spark)

# Directory and file-name stem from which the run cells build the input CSV paths.
dbfs_fileStore_prefix = "/FileStore/tables"
prefix = "ontimeperformance"

In [0]:
def clean_column_names(df):
    """Return ``df`` renamed so that no column name contains whitespace or dots.

    Every name in ``df.columns`` has all of its whitespace (leading, trailing
    and interior) and every ``.`` character removed; the cleaned names are
    then applied positionally through ``df.toDF``.

    Args:
        df: Object exposing ``columns`` (iterable of str) and ``toDF(*names)``.

    Returns:
        Whatever ``df.toDF`` returns for the cleaned names.
    """
    def _sanitize(raw_name):
        # split() with no separator drops every run of whitespace, including
        # leading/trailing runs, so joining the pieces removes it all.
        return "".join(raw_name.split()).replace(".", "")

    return df.toDF(*map(_sanitize, df.columns))

In [0]:
def task_1_df(spark_session, flights_path, aircrafts_path, show_results = True):
    """Find the three most-flown Cessna models using the DataFrame API.

    Reads the flights and aircraft CSV files, joins them on tail number,
    keeps the rows whose manufacturer is "CESSNA", shortens each model to
    its first run of three consecutive digits and counts flights per
    shortened model.

    Args:
        spark_session: SparkSession used to read both CSV files.
        flights_path: Path of the flights CSV. It must have a header row and,
            after column-name cleaning, a ``tail_number`` column.
        aircrafts_path: Path of the aircraft CSV. It must have a header row
            and, after column-name cleaning, ``manufacturer``, ``model`` and
            ``tailnum`` columns.
        show_results: When true (the default), print one line per model as
            "Cessna <model>", a tab surrounded by spaces, then the count.

    Returns:
        A list of at most three Row objects ``(short_model, count)`` ordered
        by descending count.
    """
    def _read_csv(path):
        # Read through the session that was passed in, not a notebook global,
        # so the function works with whichever session the caller supplies.
        return (
            spark_session.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(path)
        )

    # Clean the data
    flights = clean_column_names(_read_csv(flights_path))
    aircraft = clean_column_names(_read_csv(aircrafts_path))

    # Project only columns that are required
    aircraft_small = aircraft.select("manufacturer", "model", "tailnum")
    flights_small = flights.select("tail_number")

    # Join flights and aircrafts; the aircraft side carries the broadcast hint.
    joined = flights_small.join(
        broadcast(aircraft_small),
        flights_small.tail_number == aircraft_small.tailnum,
    )

    # Filter for CESSNA manufacturers only
    cessna_df = joined.filter(F.col("manufacturer") == "CESSNA")

    # Keep only the first three consecutive digits of the model. The pattern
    # is a raw string so the backslash reaches the regex engine without
    # Python flagging an invalid escape sequence.
    formatted_df = cessna_df.withColumn(
        "short_model", F.regexp_extract(F.col("model"), r"\d{3}", 0)
    )

    # head(3) brings the three most frequent models back as a list of Rows.
    top3_list = (
        formatted_df.groupBy("short_model")
        .count()
        .orderBy("count", ascending=False)
        .head(3)
    )

    # Print output in correct format
    if show_results:
        for model, count in top3_list:
            print("Cessna {} \t {}".format(model, count))

    return top3_list

In [0]:
# Run task 1 on the "small" flights file between StageMetrics begin() and end().
stagemetrics.begin()

task_1_df(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_small.csv", f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv")

stagemetrics.end()

In [0]:
# Run task 1 on the "medium" flights file between StageMetrics begin() and end().
stagemetrics.begin()

task_1_df(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_medium.csv", f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv")

stagemetrics.end()

In [0]:
# Run task 1 on the "large" flights file between StageMetrics begin() and end().
stagemetrics.begin()

task_1_df(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_large.csv", f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv")

stagemetrics.end()

In [0]:
# Run task 1 on the "massive" flights file between StageMetrics begin() and end().
stagemetrics.begin()

task_1_df(spark, f"{dbfs_fileStore_prefix}/{prefix}_flights_massive.csv", f"{dbfs_fileStore_prefix}/{prefix}_aircrafts.csv")

stagemetrics.end()