# Advanced Flight Data Analysis Questions
This notebook contains mid to advanced level questions for flight data analysis using PySpark. Please provide your solutions in the code cells following each question.

In [None]:
# import and spark session creation cell
import sys
sys.path.append('/home/aman/programs/gitrepos/PySpark/flight_data_analysis')
from datalist import flights_dataset
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, LongType

spark = SparkSession.builder.appName("DataFrame Example").getOrCreate()

schema = StructType([StructField("Origin_airport",StringType(), True),
                     StructField("Destination_airport", StringType(), True),
                     StructField("Origin_city", StringType(), True),
                     StructField("Destination_city", StringType(), True),
                     StructField("Passengers", IntegerType(), True),
                     StructField("Seats", IntegerType(), True),
                     StructField("Flights", IntegerType(), True),
                     StructField("Distance", LongType(), True),
                     StructField("Fly_date", DateType(), True),
                     StructField("Origin_population", LongType(), True),
                     StructField("Destination_population", LongType(), True),
                     StructField("Org_airport_lat", StringType(), True),
                     StructField("Org_airport_long", StringType(), True),
                     StructField("Dest_airport_lat", StringType(), True),
                     StructField("Dest_airport_long", StringType(), True),
                     
                     ])

print(flights_dataset)
dataframe = spark.read.options(header=True, nullValue='NA').csv(flights_dataset, schema=schema)

print("Total row count: ", dataframe.count())
dataframe.printSchema()
dataframe.show(5)

25/05/16 13:07:11 WARN Utils: Your hostname, Nothing-Is-Real resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/05/16 13:07:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/16 13:07:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


/home/aman/datasets/Airports2.csv


                                                                                

Total row count:  3606803
root
 |-- Origin_airport: string (nullable = true)
 |-- Destination_airport: string (nullable = true)
 |-- Origin_city: string (nullable = true)
 |-- Destination_city: string (nullable = true)
 |-- Passengers: integer (nullable = true)
 |-- Seats: integer (nullable = true)
 |-- Flights: integer (nullable = true)
 |-- Distance: long (nullable = true)
 |-- Fly_date: date (nullable = true)
 |-- Origin_population: long (nullable = true)
 |-- Destination_population: long (nullable = true)
 |-- Org_airport_lat: string (nullable = true)
 |-- Org_airport_long: string (nullable = true)
 |-- Dest_airport_lat: string (nullable = true)
 |-- Dest_airport_long: string (nullable = true)

+--------------+-------------------+-------------+----------------+----------+-----+-------+--------+----------+-----------------+----------------------+----------------+-----------------+----------------+-----------------+
|Origin_airport|Destination_airport|  Origin_city|Destination_city|P

25/05/16 13:07:33 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## Question 1: Top 5 busiest airports by total passenger traffic
Find the top 5 airports (considering both origin and destination) with the highest total number of passengers.

In [None]:
# Your solution here
# Sol: Adding all the passengers for the same airport as origin and destination
# and then joining the two dataframes to get the total passengers for each airport
from pyspark.sql import functions as f
df1 = dataframe.groupBy("Origin_airport").agg(f.sum('Passengers').alias('Total_Passengers')).orderBy('Total_Passengers', ascending=False)
df2 = dataframe.groupBy("Destination_airport").agg(f.sum('Passengers').alias('Total_Passengers')).orderBy('Total_Passengers', ascending=False)
df3 = df1.join(df2, df1.Origin_airport == df2.Destination_airport, 'inner').select((df1.Origin_airport).alias('Airport'),
                                                                                    df1.Total_Passengers.alias('Total_Passengers_Origin'), df2.Total_Passengers.alias('Total_Passengers_Destination'))
df3 = df3.groupBy("Airport").agg((f.sum(f.col('Total_Passengers_Origin') + f.col('Total_Passengers_Destination'))).alias('Total_Passengers')).orderBy('Total_Passengers', ascending=False)
df3.show(5)


# The above code can be imporved by using full outer join or union all to take care
#  of the airports which are present in either of the dataframes

                                                                                

+-------+----------------+
|Airport|Total_Passengers|
+-------+----------------+
|    ATL|      1155078415|
|    ORD|      1057666258|
|    DFW|       915476247|
|    LAX|       782482278|
|    PHX|       591438147|
+-------+----------------+
only showing top 5 rows



## Question 2: Monthly trend of total flights
Calculate the total number of flights for each month across all years in the dataset. Show the results ordered by month.

In [None]:
# Your solution here
# Sol: Adding all the flights for all the months and year wrt the year and month
# and then ordering the dataframe by year and month
df = dataframe.select(f.year(dataframe.Fly_date).alias('Year'), f.month(dataframe.Fly_date).alias('Month'), dataframe.Flights).groupBy('Year', 'Month')\
    .agg(f.sum(dataframe.Flights).alias('Total_Flights')).orderBy('Year', 'Month')

df.show(20)

[Stage 25:>                                                       (0 + 12) / 12]

+----+-----+-------------+
|Year|Month|Total_Flights|
+----+-----+-------------+
|1990|    1|       422366|
|1990|    2|       379961|
|1990|    3|       429472|
|1990|    4|       420019|
|1990|    5|       435517|
|1990|    6|       426642|
|1990|    7|       441353|
|1990|    8|       446416|
|1990|    9|       421588|
|1990|   10|       442574|
|1990|   11|       415163|
|1990|   12|       416165|
|1991|    1|       423954|
|1991|    2|       372213|
|1991|    3|       417373|
|1991|    4|       420706|
|1991|    5|       430957|
|1991|    6|       426659|
|1991|    7|       443201|
|1991|    8|       445031|
+----+-----+-------------+
only showing top 20 rows



                                                                                

## Question 3: Flight routes with consistently high seat occupancy
Identify flight routes where the average seat occupancy (Passengers / Seats) is greater than 80% across all flights.

In [None]:
# Your solution here

df = dataframe.groupBy('Origin_airport', 'Destination_airport').agg(f.avg(dataframe.Passengers/dataframe.Seats).alias('Average_Seat_Occupancy')).orderBy('Average_Seat_Occupancy', ascending=False)
df.show(5)

dataframe.filter((dataframe.Origin_airport == 'TTN') & (dataframe.Destination_airport == 'PHL')).show(5)


                                                                                

+--------------+-------------------+----------------------+
|Origin_airport|Destination_airport|Average_Seat_Occupancy|
+--------------+-------------------+----------------------+
|           TTN|                PHL|    1.4609043715846997|
|           RDU|                PKB|    1.0169491525423728|
|           BNA|                ERI|                   1.0|
|           IAD|                CPR|                   1.0|
|           ITH|                CAK|                   1.0|
+--------------+-------------------+----------------------+
only showing top 5 rows





+--------------+-------------------+-----------+----------------+----------+-----+-------+--------+----------+-----------------+----------------------+----------------+-----------------+----------------+-----------------+
|Origin_airport|Destination_airport|Origin_city|Destination_city|Passengers|Seats|Flights|Distance|  Fly_date|Origin_population|Destination_population| Org_airport_lat| Org_airport_long|Dest_airport_lat|Dest_airport_long|
+--------------+-------------------+-----------+----------------+----------+-----+-------+--------+----------+-----------------+----------------------+----------------+-----------------+----------------+-----------------+
|           TTN|                PHL|Trenton, NJ|Philadelphia, PA|      1349|  122|      3|      37|1998-11-01|           331474|              11020546|40.2766990661621|-74.8134994506836| 39.871898651123| -75.241096496582|
|           TTN|                PHL|Trenton, NJ|Philadelphia, PA|       107|  264|      2|      37|1998-12-01|  

                                                                                

## Question 4: Correlation between flight distance and seat occupancy
Analyze if there is any correlation between the flight distance and the average seat occupancy on the route.

In [21]:
# Your solution here

df = dataframe.select('Distance', (dataframe.Passengers/dataframe.Seats).alias('Seat_Occupancy'))
print(df.corr('Distance', 'Seat_Occupancy'))

[Stage 56:>                                                       (0 + 12) / 12]

0.12870328919786145


                                                                                

## Question 5: Identify routes with significant seasonal variation in passenger numbers
Find routes where the number of passengers varies significantly between different quarters of the year.

In [24]:
# Your solution here
from pyspark.sql.functions import month, when

df = dataframe.withColumn("Season", 
    when((month("Fly_date") >= 3) & (month("Fly_date") <= 5), "Spring")
    .when((month("Fly_date") >= 6) & (month("Fly_date") <= 8), "Summer")
    .when((month("Fly_date") >= 9) & (month("Fly_date") <= 11), "Fall")
    .otherwise("Winter")
)

df.groupBy(f.year(df.Fly_date).alias("Year"), "Season").agg(f.sum(df.Passengers).alias("Total_Passengers")).orderBy("Year", "Season").show()




+----+------+----------------+
|Year|Season|Total_Passengers|
+----+------+----------------+
|1990|  Fall|        89382468|
|1990|Spring|        95329293|
|1990|Summer|       102008395|
|1990|Winter|        83684256|
|1991|  Fall|        88584569|
|1991|Spring|        92339403|
|1991|Summer|       101309461|
|1991|Winter|        84456508|
|1992|  Fall|        93555501|
|1992|Spring|        91795851|
|1992|Summer|       114822083|
|1992|Winter|        83149867|
|1993|  Fall|        99295350|
|1993|Spring|        97652028|
|1993|Summer|       107310412|
|1993|Winter|        86838751|
|1994|  Fall|       107818148|
|1994|Spring|       106991375|
|1994|Summer|       117574177|
|1994|Winter|        93747021|
+----+------+----------------+
only showing top 20 rows



                                                                                