In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col

spark = (SparkSession.builder.appName("Flights Summary").getOrCreate())

flights_csv = 'data/flights/flight-summary.csv'

In [2]:
'''
flights_schema = StructType([
    StructField('Order ID', StringType(), True),
    StructField('Product', StringType(), True),
    StructField('Quantity Ordered', StringType(), True),
    StructField('Price Each', StringType(), True),
    StructField('Order Date', StringType(), True),
    StructField('Purchase Address', StringType(), True)
])
'''

flights_df = (spark.read.format('csv')
                .option('header', True)
                .option('inferSchema', True)
                .load(flights_csv))


In [3]:
flights_df = flights_df.withColumnRenamed('count', 'flight_count')

flights_df.show()

+-----------+--------------------+---------------+------------+---------+--------------------+----------------+----------+------------+
|origin_code|      origin_airport|    origin_city|origin_state|dest_code|        dest_airport|       dest_city|dest_state|flight_count|
+-----------+--------------------+---------------+------------+---------+--------------------+----------------+----------+------------+
|        BQN|Rafael Hernández ...|      Aguadilla|          PR|      MCO|Orlando Internati...|         Orlando|        FL|         441|
|        PHL|Philadelphia Inte...|   Philadelphia|          PA|      MCO|Orlando Internati...|         Orlando|        FL|        4869|
|        MCI|Kansas City Inter...|    Kansas City|          MO|      IAH|George Bush Inter...|         Houston|        TX|        1698|
|        SPI|Abraham Lincoln C...|    Springfield|          IL|      ORD|Chicago O'Hare In...|         Chicago|        IL|         998|
|        SNA|John Wayne Airpor...|      Santa An

## min(col), max(col) sum distinct(col) and avg

In [4]:
from pyspark.sql.functions import min, max, sum, sumDistinct, avg

flights_df.select(min('flight_count'), max('flight_count')).show()

+-----------------+-----------------+
|min(flight_count)|max(flight_count)|
+-----------------+-----------------+
|                1|            13744|
+-----------------+-----------------+



In [5]:
flights_df.select(sum('flight_count')).show()

+-----------------+
|sum(flight_count)|
+-----------------+
|          5332914|
+-----------------+



In [6]:
flights_df.select(sum('flight_count') / flights_df.count()).show()

+--------------------------+
|(sum(flight_count) / 4693)|
+--------------------------+
|        1136.3549968037503|
+--------------------------+



## Aggregation and grouping

In [7]:
flights_df.groupBy('origin_airport').count().orderBy('count', ascending=False).show()

+--------------------+-----+
|      origin_airport|count|
+--------------------+-----+
|Hartsfield-Jackso...|  169|
|Chicago O'Hare In...|  162|
|Dallas/Fort Worth...|  148|
|Denver Internatio...|  139|
|Minneapolis-Saint...|  120|
|George Bush Inter...|  119|
|Detroit Metropoli...|  112|
|Salt Lake City In...|   89|
|Newark Liberty In...|   88|
|Los Angeles Inter...|   80|
|San Francisco Int...|   80|
|Phoenix Sky Harbo...|   79|
|McCarran Internat...|   78|
|Orlando Internati...|   74|
|Seattle-Tacoma In...|   73|
|Washington Dulles...|   71|
|LaGuardia Airport...|   69|
|Chicago Midway In...|   69|
|Charlotte Douglas...|   69|
|Baltimore-Washing...|   67|
+--------------------+-----+
only showing top 20 rows



In [8]:
(flights_df.groupBy('origin_airport')
            .agg(max('flight_count').alias('max_flight_count'))
            .orderBy('max_flight_count', ascending=False).show(5, False))

+----------------------------------------------------------------------+----------------+
|origin_airport                                                        |max_flight_count|
+----------------------------------------------------------------------+----------------+
|San Francisco International Airport                                   |13744           |
|Los Angeles International Airport                                     |13457           |
|John F. Kennedy International Airport (New York International Airport)|12016           |
|McCarran International Airport                                        |9715            |
|LaGuardia Airport (Marine Air Terminal)                               |9639            |
+----------------------------------------------------------------------+----------------+
only showing top 5 rows



In [19]:
flights_df.select('origin_airport', 'flight_count').where(col('origin_airport') == 'San Francisco International Airport')
flights_df.select('origin_city','dest_city', 'flight_count').where(col('origin_state') == 'PR').show(50) 

+-----------+-----------------+------------+
|origin_city|        dest_city|flight_count|
+-----------+-----------------+------------+
|  Aguadilla|          Orlando|         441|
|   San Juan|          Detroit|          25|
|   San Juan| Charlotte Amalie|         668|
|  Aguadilla|   Ft. Lauderdale|         164|
|   San Juan|    Windsor Locks|         456|
|   San Juan|          Orlando|        3552|
|   San Juan|          Chicago|        1022|
|   San Juan|            Tampa|        1090|
|   San Juan|   Ft. Lauderdale|        2791|
|   San Juan|        Charlotte|         713|
|   San Juan|        Cleveland|          43|
|  Aguadilla|           Newark|         291|
|   San Juan|        Baltimore|         621|
|   San Juan|          Houston|          38|
|   San Juan|           Newark|        1195|
|  Aguadilla|         New York|         447|
|   San Juan|         New York|        4466|
|      Ponce|         New York|         333|
|   San Juan|Dallas-Fort Worth|         588|
|   San Ju

In [34]:
(flights_df.groupBy('origin_state', 'origin_city').count().where(col('origin_state') == 'PR').orderBy('count', ascending=False))



DataFrame[origin_state: string, origin_city: string, count: bigint]