# Joining Data

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
airports = spark.read.csv("airports.csv", header=True, inferSchema=True)
flights = spark.read.csv("flights_small.csv", header=True, inferSchema=True)
planes = spark.read.csv("planes.csv", header=True, inferSchema=True)

In [2]:
airports.show(5)

+---+--------------------+----------+-----------+----+---+---+
|faa|                name|       lat|        lon| alt| tz|dst|
+---+--------------------+----------+-----------+----+---+---+
|04G|   Lansdowne Airport|41.1304722|-80.6195833|1044| -5|  A|
|06A|Moton Field Munic...|32.4605722|-85.6800278| 264| -5|  A|
|06C| Schaumburg Regional|41.9893408|-88.1012428| 801| -6|  A|
|06N|     Randall Airport| 41.431912|-74.3915611| 523| -5|  A|
|09J|Jekyll Island Air...|31.0744722|-81.4277778|  11| -4|  A|
+---+--------------------+----------+-----------+----+---+---+
only showing top 5 rows



In [3]:
airports.printSchema()

root
 |-- faa: string (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- alt: integer (nullable = true)
 |-- tz: integer (nullable = true)
 |-- dst: string (nullable = true)



In [4]:
flights.show(5)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
+----+-----+---+--------+---------+-----

In [5]:
planes.show(5)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
only showing top 5 rows



In [6]:
airports = airports.withColumnRenamed("faa", "dest")
flights_with_airports = flights.join(airports, on="dest", how="left")

flights_with_airports.show(10)

+----+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+--------+--------+----+------+--------------------+---------+-----------+----+---+---+
|dest|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|air_time|distance|hour|minute|                name|      lat|        lon| alt| tz|dst|
+----+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+--------+--------+----+------+--------------------+---------+-----------+----+---+---+
| LAX|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA|     132|     954|   6|    58|    Los Angeles Intl|33.942536|-118.408075| 126| -8|  A|
| HNL|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA|     360|    2677|  10|    40|       Honolulu Intl|21.318681|-157.922428|  13|-10|  N|
| SFO|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA|     111|     679|  14|    43| 

In [7]:
flights_with_planes = flights.join(planes, on="tailnum", how="left")
flights_with_planes.show(10)

+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----+--------------------+------------+--------+-------+-----+-----+---------+
|tailnum|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|flight|origin|dest|air_time|distance|hour|minute|year|                type|manufacturer|   model|engines|seats|speed|   engine|
+-------+----+-----+---+--------+---------+--------+---------+-------+------+------+----+--------+--------+----+------+----+--------------------+------------+--------+-------+-----+-----+---------+
| N846VA|2014|   12|  8|     658|       -7|     935|       -5|     VX|  1780|   SEA| LAX|     132|     954|   6|    58|2011|Fixed wing multi ...|      AIRBUS|A320-214|      2|  182|   NA|Turbo-fan|
| N559AS|2014|    1| 22|    1040|        5|    1505|        5|     AS|   851|   SEA| HNL|     360|    2677|  10|    40|2006|Fixed wing multi ...|      BOEING| 737-890|      2|  149|   NA|Turbo-fan|
| N847VA|2

## Filtering

### Filtering with SQL-like Query

In [8]:
# Show flights origin, dest, distance where manufacturer is AIRBUS
flights_with_planes.select('origin', 'dest', 'distance').filter("manufacturer == 'AIRBUS'").show()

+------+----+--------+
|origin|dest|distance|
+------+----+--------+
|   SEA| LAX|     954|
|   SEA| SFO|     679|
|   SEA| SFO|     679|
|   SEA| PHL|    2378|
|   SEA| OGG|    2640|
|   SEA| LGB|     965|
|   SEA| ANC|    1448|
|   PDX| DEN|     991|
|   PDX| PHX|    1009|
|   PDX| HNL|    2603|
|   SEA| DEN|    1024|
|   SEA| LAX|     954|
|   SEA| DEN|    1024|
|   SEA| LAX|     954|
|   PDX| LGB|     846|
|   SEA| JFK|    2422|
|   PDX| DEN|     991|
|   SEA| CLT|    2279|
|   SEA| SFO|     679|
|   PDX| PHX|    1009|
+------+----+--------+
only showing top 20 rows



### Filtering with PySPark functions

In [9]:
# Show flights origin, dest, distance where origin is SEA and distance > 1000
from pyspark.sql import functions as F
flights_with_planes.select('origin', 'dest', 'distance').filter(
    (F.col("origin") == 'SEA') &
    (F.col("distance") > 1000)   
).show()

+------+----+--------+
|origin|dest|distance|
+------+----+--------+
|   SEA| HNL|    2677|
|   SEA| SAN|    1050|
|   SEA| ORD|    1721|
|   SEA| PHX|    1107|
|   SEA| ANC|    1448|
|   SEA| MDW|    1733|
|   SEA| BOS|    2496|
|   SEA| PHL|    2378|
|   SEA| OGG|    2640|
|   SEA| SAN|    1050|
|   SEA| ANC|    1448|
|   SEA| LIH|    2701|
|   SEA| MCO|    2554|
|   SEA| ATL|    2182|
|   SEA| ANC|    1448|
|   SEA| ABQ|    1180|
|   SEA| IAH|    1874|
|   SEA| PHX|    1107|
|   SEA| PHX|    1107|
|   SEA| TUS|    1216|
+------+----+--------+
only showing top 20 rows



In [10]:
# Or we can use variable to save the data
flights_from_sea_1k = flights_with_planes.select('origin', 'dest', 'distance').filter(
    (F.col("origin") == 'SEA') &
    (F.col("distance") > 1000)   
)

In [11]:
flights_from_sea_1k.show()

+------+----+--------+
|origin|dest|distance|
+------+----+--------+
|   SEA| HNL|    2677|
|   SEA| SAN|    1050|
|   SEA| ORD|    1721|
|   SEA| PHX|    1107|
|   SEA| ANC|    1448|
|   SEA| MDW|    1733|
|   SEA| BOS|    2496|
|   SEA| PHL|    2378|
|   SEA| OGG|    2640|
|   SEA| SAN|    1050|
|   SEA| ANC|    1448|
|   SEA| LIH|    2701|
|   SEA| MCO|    2554|
|   SEA| ATL|    2182|
|   SEA| ANC|    1448|
|   SEA| ABQ|    1180|
|   SEA| IAH|    1874|
|   SEA| PHX|    1107|
|   SEA| PHX|    1107|
|   SEA| TUS|    1216|
+------+----+--------+
only showing top 20 rows



In [12]:
flights_from_sea_1k.count()

3671

## Grouping and Aggregation

In [13]:
# Select average distance for each manufacturer
flights_with_planes.groupBy("manufacturer").agg(
    F.avg("distance").alias("avg_dist")
).show()

+--------------------+------------------+
|        manufacturer|          avg_dist|
+--------------------+------------------+
|       BARKER JACK L|             965.0|
|    AIRBUS INDUSTRIE|1318.1736204576043|
|ROBINSON HELICOPT...| 925.3333333333334|
|                null|1621.9420289855072|
|              BOEING| 1245.056006006006|
|             EMBRAER|127.29304029304029|
|  CIRRUS DESIGN CORP|            1693.5|
|              CESSNA|            1616.0|
|      BOMBARDIER INC| 749.3497206703911|
|                BELL|            1616.0|
|   MCDONNELL DOUGLAS| 1554.787234042553|
|            CANADAIR|            731.25|
|              AIRBUS|1375.0628853267572|
+--------------------+------------------+



In [14]:
# It seemed that the dataset in not properly cleaned, there are null value for manufacturer
# Lets filter it out
flights_with_planes.filter("manufacturer is not null").groupBy("manufacturer").agg(
    F.avg("distance").alias("avg_dist")
).show()

+--------------------+------------------+
|        manufacturer|          avg_dist|
+--------------------+------------------+
|       BARKER JACK L|             965.0|
|    AIRBUS INDUSTRIE|1318.1736204576043|
|ROBINSON HELICOPT...| 925.3333333333334|
|              BOEING| 1245.056006006006|
|             EMBRAER|127.29304029304029|
|  CIRRUS DESIGN CORP|            1693.5|
|              CESSNA|            1616.0|
|      BOMBARDIER INC| 749.3497206703911|
|                BELL|            1616.0|
|   MCDONNELL DOUGLAS| 1554.787234042553|
|            CANADAIR|            731.25|
|              AIRBUS|1375.0628853267572|
+--------------------+------------------+



## Sorting data with OrderBy

In [15]:
flights_with_planes.filter("manufacturer is not null").groupBy("manufacturer").agg(
    F.avg("distance").alias("avg_dist")
).orderBy(
    F.col("avg_dist"), ascending=False
).show()

+--------------------+------------------+
|        manufacturer|          avg_dist|
+--------------------+------------------+
|  CIRRUS DESIGN CORP|            1693.5|
|              CESSNA|            1616.0|
|                BELL|            1616.0|
|   MCDONNELL DOUGLAS| 1554.787234042553|
|              AIRBUS|1375.0628853267572|
|    AIRBUS INDUSTRIE|1318.1736204576043|
|              BOEING| 1245.056006006006|
|       BARKER JACK L|             965.0|
|ROBINSON HELICOPT...| 925.3333333333334|
|      BOMBARDIER INC| 749.3497206703911|
|            CANADAIR|            731.25|
|             EMBRAER|127.29304029304029|
+--------------------+------------------+

