In [5]:
! pip install pyspark
! pip install findspark



In [229]:
from pyspark.sql import SparkSession
# Create the SparkSession used by every cell below. Parentheses replace the
# backslash continuations so the chain does not break on trailing whitespace.
# NOTE(review): "spark.some.config.option" looks like a documentation
# placeholder rather than a real setting — confirm whether it is needed.
spark = (
    SparkSession.builder
    .appName("Semi_Sructured_Data_Analysis")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)



### Number of Delays:
1. **Carrier**: The number of delays and cancellations due to circumstances within the airline's control (e.g. maintenance or crew problems, aircraft cleaning, baggage loading, fueling, etc.) in this month.
2. **Late Aircraft**: The number of delays and cancellations caused by a previous flight with the same aircraft arriving late, causing the present flight to depart late in this month.	
3. **National Aviation System**: The number of delays and cancellations attributable to the national aviation system that refer to a broad set of conditions, such as non-extreme weather conditions, airport operations, heavy traffic volume, and air traffic control in this month.
4. **Security**: The number of delays or cancellations caused by the evacuation of a terminal or concourse, re-boarding of an aircraft because of a security breach, inoperative screening equipment, and/or lines longer than 29 minutes at screening areas in this month.
5. **Weather**: The number of delays or cancellations caused by significant meteorological conditions (actual or forecasted) that, in the judgment of the carrier, delay or prevent the operation of a flight, such as a tornado, blizzard, or hurricane, in this month.

In [148]:
import requests, json

url = 'https://think.cs.vt.edu/corgis/datasets/json/airlines/airlines.json'

# Bound the wait and fail loudly on HTTP errors, so an error page is never
# written to disk as if it were the dataset.
resp = requests.get(url=url, timeout=30)
resp.raise_for_status()

# resp.json() already returns parsed Python objects; serialising them to a
# string and parsing that string back again adds nothing.
json_data = resp.json()

# Save a local copy for Spark to read in the following cells.
with open('airlines.json', 'w') as json_file:
    json.dump(json_data, json_file)

In [231]:
# First look at the downloaded file using the JSON reader with its default
# options; show(6) prints only the first six rows.
df = spark.read.json("airlines.json")
df.show(6)

+--------------------+--------------------+--------------------+
|             Airport|          Statistics|                Time|
+--------------------+--------------------+--------------------+
|{ATL, Atlanta, GA...|{{1009, 1275, 321...|{2003/06, 6, June...|
|{BOS, Boston, MA:...|{{374, 495, 685, ...|{2003/06, 6, June...|
|{BWI, Baltimore, ...|{{296, 477, 389, ...|{2003/06, 6, June...|
|{CLT, Charlotte, ...|{{300, 472, 735, ...|{2003/06, 6, June...|
|{DCA, Washington,...|{{283, 268, 487, ...|{2003/06, 6, June...|
|{DEN, Denver, CO:...|{{516, 323, 664, ...|{2003/06, 6, June...|
+--------------------+--------------------+--------------------+
only showing top 6 rows



In [233]:
# Re-read the same file with the reader's "multiline" option set to "true".
# This rebinds `df`, so every later cell works on this DataFrame.
df= spark.read.option("multiline","true").json("airlines.json")
# Preview left disabled:
#df.show(6)

In [152]:
# Understanding the schema of the JSON through the dataframe:
# print the nested struct fields of `df` together with their types.
df.printSchema()

root
 |-- Airport: struct (nullable = true)
 |    |-- Code: string (nullable = true)
 |    |-- Name: string (nullable = true)
 |-- Statistics: struct (nullable = true)
 |    |-- # of Delays: struct (nullable = true)
 |    |    |-- Carrier: long (nullable = true)
 |    |    |-- Late Aircraft: long (nullable = true)
 |    |    |-- National Aviation System: long (nullable = true)
 |    |    |-- Security: long (nullable = true)
 |    |    |-- Weather: long (nullable = true)
 |    |-- Carriers: struct (nullable = true)
 |    |    |-- Names: string (nullable = true)
 |    |    |-- Total: long (nullable = true)
 |    |-- Flights: struct (nullable = true)
 |    |    |-- Cancelled: long (nullable = true)
 |    |    |-- Delayed: long (nullable = true)
 |    |    |-- Diverted: long (nullable = true)
 |    |    |-- On Time: long (nullable = true)
 |    |    |-- Total: long (nullable = true)
 |    |-- Minutes Delayed: struct (nullable = true)
 |    |    |-- Carrier: long (nullable = true)
 |    |  

In [235]:

# Flatten the nested record into one frame: airport identifiers plus four of
# the "# of Delays" counters (number of delays per cause).
delay_fields = ["Carrier", "Late Aircraft", "Security", "Weather"]
airport_delay_statistics = df.select(
    "Airport.Code",
    "Airport.Name",
    *[f"Statistics.# of Delays.{field}" for field in delay_fields],
)
airport_delay_statistics.printSchema()


root
 |-- Code: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Carrier: long (nullable = true)
 |-- Late Aircraft: long (nullable = true)
 |-- Security: long (nullable = true)
 |-- Weather: long (nullable = true)



In [325]:
# Check which input file has created this dataframe.
# NOTE: the result is an absolute file URI, so the printed output exposes a
# local directory layout — clear this output before sharing the notebook.
airport_delay_statistics.inputFiles()

['file:///Users/ankitbit/Documents/Lectures/Real-Time-Data-Analysis/airlines.json']

In [241]:

# Summary statistics (count, mean, stddev, min, max) for the numeric delay columns
delay_columns = ["Carrier", "Late Aircraft", "Security", "Weather"]
airport_delay_statistics.select(*delay_columns).describe().show()


+-------+------------------+----------------+-----------------+-----------------+
|summary|           Carrier|   Late Aircraft|         Security|          Weather|
+-------+------------------+----------------+-----------------+-----------------+
|  count|              4408|            4408|             4408|             4408|
|   mean| 574.6324863883848|789.078947368421|  5.5755444646098|78.21687840290382|
| stddev|329.61647461501815| 561.79842030889|6.007046080059749|75.18172623192343|
|    min|               112|              86|               -1|                1|
|    max|              3087|            4483|               94|              812|
+-------+------------------+----------------+-----------------+-----------------+



In [248]:
# Sorting the airport delay statistics based on the Weather (ascending).
# Uses `airport_delay_statistics`, the frame built from `df` above;
# `airport_statistics` is not defined anywhere in this notebook.
airport_delay_statistics.select("Code", "Name", "Weather").sort("Weather").show(10)

+----+--------------------+-------+
|Code|                Name|Weather|
+----+--------------------+-------+
| MDW|Chicago, IL: Chic...|      1|
| PDX|Portland, OR: Por...|      2|
| BWI|Baltimore, MD: Ba...|      2|
| FLL|Fort Lauderdale, ...|      2|
| PDX|Portland, OR: Por...|      3|
| IAD|Washington, DC: W...|      3|
| MCO|Orlando, FL: Orla...|      4|
| PDX|Portland, OR: Por...|      4|
| FLL|Fort Lauderdale, ...|      4|
| MDW|Chicago, IL: Chic...|      4|
+----+--------------------+-------+
only showing top 10 rows



In [251]:

# Sorting the airport delay statistics based on the Weather (ascending),
# with truncate=False so the full airport names are printed.
# Uses `airport_delay_statistics`, the frame built from `df` above;
# `airport_statistics` is not defined anywhere in this notebook.
airport_delay_statistics.select("Code", "Name", "Weather").sort("Weather").show(10, truncate=False)

+----+-------------------------------------------------------------------+-------+
|Code|Name                                                               |Weather|
+----+-------------------------------------------------------------------+-------+
|MDW |Chicago, IL: Chicago Midway International                          |1      |
|PDX |Portland, OR: Portland International                               |2      |
|BWI |Baltimore, MD: Baltimore/Washington International Thurgood Marshall|2      |
|FLL |Fort Lauderdale, FL: Fort Lauderdale-Hollywood International       |2      |
|PDX |Portland, OR: Portland International                               |3      |
|IAD |Washington, DC: Washington Dulles International                    |3      |
|MCO |Orlando, FL: Orlando International                                 |4      |
|PDX |Portland, OR: Portland International                               |4      |
|FLL |Fort Lauderdale, FL: Fort Lauderdale-Hollywood International       |4      |
|MDW

In [253]:
# Number of rows in the flattened delay-statistics frame.
airport_delay_statistics.count()

4408

In [202]:

# Total weather delays per airport code. This prints the first 3 groups only
# and does not sort them, so it is a preview rather than a ranking; ranking
# would have to sort on the aggregated column, which is named "sum(Weather)".
# NOTE: importing `sum` here shadows Python's built-in sum() for later cells.
from pyspark.sql.functions import col, sum, count

# Uses `airport_delay_statistics`, the frame built from `df` above;
# `airport_statistics` is not defined anywhere in this notebook.
airport_delay_statistics.select("Code", "Name", "Weather").groupBy("Code").sum("Weather").show(3)


+----+------------+
|Code|sum(Weather)|
+----+------------+
| DCA|        8504|
| IAH|       13062|
| LGA|       16350|
+----+------------+
only showing top 3 rows



In [264]:

# Using groupby to find the list of least affected unique airports:
# sum the weather delays per (Code, Name) and sort ascending.
from pyspark.sql.functions import col, sum, count

# Uses `airport_delay_statistics`, the frame built from `df` above;
# `airport_statistics` is not defined anywhere in this notebook.
(
    airport_delay_statistics.select("Code", "Name", "Weather")
    .groupBy("Code", "Name")
    .agg(sum("Weather").alias("Weather"))
    .sort(col("Weather"))
    .show(10, truncate=False)
)


+----+------------------------------------------------------------+-------+
|Code|Name                                                        |Weather|
+----+------------------------------------------------------------+-------+
|PDX |Portland, OR: Portland International                        |2791   |
|SAN |San Diego, CA: San Diego International                      |5056   |
|FLL |Fort Lauderdale, FL: Fort Lauderdale-Hollywood International|5084   |
|TPA |Tampa, FL: Tampa International                              |5092   |
|IAD |Washington, DC: Washington Dulles International             |5849   |
|SEA |Seattle, WA: Seattle/Tacoma International                   |5953   |
|MIA |Miami, FL: Miami International                              |7487   |
|CLT |Charlotte, NC: Charlotte Douglas International              |7738   |
|MCO |Orlando, FL: Orlando International                          |8124   |
|DCA |Washington, DC: Ronald Reagan Washington National           |8504   |
+----+------

In [265]:

# Using groupby to find the list of most affected unique airports:
# sum the weather delays per (Code, Name) and sort descending.
from pyspark.sql.functions import col, sum, count

# Uses `airport_delay_statistics`, the frame built from `df` above;
# `airport_statistics` is not defined anywhere in this notebook.
(
    airport_delay_statistics.select("Code", "Name", "Weather")
    .groupBy("Code", "Name")
    .agg(sum("Weather").alias("Weather"))
    .sort(col("Weather").desc())
    .show(10, truncate=False)
)

+----+------------------------------------------------------+-------+
|Code|Name                                                  |Weather|
+----+------------------------------------------------------+-------+
|ATL |Atlanta, GA: Hartsfield-Jackson Atlanta International |40113  |
|DFW |Dallas/Fort Worth, TX: Dallas/Fort Worth International|30476  |
|ORD |Chicago, IL: Chicago O'Hare International             |24358  |
|LGA |New York, NY: LaGuardia                               |16350  |
|DEN |Denver, CO: Denver International                      |15556  |
|EWR |Newark, NJ: Newark Liberty International              |14668  |
|LAX |Los Angeles, CA: Los Angeles International            |14652  |
|IAH |Houston, TX: George Bush Intercontinental/Houston     |13062  |
|BOS |Boston, MA: Logan International                       |11955  |
|SFO |San Francisco, CA: San Francisco International        |11751  |
+----+------------------------------------------------------+-------+
only showing top 10 

In [263]:

# Find the list of most affected airports due to delays caused by security:
# sum the security delays per (Code, Name) and sort descending.
# Relies on `col` and `sum` imported from pyspark.sql.functions in an earlier cell.
# Uses `airport_delay_statistics`, the frame built from `df` above;
# `airport_statistics` is not defined anywhere in this notebook.
(
    airport_delay_statistics.select("Code", "Name", "Security")
    .groupBy("Code", "Name")
    .agg(sum("Security").alias("Security"))
    .sort(col("Security").desc())
    .show(10, truncate=False)
)


+----+------------------------------------------------------+--------+
|Code|Name                                                  |Security|
+----+------------------------------------------------------+--------+
|LAX |Los Angeles, CA: Los Angeles International            |1738    |
|PHX |Phoenix, AZ: Phoenix Sky Harbor International         |1715    |
|IAH |Houston, TX: George Bush Intercontinental/Houston     |1355    |
|LAS |Las Vegas, NV: McCarran International                 |1288    |
|SLC |Salt Lake City, UT: Salt Lake City International      |1201    |
|SEA |Seattle, WA: Seattle/Tacoma International             |1125    |
|DEN |Denver, CO: Denver International                      |1122    |
|DFW |Dallas/Fort Worth, TX: Dallas/Fort Worth International|1098    |
|ATL |Atlanta, GA: Hartsfield-Jackson Atlanta International |1066    |
|ORD |Chicago, IL: Chicago O'Hare International             |1027    |
+----+------------------------------------------------------+--------+
only s

In [267]:
 # Column names of the flattened delay-statistics frame.
 airport_delay_statistics.columns

['Code', 'Name', 'Carrier', 'Late Aircraft', 'Security', 'Weather']

In [270]:
# (column name, type name) pairs for the flattened delay-statistics frame.
airport_delay_statistics.dtypes

[('Code', 'string'),
 ('Name', 'string'),
 ('Carrier', 'bigint'),
 ('Late Aircraft', 'bigint'),
 ('Security', 'bigint'),
 ('Weather', 'bigint')]

In [274]:
# Covariance between the Carrier and Weather delay counts.
airport_delay_statistics.cov("Carrier", "Weather")

18080.795857275385

In [283]:
# Creating summary statistics for every column of the dataframe.
# The string columns (Code, Name) get null for mean and stddev.
airport_delay_statistics.describe().show()

+-------+----+--------------------+------------------+----------------+-----------------+-----------------+
|summary|Code|                Name|           Carrier|   Late Aircraft|         Security|          Weather|
+-------+----+--------------------+------------------+----------------+-----------------+-----------------+
|  count|4408|                4408|              4408|            4408|             4408|             4408|
|   mean|null|                null| 574.6324863883848|789.078947368421|  5.5755444646098|78.21687840290382|
| stddev|null|                null|329.61647461501815| 561.79842030889|6.007046080059749|75.18172623192343|
|    min| ATL|Atlanta, GA: Hart...|               112|              86|               -1|                1|
|    max| TPA|Washington, DC: W...|              3087|            4483|               94|              812|
+-------+----+--------------------+------------------+----------------+-----------------+-----------------+



In [282]:
# dropna() removes ROWS that contain a null, not columns: the mean and stddev
# rows are dropped because they are null for the string columns Code and Name,
# leaving only the count, min and max rows.
airport_delay_statistics.describe().dropna().show()

+-------+----+--------------------+-------+-------------+--------+-------+
|summary|Code|                Name|Carrier|Late Aircraft|Security|Weather|
+-------+----+--------------------+-------+-------------+--------+-------+
|  count|4408|                4408|   4408|         4408|    4408|   4408|
|    min| ATL|Atlanta, GA: Hart...|    112|           86|      -1|      1|
|    max| TPA|Washington, DC: W...|   3087|         4483|      94|    812|
+-------+----+--------------------+-------+-------------+--------+-------+



In [284]:
# Creating summary statistics without the two string columns (Code, Name),
# so that every summary row is kept for the numeric delay columns.
airport_delay_statistics.describe().drop("Code", "Name").show()

+-------+------------------+----------------+-----------------+-----------------+
|summary|           Carrier|   Late Aircraft|         Security|          Weather|
+-------+------------------+----------------+-----------------+-----------------+
|  count|              4408|            4408|             4408|             4408|
|   mean| 574.6324863883848|789.078947368421|  5.5755444646098|78.21687840290382|
| stddev|329.61647461501815| 561.79842030889|6.007046080059749|75.18172623192343|
|    min|               112|              86|               -1|                1|
|    max|              3087|            4483|               94|              812|
+-------+------------------+----------------+-----------------+-----------------+



### Filtering rows 

* Discuss the other logical operations available besides OR, such as AND, `>`, `<`, etc.
* Discuss the different programming methods that implement the same operation.
* Explain how to extract the first row using `first()`.
* Explain how to extract the first N rows using `head(n)`.

In [316]:
# Find the list of most affected airports due to delays caused by security
# Filtering rows using a WHERE clause (keep only ATL or TPA).
# Relies on `col` and `sum` imported from pyspark.sql.functions in an earlier cell.
# Uses `airport_delay_statistics`, the frame built from `df` above;
# `airport_statistics` is not defined anywhere in this notebook.
(
    airport_delay_statistics.select("Code", "Name", "Security")
    .groupBy("Code", "Name")
    .agg(sum("Security").alias("Security"))
    .sort(col("Security").desc())
    .where((col("Code") == "ATL") | (col("Code") == "TPA"))
    .show(10, truncate=False)
)

+----+-----------------------------------------------------+--------+
|Code|Name                                                 |Security|
+----+-----------------------------------------------------+--------+
|ATL |Atlanta, GA: Hartsfield-Jackson Atlanta International|1066    |
|TPA |Tampa, FL: Tampa International                       |500     |
+----+-----------------------------------------------------+--------+



In [317]:
# Find the list of most affected airports due to delays caused by security
# Filtering rows using a FILTER clause (keep only ATL or TPA).
# Relies on `col` and `sum` imported from pyspark.sql.functions in an earlier cell.
# Uses `airport_delay_statistics`, the frame built from `df` above;
# `airport_statistics` is not defined anywhere in this notebook.
(
    airport_delay_statistics.select("Code", "Name", "Security")
    .groupBy("Code", "Name")
    .agg(sum("Security").alias("Security"))
    .sort(col("Security").desc())
    .filter((col("Code") == "ATL") | (col("Code") == "TPA"))
    .show(10, truncate=False)
)

+----+-----------------------------------------------------+--------+
|Code|Name                                                 |Security|
+----+-----------------------------------------------------+--------+
|ATL |Atlanta, GA: Hartsfield-Jackson Atlanta International|1066    |
|TPA |Tampa, FL: Tampa International                       |500     |
+----+-----------------------------------------------------+--------+



In [318]:
# Find the list of most affected airports due to delays caused by security
# Filtering rows using named conditions combined with | (OR) in a WHERE clause.
# Relies on `col` and `sum` imported from pyspark.sql.functions in an earlier cell.

condition_1 = (col("Code") == "ATL")
condition_2 = (col("Code") == "TPA")

# Uses `airport_delay_statistics`, the frame built from `df` above;
# `airport_statistics` is not defined anywhere in this notebook.
(
    airport_delay_statistics.select("Code", "Name", "Security")
    .groupBy("Code", "Name")
    .agg(sum("Security").alias("Security"))
    .sort(col("Security").desc())
    .where(condition_1 | condition_2)
    .show(10, truncate=False)
)

+----+-----------------------------------------------------+--------+
|Code|Name                                                 |Security|
+----+-----------------------------------------------------+--------+
|ATL |Atlanta, GA: Hartsfield-Jackson Atlanta International|1066    |
|TPA |Tampa, FL: Tampa International                       |500     |
+----+-----------------------------------------------------+--------+



In [327]:
# Whether this DataFrame is backed by a streaming source.
airport_delay_statistics.isStreaming

False

In [330]:
# Accessor for the DataFrame's null-handling helpers (a DataFrameNaFunctions object).
airport_delay_statistics.na

<pyspark.sql.dataframe.DataFrameNaFunctions at 0x7f9ef0c9f780>

In [331]:
# NOTE(review): this only references the bound method, it does not call it, so
# nothing is persisted. If caching was intended, this needs `.persist()` — confirm.
airport_delay_statistics.persist

<bound method DataFrame.persist of DataFrame[Code: string, Name: string, Carrier: bigint, Late Aircraft: bigint, Security: bigint, Weather: bigint]>

In [338]:
# Randomly split the rows into two frames with relative weights 1 : 2.
# The fixed seed makes the split repeatable.
split_1, split_2 = airport_delay_statistics.randomSplit(weights=[1.0, 2.0], seed=24)

In [343]:
# Preview the first three rows of the first (weight 1.0) split.
split_1.show(3)

+----+--------------------+-------+-------------+--------+-------+
|Code|                Name|Carrier|Late Aircraft|Security|Weather|
+----+--------------------+-------+-------------+--------+-------+
| ATL|Atlanta, GA: Hart...|    776|         1910|       4|    101|
| ATL|Atlanta, GA: Hart...|    875|         1146|       6|     39|
| ATL|Atlanta, GA: Hart...|    880|         1095|       3|     26|
+----+--------------------+-------+-------------+--------+-------+
only showing top 3 rows



In [341]:
# Preview the first three rows of the second (weight 2.0) split.
split_2.show(3)

+----+--------------------+-------+-------------+--------+-------+
|Code|                Name|Carrier|Late Aircraft|Security|Weather|
+----+--------------------+-------+-------------+--------+-------+
| ATL|Atlanta, GA: Hart...|    700|          807|       4|    161|
| ATL|Atlanta, GA: Hart...|    756|          906|       4|    170|
| ATL|Atlanta, GA: Hart...|    789|          888|       6|    155|
+----+--------------------+-------+-------------+--------+-------+
only showing top 3 rows



In [None]:
# TODO: DataFrame APIs still to be covered in this notebook
# df.replace
# df.repartition, df.repartitionByRange
# df.registerTempTable
# df.rdd
# df.intersect
# df.intersectAll
# df.collect vs df.take vs df.show
# df.hint()
# using the explode feature
# createGlobalTempView
# createOrReplaceGlobalTempView
# distinct
# drop_duplicates
# exceptAll
# fillna
#
# cube function (the most important one to cover)