In [1]:
!pip install graphframes



In [2]:
!unzip -q /content/drive/MyDrive/Data.zip

replace Data/airport-codes-na.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame

In [5]:
spark = SparkSession.builder.config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12").getOrCreate()

In [6]:
e = spark.read.csv("/content/Data/departuredelays.csv", header=True)
v = spark.read.csv("/content/Data/airport-codes-na.txt", header=True , sep='\t')

In [7]:
e.show(5)

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+
only showing top 5 rows



In [8]:
v.show(5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



### Read departuredelays.csv in Edge DataFrame
### Read airport-codes-na.txt in Vertix DataFrame (the separator is Tab i.e sep = '\t' )

#### The US flight delays data set has five columns:
- The <b>date</b> column contains an integer like 02190925 . When converted, this maps to 02-19 09:25 am.
- The <b>delay</b> column gives the delay in minutes between the scheduled and actual departure times. Early departures show negative numbers.
- The <b>distance</b> column gives the distance in miles from the origin airport to the destination airport.
- The <b>origin</b> column contains the origin IATA airport code.
- The <b>destination</b> column contains the destination IATA airport code.

#### The airport-codes data set has four columns:
- The <b>IATA</b> column contains IATA airport code.
- The <b>City, State, and Country</b> columns contains information about the airport location.

### In the vertix DataFrame, drop any duplicated rows with the same  IATA code.

In [9]:
v = v.dropDuplicates(['IATA'])

### In the edges DataFrame:
- Rename the <b>date</b> columns to become <b>tripid</b>.
- Rename the <b>origin</b> columns to become <b>src</b>.
- Rename the <b>destination</b> columns to become <b>dst</b>.

In [10]:
e = e.withColumnRenamed('date','tripid')
e = e.withColumnRenamed('origin','src')
e = e.withColumnRenamed('destination','dst')

### In the Vertix DataFrame:
- Rename the <b>IATA</b> columns to become <b>id</b>.

In [11]:
v = v.withColumnRenamed('IATA','id')

### Create GraphFrame from Vertix and Edges DataFrames

In [12]:
gf = GraphFrame(v,e)



In [13]:
gf.vertices.show(5)

+-----------+-----+-------+---+
|       City|State|Country| id|
+-----------+-----+-------+---+
|  Allentown|   PA|    USA|ABE|
|    Abilene|   TX|    USA|ABI|
|Albuquerque|   NM|    USA|ABQ|
|   Aberdeen|   SD|    USA|ABR|
|     Albany|   GA|    USA|ABY|
+-----------+-----+-------+---+
only showing top 5 rows



In [14]:
gf.edges.show(5)

+--------+-----+--------+---+---+
|  tripid|delay|distance|src|dst|
+--------+-----+--------+---+---+
|01011245|    6|     602|ABE|ATL|
|01020600|   -8|     369|ABE|DTW|
|01021245|   -2|     602|ABE|ATL|
|01020605|   -4|     602|ABE|ATL|
|01031245|   -4|     602|ABE|ATL|
+--------+-----+--------+---+---+
only showing top 5 rows



### Determine the number of airports

In [15]:
no_of_airports = gf.vertices.count()
print(f"Number of airports: {no_of_airports}")

Number of airports: 524


### Determine the number of trips

In [16]:
no_of_trips = gf.edges.count()
print(f"Number of trips: {no_of_trips}")

Number of trips: 1391578


### What is the longest delay?

In [17]:
longest_delay = gf.edges.agg({"delay": "max"}).collect()[0][0]
print(f"Longest delay: {longest_delay}")

Longest delay: 995


### Find out the number of delayed flights vs. early flights (flights that departed before actual time)

In [18]:
no_of_delayed_flights = gf.edges.filter(gf.edges.delay > 0).count()
no_of_early_flights = gf.edges.filter(gf.edges.delay < 0).count()

print(f"Number of delayed flights: {no_of_delayed_flights}")
print(f"Number of early flights: {no_of_early_flights}")

print(f"Percentage of delayed flights: {no_of_delayed_flights/no_of_trips*100}%")
print(f"Percentage of early flights: {no_of_early_flights/no_of_trips*100}%")

Number of delayed flights: 591727
Number of early flights: 668729
Percentage of delayed flights: 42.52201457625803%
Percentage of early flights: 48.055444969667526%


### What flight destinations departing SFO are most likely to have significant delays? Select the top 10
#### Hint: you should get the average delay for each destination for trips that depart from SFO only

In [19]:
avg_delay_sfo = gf.edges.filter(gf.edges.src == 'SFO').groupBy('dst').agg({'delay': 'avg'}).orderBy('avg(delay)', ascending=False)
avg_delay_sfo.show(10)

+---+------------------+
|dst|        avg(delay)|
+---+------------------+
|JAC| 30.78846153846154|
|OKC|24.822222222222223|
|SUN|22.696629213483146|
|COS| 22.58888888888889|
|SAT|             22.16|
|STL|         20.203125|
|HNL|19.982608695652175|
|ASE|19.846153846153847|
|CEC|19.089820359281436|
|MDW|18.771929824561404|
+---+------------------+
only showing top 10 rows



### Find the Incoming connections to the airport sorted in Desc. order.

In [20]:
incoming_connections = gf.inDegrees.orderBy('inDegree', ascending=False)
incoming_connections.show(5)



+---+--------+
| id|inDegree|
+---+--------+
|ATL|   90434|
|DFW|   66050|
|ORD|   61967|
|LAX|   53601|
|DEN|   50921|
+---+--------+
only showing top 5 rows



### Find the Outgoing connections from the airport sorted in Desc. order.

In [21]:
outgoing_connections = gf.outDegrees.orderBy('outDegree', ascending=False)
outgoing_connections.show(5)

+---+---------+
| id|outDegree|
+---+---------+
|ATL|    91484|
|DFW|    68482|
|ORD|    64228|
|LAX|    54086|
|DEN|    53148|
+---+---------+
only showing top 5 rows



### Use motif finding to answer this question: which delays could we blame on SFO?
#### Hint: this practically means that SFO is a transit station

In [22]:
sfo_transit_paths = gf.find("(a)-[e1]->(sfo); (sfo)-[e2]->(c)").filter("sfo.id = 'SFO'")
sfo_transit_delays = sfo_transit_paths.filter("(e1.delay > 0) OR (e2.delay > 0)")
print("Delays that could be blamed on SFO as a transit station:")
sfo_transit_delays.selectExpr("a.id as origin", "sfo.id as transit", "c.id as destination", "e1.delay as first_segment_delay", "e2.delay as second_segment_delay").show()

Delays that could be blamed on SFO as a transit station:
+------+-------+-----------+-------------------+--------------------+
|origin|transit|destination|first_segment_delay|second_segment_delay|
+------+-------+-----------+-------------------+--------------------+
|   ABQ|    SFO|        JFK|                 -7|                  55|
|   ABQ|    SFO|        DFW|                 -7|                 134|
|   ABQ|    SFO|        ORD|                 -7|                  32|
|   ABQ|    SFO|        DFW|                 -7|                   3|
|   ABQ|    SFO|        ORD|                 -7|                 124|
|   ABQ|    SFO|        LAX|                 -7|                 139|
|   ABQ|    SFO|        JFK|                 -7|                 133|
|   ABQ|    SFO|        ORD|                 -7|                 113|
|   ABQ|    SFO|        LAX|                 -7|                   8|
|   ABQ|    SFO|        MIA|                 -7|                  18|
|   ABQ|    SFO|        DFW|     

### Determine Airport Ranking in Desc. order using PageRank algorithm

In [23]:
pagerank_results = gf.pageRank(resetProbability=0.15, maxIter=10)
airport_ranking = pagerank_results.vertices.orderBy("pagerank", ascending=False)
print("Airport Ranking based on PageRank:")
airport_ranking.show()

Airport Ranking based on PageRank:
+--------------+-----+-------+---+------------------+
|          City|State|Country| id|          pagerank|
+--------------+-----+-------+---+------------------+
|       Atlanta|   GA|    USA|ATL|29.615367151902028|
|        Dallas|   TX|    USA|DFW|21.412549106054627|
|       Chicago|   IL|    USA|ORD|20.784764668927355|
|        Denver|   CO|    USA|DEN|15.525851214594773|
|   Los Angeles|   CA|    USA|LAX|14.240991985905305|
|       Houston|   TX|    USA|IAH|12.566621320447185|
| San Francisco|   CA|    USA|SFO|11.258453926970866|
|       Phoenix|   AZ|    USA|PHX| 10.55374157361721|
|Salt Lake City|   UT|    USA|SLC| 9.330416999448307|
|     Las Vegas|   NV|    USA|LAS| 8.534780587472302|
|       Seattle|   WA|    USA|SEA| 7.370610900510238|
|       Orlando|   FL|    USA|MCO| 7.190221731843621|
|     Charlotte|   NC|    USA|CLT| 7.181012806544953|
|        Newark|   NJ|    USA|EWR|7.1269623643872615|
|       Detroit|   MI|    USA|DTW| 6.8458660708

## Determine the most popular flights (single city hops)

In [24]:
from pyspark.sql.functions import count


In [25]:
popular_flights = gf.edges.groupBy("src", "dst").agg(count("*").alias("flight_count"))
most_popular_flights = popular_flights.orderBy("flight_count", ascending=False)
print("Most popular single city hops:")
most_popular_flights.show()

Most popular single city hops:
+---+---+------------+
|src|dst|flight_count|
+---+---+------------+
|SFO|LAX|        3232|
|LAX|SFO|        3198|
|LAS|LAX|        3016|
|LAX|LAS|        2964|
|JFK|LAX|        2720|
|LAX|JFK|        2719|
|ATL|LGA|        2501|
|LGA|ATL|        2500|
|LAX|PHX|        2394|
|PHX|LAX|        2387|
|HNL|OGG|        2380|
|OGG|HNL|        2379|
|LAX|SAN|        2215|
|SAN|LAX|        2214|
|SJC|LAX|        2208|
|LAX|SJC|        2201|
|ATL|MCO|        2136|
|MCO|ATL|        2090|
|JFK|SFO|        2084|
|SFO|JFK|        2084|
+---+---+------------+
only showing top 20 rows



### Find and Save a Subragph that obtained from the following pattern:
#### The flight starts from an airport and return back to the same airport through 2 other airports.

In [26]:
small_edges = gf.edges.limit(1000)
small_vertices = gf.vertices.limit(1000)
small_graph = GraphFrame(small_vertices, small_edges)

round_trip_pattern = small_graph.find("(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(a)")
round_trip_pattern.show()


+---+---+---+---+---+---+
|  a| e1|  b| e2|  c| e3|
+---+---+---+---+---+---+
+---+---+---+---+---+---+

