In [1]:
# import findspark
# findspark.init()
import pyspark # Call this only after findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Reuse the SparkContext already attached to this kernel, or start one.
sc = SparkContext.getOrCreate()
# Obtain the session through the documented builder entry point instead of
# calling the SparkSession constructor directly. getOrCreate() picks up the
# active SparkContext and returns the existing session when the cell is re-run.
spark = SparkSession.builder.getOrCreate()

# Load the flight records: the first CSV row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv('Jan_2019_ontime.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- DAY_OF_MONTH: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- OP_CARRIER_AIRLINE_ID: integer (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DEST_AIRPORT_SEQ_ID: integer (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- DEP_DEL15: integer (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- ARR_TIME: integer (nullable = true)
 |-- ARR_DEL15: integer (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- DIVERTED: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)



In [2]:
# Vertex table: one row per distinct ORIGIN code, with that code exposed under
# the column name 'id'.
origin_airports = df.select('ORIGIN', 'ORIGIN_AIRPORT_ID')
unique_origin_airports = origin_airports.dropDuplicates(['ORIGIN'])
df_vertices = unique_origin_airports.withColumnRenamed('ORIGIN', 'id')

In [3]:
# Edge table: every row of df, with ORIGIN and DEST exposed under the column
# names 'src' and 'dst'.
flights_with_src = df.withColumnRenamed('ORIGIN', 'src')
df_edges = flights_with_src.withColumnRenamed('DEST', 'dst')

In [4]:
# Import the one name this notebook needs explicitly rather than with a
# wildcard, so it is obvious where GraphFrame comes from.
# NOTE: graphframes is not part of pyspark and must be available in the kernel
# environment; otherwise this cell fails with ModuleNotFoundError.
from graphframes import GraphFrame

# Graph of the flight data, built from the vertex and edge tables.
g = GraphFrame(df_vertices, df_edges)

ModuleNotFoundError: No module named 'graphframes'

In [10]:
# Edges flagged DEP_DEL15 == 1 and edges flagged ARR_DEL15 == 1.
factor_more_than_1 = g.edges.where('DEP_DEL15 == 1')
factor_more_than_12 = g.edges.where('ARR_DEL15 == 1')

# Distinct (src, DEP_DEL15) pairs among the first set, src descending.
df3 = (
    factor_more_than_1
    .select('src', 'DEP_DEL15')
    .dropDuplicates()
    .sort('src', ascending=False)
)

# Distinct (dst, ARR_DEL15) pairs among the second set, dst descending.
df4 = (
    factor_more_than_12
    .select('dst', 'ARR_DEL15')
    .dropDuplicates()
    .sort('dst', ascending=False)
)

In [11]:
# Keep only the codes that occur both as a dst in df4 and as a src in df3.
same_airport = df3.src == df4.dst
inner = df4.join(df3, on=same_airport, how='inner')
inner.show()

                                                                                

+---+---------+---+---------+
|dst|ARR_DEL15|src|DEP_DEL15|
+---+---------+---+---------+
|SMX|        1|SMX|        1|
|RHI|        1|RHI|        1|
|CAK|        1|CAK|        1|
|HPN|        1|HPN|        1|
|GUC|        1|GUC|        1|
|LAN|        1|LAN|        1|
|MBS|        1|MBS|        1|
|RAP|        1|RAP|        1|
|RNO|        1|RNO|        1|
|OTZ|        1|OTZ|        1|
|IMT|        1|IMT|        1|
|BLV|        1|BLV|        1|
|CDV|        1|CDV|        1|
|LEX|        1|LEX|        1|
|LGA|        1|LGA|        1|
|PNS|        1|PNS|        1|
|IAD|        1|IAD|        1|
|JMS|        1|JMS|        1|
|MSP|        1|MSP|        1|
|PHX|        1|PHX|        1|
+---+---------+---+---------+
only showing top 20 rows



In [12]:
# Number of edges per 'dst' value, largest counts first.
(
    g.edges
    .groupBy('dst')
    .count()
    .sort('count', ascending=False)
    .show()
)

[Stage 8:>                                                          (0 + 4) / 4]

+---+-----+
|dst|count|
+---+-----+
|ATL|31151|
|ORD|26212|
|DFW|23078|
|CLT|19105|
|DEN|18498|
|LAX|17977|
|PHX|14764|
|IAH|14586|
|LGA|13882|
|SFO|13702|
|LAS|13219|
|MSP|12184|
|DTW|12160|
|MCO|12042|
|DCA|11851|
|BOS|11429|
|EWR|10536|
|JFK|10483|
|SEA|10227|
|SLC| 9360|
+---+-----+
only showing top 20 rows



                                                                                

In [13]:
# Number of edges per 'src' value, largest counts first.
(
    g.edges
    .groupBy('src')
    .count()
    .sort('count', ascending=False)
    .show()
)

+---+-----+
|src|count|
+---+-----+
|ATL|31155|
|ORD|26216|
|DFW|23063|
|CLT|19100|
|DEN|18507|
|LAX|17988|
|PHX|14761|
|IAH|14598|
|LGA|13872|
|SFO|13689|
|LAS|13209|
|MSP|12180|
|DTW|12172|
|MCO|12045|
|DCA|11839|
|BOS|11430|
|EWR|10522|
|JFK|10485|
|SEA|10230|
|SLC| 9339|
+---+-----+
only showing top 20 rows





In [14]:
# The three vertices with the highest degree.
(
    g.degrees
    .sort('degree', ascending=False)
    .show(n=3)
)

[Stage 14:>                                                         (0 + 4) / 4]

+---+------+
| id|degree|
+---+------+
|ATL| 62306|
|ORD| 52428|
|DFW| 46141|
+---+------+
only showing top 3 rows



                                                                                

In [15]:
# Distinct (src, dst, DISTANCE) triples whose DISTANCE exceeds 500, ordered
# from the largest DISTANCE down.
route_distances = g.edges.select('src', 'dst', 'DISTANCE')
distance_greater = (
    route_distances
    .where('DISTANCE > 500')
    .dropDuplicates()
    .sort('DISTANCE', ascending=False)
)
distance_greater.show()



+---+---+--------+
|src|dst|DISTANCE|
+---+---+--------+
|HNL|JFK|    4983|
|JFK|HNL|    4983|
|HNL|EWR|    4962|
|EWR|HNL|    4962|
|IAD|HNL|    4817|
|HNL|IAD|    4817|
|HNL|ATL|    4502|
|ATL|HNL|    4502|
|ORD|HNL|    4243|
|HNL|ORD|    4243|
|OGG|ORD|    4184|
|ORD|OGG|    4184|
|MSP|HNL|    3972|
|HNL|MSP|    3972|
|IAH|HNL|    3904|
|HNL|IAH|    3904|
|HNL|GUM|    3801|
|GUM|HNL|    3801|
|DFW|HNL|    3784|
|HNL|DFW|    3784|
+---+---+--------+
only showing top 20 rows



                                                                                

In [16]:
from pyspark.sql.functions import desc

# Every distinct (src, dst, DISTANCE) triple among the edges, ordered from the
# largest DISTANCE down.
unique_routes = g.edges.select('src', 'dst', 'DISTANCE').dropDuplicates()
distance = unique_routes.orderBy(desc('DISTANCE'))
distance.show()

+---+---+--------+
|src|dst|DISTANCE|
+---+---+--------+
|HNL|JFK|    4983|
|JFK|HNL|    4983|
|HNL|EWR|    4962|
|EWR|HNL|    4962|
|IAD|HNL|    4817|
|HNL|IAD|    4817|
|HNL|ATL|    4502|
|ATL|HNL|    4502|
|ORD|HNL|    4243|
|HNL|ORD|    4243|
|OGG|ORD|    4184|
|ORD|OGG|    4184|
|MSP|HNL|    3972|
|HNL|MSP|    3972|
|IAH|HNL|    3904|
|HNL|IAH|    3904|
|HNL|GUM|    3801|
|GUM|HNL|    3801|
|DFW|HNL|    3784|
|HNL|DFW|    3784|
+---+---+--------+
only showing top 20 rows



[Stage 20:>                                                         (0 + 4) / 4]                                                                                

In [17]:
import pyspark.sql.functions as F

# Graph over the same vertices whose edges are the distinct
# (src, dst, DISTANCE) rows held in `distance`.
sub = GraphFrame(g.vertices, distance)

# Two-hop paths a -> b -> c for which no direct edge a -> c exists and whose
# endpoints are different vertices.
r = sub.find("(a)-[ab]->(b); (b)-[bc]->(c); !(a)-[]->(c)").filter('c.id != a.id')

# For every (a, c) pair keep the largest combined DISTANCE of the two hops.
# The DataFrame is bound to r2 first and displayed in a separate statement:
# show() only prints and returns None, so chaining it onto the assignment
# left r2 as None. Both grouping columns appear as 'id' in the result.
r2 = (
    r.withColumn("sum_distance", r.ab.DISTANCE + r.bc.DISTANCE)
    .groupby('a.id', 'c.id')
    .max('sum_distance')
    .sort(desc('max(sum_distance)'))
)
r2.show()




+---+---+-----------------+
| id| id|max(sum_distance)|
+---+---+-----------------+
|EWR|JFK|             9945|
|JFK|EWR|             9945|
|JFK|IAH|             8887|
|IAH|JFK|             8887|
|JFK|GUM|             8784|
|GUM|JFK|             8784|
|GUM|EWR|             8763|
|EWR|GUM|             8763|
|IAD|GUM|             8618|
|GUM|IAD|             8618|
|GUM|ATL|             8303|
|ATL|GUM|             8303|
|ORD|GUM|             8044|
|GUM|ORD|             8044|
|GUM|MSP|             7773|
|MSP|GUM|             7773|
|JFK|ANC|             7760|
|ANC|JFK|             7760|
|EWR|ANC|             7739|
|ANC|EWR|             7739|
+---+---+-----------------+
only showing top 20 rows



                                                                                

In [19]:
# Number of rows in the motif result `r`.
two_hop_match_count = r.count()
two_hop_match_count

                                                                                

222753

In [18]:
# Two-hop paths a -> b -> c with no direct a -> c edge, restricted to paths
# that start at JFK and end at EWR; up to 100 rows are printed.
sub2 = GraphFrame(g.vertices, distance)
two_hop_no_direct = sub2.find("(a)-[ab]->(b); (b)-[bc]->(c); !(a)-[]->(c)")
starting_at_jfk = two_hop_no_direct.where("a.id = 'JFK'")
result = starting_at_jfk.where("c.id = 'EWR'")
result.show(n=100)

                                                                                

+------------+----------------+------------+----------------+------------+
|           a|              ab|           b|              bc|           c|
+------------+----------------+------------+----------------+------------+
|{JFK, 12478}| {JFK, CLE, 425}|{CLE, 11042}| {CLE, EWR, 404}|{EWR, 11618}|
|{JFK, 12478}|{JFK, STT, 1623}|{STT, 15024}|{STT, EWR, 1634}|{EWR, 11618}|
|{JFK, 12478}|{JFK, EGE, 1746}|{EGE, 11503}|{EGE, EWR, 1725}|{EWR, 11618}|
|{JFK, 12478}| {JFK, RDU, 427}|{RDU, 14492}| {RDU, EWR, 416}|{EWR, 11618}|
|{JFK, 12478}|{JFK, PBI, 1028}|{PBI, 14027}|{PBI, EWR, 1023}|{EWR, 11618}|
|{JFK, 12478}|{JFK, MSY, 1182}|{MSY, 13495}|{MSY, EWR, 1167}|{EWR, 11618}|
|{JFK, 12478}|{JFK, JAC, 1894}|{JAC, 12441}|{JAC, EWR, 1874}|{EWR, 11618}|
|{JFK, 12478}| {JFK, ROC, 264}|{ROC, 14576}| {ROC, EWR, 246}|{EWR, 11618}|
|{JFK, 12478}| {JFK, ORF, 290}|{ORF, 13931}| {ORF, EWR, 284}|{EWR, 11618}|
|{JFK, 12478}|{JFK, SJC, 2569}|{SJC, 14831}|{SJC, EWR, 2548}|{EWR, 11618}|
|{JFK, 12478}| {JFK, ORD,