## The following section is for Colab Users.
### Just run the following code cells

In [None]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://bitbucket.org/habedi/datasets/raw/b6769c4664e7ff68b001e2f43bc517888cbe3642/spark/spark-3.0.2-bin-hadoop2.7.tgz
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!rm -rf spark-3.0.2-bin-hadoop2.7.tgz*
!pip -q install findspark pyspark graphframes

In [None]:
# Download the GraphFrames 0.8.2 (Spark 3.0, Scala 2.12) jar into Spark's jars/ directory.
!wget https://repos.spark-packages.org/graphframes/graphframes/0.8.2-spark3.0-s_2.12/graphframes-0.8.2-spark3.0-s_2.12.jar -P /content/spark-3.0.2-bin-hadoop2.7/jars/
# Keep a second copy of the same file under a .zip name in the Spark home directory.
# NOTE(review): presumably so the Python sources bundled in the jar can be loaded as a zip — confirm it is still needed.
!cp /content/spark-3.0.2-bin-hadoop2.7/jars/graphframes-0.8.2-spark3.0-s_2.12.jar /content/spark-3.0.2-bin-hadoop2.7/graphframes-0.8.2-spark3.0-s_2.12.zip

In [None]:
import os

# Location of the Spark distribution unpacked by the Colab setup cell.
SPARK_DIR = "/content/spark-3.0.2-bin-hadoop2.7"

# Point Java/Spark/Hadoop at the Colab install locations and configure how
# PySpark is launched, all in a single update of the process environment.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": SPARK_DIR,
        "HADOOP_HOME": SPARK_DIR,  # HADOOP_HOME mirrors SPARK_HOME
        "PYSPARK_DRIVER_PYTHON": "jupyter",
        "PYSPARK_DRIVER_PYTHON_OPTS": "notebook",
        "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell",
    }
)

In [None]:
import findspark
# Initialise findspark with no explicit path so PySpark can be imported in the cells below.
# NOTE(review): presumably it resolves the install through SPARK_HOME, which is set earlier — confirm.
findspark.init()

In [None]:
import os

# A `!export ...` line runs in its own short-lived subshell, so the variable is
# gone as soon as that line finishes and the kernel never sees it. Assigning
# through os.environ changes the environment of this process (and of anything
# it launches afterwards), which is what these settings are meant to do.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "notebook"

In [None]:
from pyspark.sql import SparkSession
# Import GraphFrame by name instead of `import *` so it is clear where the
# name comes from; it is the only graphframes name the visible cells use.
from graphframes import GraphFrame

# Reuse an already-running session if there is one; otherwise start a local
# session named "GraphFrames" with master local[*].
spark = SparkSession.builder.master("local[*]").appName("GraphFrames").getOrCreate()

In [None]:
# Request the same GraphFrames build as the jar downloaded during setup
# (0.8.2 for Spark 3.0 / Scala 2.12); the coordinate previously said 0.8.1.
# NOTE(review): PySpark reads PYSPARK_SUBMIT_ARGS when it launches the JVM, so
# this assignment only affects a Spark session started after it runs — it does
# not change a session that already exists.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--packages graphframes:graphframes:0.8.2-spark3.0-s_2.12 pyspark-shell"

**************************************************************************
**************************************************************************
**************************************************************************

In [None]:
from IPython.display import display, HTML

# Inject a CSS rule that stops <pre> output from wrapping, so wide
# DataFrame tables printed by .show() keep their column alignment.
NO_WRAP_CSS = "<style>pre { white-space: pre !important; }</style>"
display(HTML(NO_WRAP_CSS))

### Read departuredelays.csv into the Edge DataFrame
### Read airport-codes-na.txt into the Vertix DataFrame (the separator is a tab, i.e. sep = '\t')

In [None]:
import pyspark

# Display the installed PySpark version as the cell's output.
pyspark.__version__

'3.2.1'

In [None]:
from pyspark.sql import SparkSession
# Import GraphFrame by name instead of `import *` so it is clear where the
# name comes from; it is the only graphframes name the visible cells use.
from graphframes import GraphFrame

# Reuse an already-running session if there is one; otherwise start a local
# session named "GraphFrames" with master local[*].
spark = SparkSession.builder.master("local[*]").appName("GraphFrames").getOrCreate()

22/07/13 21:10:52 WARN Utils: Your hostname, mohamed-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
22/07/13 21:10:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/07/13 21:10:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Airport codes are tab-separated with a header row; let Spark infer column types.
Vertix = spark.read.csv(
    'airport-codes-na.txt',
    sep='\t',
    header=True,
    inferSchema=True,
)

In [None]:
# Departure-delay records: comma-separated with a header row; column types are inferred.
Edge = spark.read.csv(
    'departuredelays.csv',
    header=True,
    inferSchema=True,
)

                                                                                

#### The US flight delays data set has five columns:
- The <b>date</b> column contains an integer like 02190925. When converted, this maps to 02-19 09:25 am. (Because the column is read as an integer, a leading zero is dropped: 1011245 means 01-01 12:45.)
- The <b>delay</b> column gives the delay in minutes between the scheduled and actual departure times. Early departures show negative numbers.
- The <b>distance</b> column gives the distance in miles from the origin airport to the destination airport.
- The <b>origin</b> column contains the origin IATA airport code.
- The <b>destination</b> column contains the destination IATA airport code.

#### The airport-codes data set has four columns:
- The <b>IATA</b> column contains IATA airport code.
- The <b>City, State, and Country</b> columns contain information about the airport location.

In [None]:
# Preview the first five airport rows to check that the file parsed as expected.
Vertix.show(n=5)

+----------+-----+-------+----+
|      City|State|Country|IATA|
+----------+-----+-------+----+
|Abbotsford|   BC| Canada| YXX|
|  Aberdeen|   SD|    USA| ABR|
|   Abilene|   TX|    USA| ABI|
|     Akron|   OH|    USA| CAK|
|   Alamosa|   CO|    USA| ALS|
+----------+-----+-------+----+
only showing top 5 rows



### In the Vertix DataFrame, drop any duplicated rows with the same IATA code.

In [None]:
# Keep a single row per IATA code.
Vertix = Vertix.dropDuplicates(subset=['IATA'])

### In the edges DataFrame:
- Rename the <b>date</b> column to <b>tripid</b>.
- Rename the <b>origin</b> column to <b>src</b>.
- Rename the <b>destination</b> column to <b>dst</b>.

In [None]:
# Preview the first five flight rows before renaming any columns.
Edge.show(n=5)

+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
+-------+-----+--------+------+-----------+
only showing top 5 rows



In [None]:
# Rename the edge columns as the task asks: date -> tripid, origin -> src,
# destination -> dst.
# The first rename previously targeted 'data' (a typo for 'date').
# withColumnRenamed does nothing when the source column is missing, so the
# typo raised no error and 'tripid' was never created.
Edge = (
    Edge
    .withColumnRenamed('date', 'tripid')
    .withColumnRenamed('origin', 'src')
    .withColumnRenamed('destination', 'dst')
)

### In the Vertix DataFrame:
- Rename the <b>IATA</b> column to <b>id</b>.

In [None]:
# The IATA code becomes the vertex key column, named "id".
Vertix = Vertix.withColumnRenamed(existing='IATA', new='id')

### Create a GraphFrame from the Vertix and Edge DataFrames

In [None]:
# Confirm the vertex frame now exposes an "id" column.
Vertix.show(n=5)

+-----------+-----+-------+---+
|       City|State|Country| id|
+-----------+-----+-------+---+
|  Allentown|   PA|    USA|ABE|
|    Abilene|   TX|    USA|ABI|
|Albuquerque|   NM|    USA|ABQ|
|   Aberdeen|   SD|    USA|ABR|
|     Albany|   GA|    USA|ABY|
+-----------+-----+-------+---+
only showing top 5 rows



In [None]:
# Confirm the edge frame's column names after the renames.
Edge.show(n=5)

+-------+-----+--------+---+---+
|   date|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1011245|    6|     602|ABE|ATL|
|1020600|   -8|     369|ABE|DTW|
|1021245|   -2|     602|ABE|ATL|
|1020605|   -4|     602|ABE|ATL|
|1031245|   -4|     602|ABE|ATL|
+-------+-----+--------+---+---+
only showing top 5 rows



In [None]:
# Build the flight graph: airports (Vertix) are the vertices, flights (Edge) are the edges.
gf = GraphFrame(Vertix, Edge)

In [None]:
# Preview the first five vertices held by the graph.
gf.vertices.show(n=5)

+-----------+-----+-------+---+
|       City|State|Country| id|
+-----------+-----+-------+---+
|  Allentown|   PA|    USA|ABE|
|    Abilene|   TX|    USA|ABI|
|Albuquerque|   NM|    USA|ABQ|
|   Aberdeen|   SD|    USA|ABR|
|     Albany|   GA|    USA|ABY|
+-----------+-----+-------+---+
only showing top 5 rows



### Determine the number of airports

In [None]:
# Count the rows of the vertex id column: one row per airport.
airport_ids = gf.vertices.select('id')
airport_ids.count()

524

### Determine the number of trips 

In [None]:
# Count the edge rows; each row of the edge frame is one flight record.
gf.edges.count()

                                                                                

1391578

### What is the longest delay?

In [None]:
# Largest value in the delay column, taken over all flights (no grouping keys).
gf.edges.groupBy().max('delay').show()

+----------+
|max(delay)|
+----------+
|      1642|
+----------+



                                                                                

### Find out the number of delayed flights vs. early flights (flights that departed before their scheduled time)

In [None]:
# Quick look at the edge rows before counting delayed and early departures.
gf.edges.show(n=5)

+-------+-----+--------+---+---+
|   date|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1011245|    6|     602|ABE|ATL|
|1020600|   -8|     369|ABE|DTW|
|1021245|   -2|     602|ABE|ATL|
|1020605|   -4|     602|ABE|ATL|
|1031245|   -4|     602|ABE|ATL|
+-------+-----+--------+---+---+
only showing top 5 rows



In [None]:
# Positive delay = departed late, negative delay = departed early.
# Flights with a delay of exactly 0 are counted in neither group.
delays = gf.edges.select('delay')
late_count = delays.where(gf.edges.delay > 0).count()
early_count = delays.where(gf.edges.delay < 0).count()
print(f"Delayed flights: {late_count}\nvs\nEarly Flights: {early_count}")

                                                                                

Delayed flights: 591727
vs
Early Flights: 668729


### What flight destinations departing SFO are most likely to have significant delays? Select the top 10
#### Hint: you should get the average delay for each destination for trips that depart from SFO only

In [None]:
from pyspark.sql.functions import avg, col

# Flights departing SFO, reduced to the columns needed here.
sfo_departures = gf.edges.where(col('src') == 'SFO').select('src', 'dst', 'delay')

# Mean delay per destination.
avg_delay_by_dst = sfo_departures.groupBy('dst').agg(avg('delay').alias('Average Delay'))

# Ten destinations with the highest mean delay; only the codes are displayed.
avg_delay_by_dst.orderBy(col('Average Delay').desc()).select('dst').show(10)



+---+
|dst|
+---+
|JAC|
|OKC|
|SUN|
|COS|
|SAT|
|STL|
|HNL|
|ASE|
|CEC|
|MDW|
+---+
only showing top 10 rows



                                                                                

### Find the Incoming connections to the airport sorted in Desc. order.

In [None]:
# Incoming flights per airport, busiest first (show() prints the top 20 by default).
incoming = gf.inDegrees
incoming.orderBy(col('inDegree').desc()).show()



+---+--------+
| id|inDegree|
+---+--------+
|ATL|   90434|
|DFW|   66050|
|ORD|   61967|
|LAX|   53601|
|DEN|   50921|
|IAH|   42700|
|PHX|   39721|
|SFO|   38988|
|LAS|   32994|
|CLT|   28388|
|MCO|   27959|
|EWR|   27652|
|LGA|   25469|
|BOS|   25360|
|SLC|   25323|
|JFK|   23484|
|DTW|   23310|
|SEA|   23074|
|MSP|   22385|
|MIA|   21805|
+---+--------+
only showing top 20 rows



                                                                                

### Find the Outgoing connections from the airport sorted in Desc. order.

In [None]:
# Outgoing flights per airport, busiest first (show() prints the top 20 by default).
outgoing = gf.outDegrees
outgoing.orderBy(col('outDegree').desc()).show()

[Stage 38:>                                                         (0 + 4) / 4]

+---+---------+
| id|outDegree|
+---+---------+
|ATL|    91484|
|DFW|    68482|
|ORD|    64228|
|LAX|    54086|
|DEN|    53148|
|IAH|    43361|
|PHX|    40155|
|SFO|    39483|
|LAS|    33107|
|CLT|    28402|
|MCO|    28313|
|EWR|    27656|
|SLC|    25868|
|LGA|    25458|
|BOS|    25348|
|MSP|    24031|
|JFK|    23572|
|DTW|    23421|
|SEA|    23078|
|MIA|    21817|
+---+---------+
only showing top 20 rows



                                                                                

### Use motif finding to answer this question: which delays could we blame on SFO?
#### Hint: this practically means that SFO is a transit station

In [None]:
# Two consecutive flights: v1 -> v2 -> v3, with v2 acting as the transit airport.
two_hop_pattern = "(v1)-[e1]->(v2); (v2)-[e2]->(v3)"
motifs = gf.find(two_hop_pattern)
motifs.show(n=5)



+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  v1|                  e1|                  v2|                  e2|                  v3|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|{Dallas, TX, USA,...|{1011810, -4, 78,...|{Waco, TX, USA, ACT}|{1011920, -8, 78,...|{Dallas, TX, USA,...|
|{Dallas, TX, USA,...|{1011810, -4, 78,...|{Waco, TX, USA, ACT}|{1011130, -7, 78,...|{Dallas, TX, USA,...|
|{Dallas, TX, USA,...|{1011810, -4, 78,...|{Waco, TX, USA, ACT}|{1011720, 18, 78,...|{Dallas, TX, USA,...|
|{Dallas, TX, USA,...|{1011810, -4, 78,...|{Waco, TX, USA, ACT}|{1011430, -5, 78,...|{Dallas, TX, USA,...|
|{Dallas, TX, USA,...|{1011810, -4, 78,...|{Waco, TX, USA, ACT}|{1010610, -1, 78,...|{Dallas, TX, USA,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [None]:
# Print the schema of the motif result: one struct column per named vertex/edge in the pattern.
motifs.printSchema()

root
 |-- v1: struct (nullable = false)
 |    |-- City: string (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- Country: string (nullable = true)
 |    |-- id: string (nullable = true)
 |-- e1: struct (nullable = false)
 |    |-- date: integer (nullable = true)
 |    |-- delay: integer (nullable = true)
 |    |-- distance: integer (nullable = true)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |-- v2: struct (nullable = false)
 |    |-- City: string (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- Country: string (nullable = true)
 |    |-- id: string (nullable = true)
 |-- e2: struct (nullable = false)
 |    |-- date: integer (nullable = true)
 |    |-- delay: integer (nullable = true)
 |    |-- distance: integer (nullable = true)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |-- v3: struct (nullable = false)
 |    |-- City: string (nullable = true)
 |    |-- State: string (

In [None]:
# Keep only the two-hop paths whose middle (transit) airport is SFO.
sfo_transit = motifs.where("v2.id == 'SFO'")
sfo_transit.show(n=5)

                                                                                

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  v1|                  e1|                  v2|                  e2|                  v3|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|{Albuquerque, NM,...|{1010600, -7, 779...|{San Francisco, C...|{1011250, 55, 224...|{New York, NY, US...|
|{Albuquerque, NM,...|{1010600, -7, 779...|{San Francisco, C...|{1012230, 0, 2247...|{New York, NY, US...|
|{Albuquerque, NM,...|{1010600, -7, 779...|{San Francisco, C...|{1010705, -7, 224...|{New York, NY, US...|
|{Albuquerque, NM,...|{1010600, -7, 779...|{San Francisco, C...|{1010620, -3, 224...|{Miami, FL, USA, ...|
|{Albuquerque, NM,...|{1010600, -7, 779...|{San Francisco, C...|{1010915, -3, 293...|{Los Angeles, CA,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



### Determine Airport Ranking in Desc. order using PageRank algorithm

In [None]:
# Run PageRank for a fixed five iterations.
# NOTE(review): resetProbability=0.9 is much higher than the 0.15 commonly
# used in the GraphFrames examples — confirm this value is intentional.
results = gf.pageRank(maxIter=5, resetProbability=0.9)

                                                                                

In [None]:
# Airports ordered by PageRank score, highest first; show the top five.
ranked_airports = results.vertices.orderBy(col('pagerank').desc())
ranked_airports.show(n=5)

                                                                                

+--------------+-----+-------+---+------------------+
|          City|State|Country| id|          pagerank|
+--------------+-----+-------+---+------------------+
|       Atlanta|   GA|    USA|ATL| 4.810793946480082|
|       Chicago|   IL|    USA|ORD|  3.68319369373655|
|        Dallas|   TX|    USA|DFW|3.5856509969909705|
|        Denver|   CO|    USA|DEN|  2.49930002859287|
|Salt Lake City|   UT|    USA|SLC| 2.270344538494024|
+--------------+-----+-------+---+------------------+
only showing top 5 rows



## Determine the most popular flights (single city hops)

In [None]:
# Quick look at the edge rows before grouping flights by route.
gf.edges.show(n=5)

+-------+-----+--------+---+---+
|   date|delay|distance|src|dst|
+-------+-----+--------+---+---+
|1011245|    6|     602|ABE|ATL|
|1020600|   -8|     369|ABE|DTW|
|1021245|   -2|     602|ABE|ATL|
|1020605|   -4|     602|ABE|ATL|
|1031245|   -4|     602|ABE|ATL|
+-------+-----+--------+---+---+
only showing top 5 rows



In [None]:
# Number of flights per (origin, destination) pair; show the five busiest routes.
route_counts = gf.edges.groupBy('src', 'dst').count()
route_counts.orderBy(col('count').desc()).show(n=5)

+---+---+-----+
|src|dst|count|
+---+---+-----+
|SFO|LAX| 3232|
|LAX|SFO| 3198|
|LAS|LAX| 3016|
|LAX|LAS| 2964|
|JFK|LAX| 2720|
+---+---+-----+
only showing top 5 rows



                                                                                

### Find and save a subgraph obtained from the following pattern:
#### The flight starts from an airport and returns to the same airport through 2 other airports.