In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os

# Point JAVA_HOME / SPARK_HOME at the JDK and Spark locations this notebook uses.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.1-bin-hadoop2.7",
})

In [3]:
import findspark
# Initialise findspark before importing pyspark
# (presumably it resolves the Spark install from SPARK_HOME — TODO confirm).
findspark.init()
from pyspark.sql import SparkSession
# Start (or reuse) a session against a local master; "local[*]" asks for all local cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Request a session named "Neural Network Model" with executor memory set to "3gb".
# NOTE(review): an earlier cell already called getOrCreate(), so this call
# presumably returns that existing session — confirm whether the app name and
# the executor-memory setting actually take effect here.
spark = SparkSession.builder \
   .appName("Neural Network Model") \
   .config("spark.executor.memory", "3gb") \
   .getOrCreate()
   
# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [6]:
# Display the SparkContext as this cell's output.
sc

**2. Data Understanding using SparkSQL**

In [7]:
! wget https://storage.googleapis.com/class25jan2022/share/2008.csv

--2022-02-04 15:54:15--  https://storage.googleapis.com/class25jan2022/share/2008.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.98.128, 142.250.97.128, 142.251.107.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.98.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 689413344 (657M) [text/csv]
Saving to: ‘2008.csv’


2022-02-04 15:54:18 (246 MB/s) - ‘2008.csv’ saved [689413344/689413344]



In [8]:
# Count the lines in the downloaded file (the header row is included in this number).
! wc -l ./2008.csv

7009729 ./2008.csv


In [9]:
# Peek at the header row and the first two data rows.
! head -3 2008.csv

Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
2008,1,3,4,2003,1955,2211,2225,WN,335,N712SW,128,150,116,-14,8,IAD,TPA,810,4,8,0,,0,NA,NA,NA,NA,NA
2008,1,3,4,754,735,1002,1000,WN,3231,N772SW,128,145,113,2,19,IAD,TPA,810,5,10,0,,0,NA,NA,NA,NA,NA


In [10]:
# Load the CSV using its header row for column names; malformed records are
# dropped instead of failing the read. No schema is supplied, so every column
# comes back as a string.
raw_df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('mode', 'DROPMALFORMED')
    .load('2008.csv')
)

In [11]:
# Number of rows loaded; one fewer than the `wc -l` line count because the header is not a row.
raw_df.count()

7009728

In [12]:
# Print the column names and types of the freshly loaded frame.
raw_df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: string (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: string (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: string (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: string (nullable = true)
 |-- CarrierDelay:

In [13]:
# Summary statistics for selected columns. The columns are still strings at this
# point, so min/max are string comparisons (note "NA" appearing as a max) and
# count includes the "NA" placeholders.
raw_df.describe('DepTime','TaxiOut','TaxiIn','DayOfWeek','Distance','ArrDelay').show()

+-------+------------------+------------------+-----------------+------------------+-----------------+-----------------+
|summary|           DepTime|           TaxiOut|           TaxiIn|         DayOfWeek|         Distance|         ArrDelay|
+-------+------------------+------------------+-----------------+------------------+-----------------+-----------------+
|  count|           7009728|           7009728|          7009728|           7009728|          7009728|          7009728|
|   mean|1333.8300461105448|16.453045177492882|6.860851704974527|3.9241815088973495|726.3870294253928| 8.16845238729114|
| stddev|478.06889486629836|11.332798654232155|4.933649371300466|1.9882589459851212|562.1018034840403|38.50193694882867|
|    min|                 1|                 0|                0|                 1|              100|               -1|
|    max|                NA|                NA|               NA|                 7|              999|               NA|
+-------+------------------+----

In [14]:
# Rows per month, largest month first.
(
    raw_df
    .groupBy('month')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+-----+------+
|month| count|
+-----+------+
|    7|627931|
|    3|616090|
|    8|612279|
|    6|608665|
|    5|606293|
|    1|605765|
|    4|598126|
|    2|569236|
|   10|556205|
|   12|544958|
|    9|540908|
|   11|523272|
+-----+------+



In [15]:
from pyspark.sql.functions import count,avg
from pyspark.sql import functions as sparkf

# Per-month row count and mean ArrDelay, ordered by count descending and then
# by average delay descending.
(
    raw_df
    .groupBy('month')
    .agg(
        count('month').alias('numMonth'),
        avg('ArrDelay').alias('avgArrDelay'),
    )
    .orderBy(
        sparkf.col('numMonth').desc(),
        sparkf.col('avgArrDelay').desc(),
    )
    .show()
)

+-----+--------+------------------+
|month|numMonth|       avgArrDelay|
+-----+--------+------------------+
|    7|  627931| 9.975049681276131|
|    3|  616090| 11.19236458018227|
|    8|  612279|  6.91091468997087|
|    6|  608665|13.266756009659792|
|    5|  606293| 5.978448290248828|
|    1|  605765|10.188855960349496|
|    4|  598126| 6.807297481094145|
|    2|  569236|13.077836997760205|
|   10|  556205|0.4154954706912698|
|   12|  544958|16.680505081496417|
|    9|  540908|0.6977328787273043|
|   11|  523272| 2.015857969430839|
+-----+--------+------------------+



In [16]:
from pyspark.sql.types import *
from pyspark.sql.functions import col, udf

In [17]:
# Re-type the numeric columns used later from string to double. Values that
# cannot be parsed (e.g. the "NA" placeholders) become null after the cast.
air_df = (
    raw_df
    .withColumn('DepTime', raw_df['DepTime'].cast(DoubleType()))
    .withColumn('TaxiOut', raw_df['TaxiOut'].cast(DoubleType()))
    .withColumn('TaxiIn', raw_df['TaxiIn'].cast(DoubleType()))
    .withColumn('DepDelay', raw_df['DepDelay'].cast(DoubleType()))
    .withColumn('DayOfWeek', raw_df['DayOfWeek'].cast(DoubleType()))
    .withColumn('Distance', raw_df['Distance'].cast(DoubleType()))
    .withColumn('ArrDelay', raw_df['ArrDelay'].cast(DoubleType()))
)

In [18]:
# Confirm that the seven cast columns are now typed as double.
air_df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: double (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- CRSDepTime: string (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: string (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: double (nullable = true)
 |-- TaxiIn: double (nullable = true)
 |-- TaxiOut: double (nullable = true)
 |-- Cancelled: string (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: string (nullable = true)
 |-- CarrierDelay:

In [19]:
# Make sure pandas is installed before it is imported in the next cell.
! pip install pandas



In [20]:
import pandas as pd

# Build the summary once and reuse it for both the rows and the column labels;
# previously the identical select().describe() was constructed twice.
summary_columns = ['DepTime','TaxiOut','TaxiIn',
                   'DayOfWeek','Distance','ArrDelay']
summary_sdf = air_df.select(summary_columns).describe()

# describe() yields five rows (count, mean, stddev, min, max); transpose so
# that each column of interest becomes one row of the displayed table.
pd.DataFrame(summary_sdf.take(5), columns=summary_sdf.columns).transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
DepTime,6873482,1333.8300461105448,478.06889486629836,1.0,2400.0
TaxiOut,6872670,16.453045177492882,11.332798654232155,0.0,429.0
TaxiIn,6858079,6.860851704974527,4.933649371300466,0.0,308.0
DayOfWeek,7009728,3.9241815088973495,1.9882589459851212,1.0,7.0
Distance,7009728,726.3870294253928,562.1018034840403,11.0,4962.0
ArrDelay,6855029,8.16845238729114,38.50193694882867,-519.0,2461.0


In [24]:
def t_timeperiod(origin):
    """Map a numeric clock value (looks like hhmm, e.g. 1830) to a time-of-day label.

    Args:
        origin: Numeric time value, or None.

    Returns:
        None when ``origin`` is None; one of '00.01-05.59', '06.00-11.59',
        '12.00-17.59', '18.00-24.00' for values in (0, 2400]; the string 'NA'
        for anything else (including 0 and values above 2400).
    """
    if origin is None:
        return None
    # Anything outside (0, 2400] has no bucket.
    if not (0 < origin <= 2400):
        return 'NA'
    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, label in (
        (600, '00.01-05.59'),
        (1200, '06.00-11.59'),
        (1800, '12.00-17.59'),
    ):
        if origin < upper_bound:
            return label
    return '18.00-24.00'

In [25]:
# Wrap t_timeperiod as a Spark UDF that returns a string column.
timeframe = udf(lambda x: t_timeperiod(x),StringType())

In [26]:
# Keep the discretised frame itself. Previously .show() was chained onto the
# assignment, so discretized_df ended up holding show()'s return value (None).
discretized_df = air_df.withColumn('DepTime', timeframe(air_df['DepTime']))

# Row count per time-of-day bucket, ordered by bucket label.
discretized_df.groupBy('DepTime').count().orderBy('DepTime').show()

+-----------+-------+
|    DepTime|  count|
+-----------+-------+
|       null| 136246|
|00.01-05.59| 179949|
|06.00-11.59|2643673|
|12.00-17.59|2554672|
|18.00-24.00|1495188|
+-----------+-------+



In [28]:
from pyspark.sql.functions import *


In [34]:
from pyspark.sql.functions import *
from pyspark.sql import functions as sparkf

# NOTE: the star import above is kept for compatibility with other cells, but it
# replaces Python's built-in min/max with Spark's aggregate functions. The
# qualified sparkf.* names below make it explicit which ones are meant.
# Both bounds come from a single aggregation instead of two separate jobs.
min_distance, max_distance = air_df.agg(
    sparkf.min('Distance'),
    sparkf.max('Distance'),
).first()

In [35]:
from pyspark.sql import functions as sparkf

# Use the qualified Spark aggregates so this cell no longer depends on a star
# import elsewhere having replaced the built-in min/max, and fetch both bounds
# in a single aggregation instead of two separate jobs.
min_ArrDelay, max_ArrDelay = air_df.agg(
    sparkf.min('ArrDelay'),
    sparkf.max('ArrDelay'),
).first()

In [36]:
def t_normalized_distance(origin):
    """Min-max scale a distance using the module-level min_distance / max_distance.

    Args:
        origin: Raw distance value, or None.

    Returns:
        None when ``origin`` is None, otherwise
        (origin - min_distance) / (max_distance - min_distance).
    """
    if origin is not None:
        offset = origin - min_distance
        span = max_distance - min_distance
        return offset / span
    return None

In [37]:
def t_normalized_ArrDelay(origin):
    """Min-max scale an arrival delay using the module-level min_ArrDelay / max_ArrDelay.

    Args:
        origin: Raw arrival-delay value, or None.

    Returns:
        None when ``origin`` is None, otherwise
        (origin - min_ArrDelay) / (max_ArrDelay - min_ArrDelay).
    """
    if origin is not None:
        offset = origin - min_ArrDelay
        span = max_ArrDelay - min_ArrDelay
        return offset / span
    return None

In [38]:
# Wrap t_normalized_distance as a Spark UDF that returns a double column.
normalized_distance = udf(lambda x: t_normalized_distance(x),DoubleType())


In [39]:
# Wrap t_normalized_ArrDelay as a Spark UDF that returns a double column.
normalized_ArrDelay = udf(lambda x: t_normalized_ArrDelay(x),DoubleType())


In [43]:
# Min-max scale Distance and ArrDelay with native column arithmetic instead of
# Python UDFs: the formula is the same (x - min) / (max - min) and null inputs
# still yield null, but values no longer pass through a Python function one
# at a time.
distance_range = max_distance - min_distance
arr_delay_range = max_ArrDelay - min_ArrDelay

normalized_df = (
    air_df
    .withColumn('Distance', (air_df['Distance'] - min_distance) / distance_range)
    .withColumn('ArrDelay', (air_df['ArrDelay'] - min_ArrDelay) / arr_delay_range)
)

In [44]:
# Narrow the frame to the carrier/route identifiers and the numeric columns.
features_df = normalized_df.select(
    'UniqueCarrier', 'Origin', 'Dest',
    'DepTime', 'TaxiOut', 'TaxiIn', 'DepDelay',
    'DayOfWeek', 'Distance', 'ArrDelay',
)


In [45]:
# Discard every row that has a null in any of the selected columns.
final_df = features_df.dropna(how='any')


In [46]:
# Rows remaining after the null rows were dropped.
final_df.count()


6855029

In [47]:
# Preview the first rows of the cleaned, normalised feature frame.
final_df.show()


+-------------+------+----+-------+-------+------+--------+---------+--------------------+-------------------+
|UniqueCarrier|Origin|Dest|DepTime|TaxiOut|TaxiIn|DepDelay|DayOfWeek|            Distance|           ArrDelay|
+-------------+------+----+-------+-------+------+--------+---------+--------------------+-------------------+
|           WN|   IAD| TPA| 2003.0|    8.0|   4.0|     8.0|      4.0| 0.16138153908301353|0.16946308724832215|
|           WN|   IAD| TPA|  754.0|   10.0|   5.0|    19.0|      4.0| 0.16138153908301353|0.17483221476510066|
|           WN|   IND| BWI|  628.0|   17.0|   3.0|     8.0|      4.0|  0.1017976166431024|0.17885906040268457|
|           WN|   IND| BWI|  926.0|    7.0|   3.0|    -4.0|      4.0|  0.1017976166431024|0.17214765100671142|
|           WN|   IND| BWI| 1829.0|   10.0|   3.0|    34.0|      4.0|  0.1017976166431024| 0.1855704697986577|
|           WN|   IND| JAX| 1940.0|   10.0|   4.0|    25.0|      4.0|  0.1367400525146435|0.17785234899328858|
|