In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.1-bin-hadoop2.7"

In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [4]:
from pyspark.sql import SparkSession

In [5]:
from pyspark.sql.functions import count, mean, stddev_pop, min, max, avg, col, desc

In [6]:
from pyspark.sql.types import StringType, IntegerType, StructType, StructField

In [7]:
spark = SparkSession.builder \
   .appName("Neural Network Model") \
   .config("spark.executor.memory", "3gb") \
   .getOrCreate()
   
sc = spark.sparkContext

In [8]:
sc

**2. Data Understanding using SparkSQL**

In [9]:
! wget https://storage.googleapis.com/class25jan2022/share/2008.csv

--2022-01-31 03:34:13--  https://storage.googleapis.com/class25jan2022/share/2008.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.164.144, 172.253.62.128, 172.253.115.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.164.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 689413344 (657M) [text/csv]
Saving to: ‘2008.csv’


2022-01-31 03:34:18 (150 MB/s) - ‘2008.csv’ saved [689413344/689413344]



In [10]:
! wc -l ./2008.csv

7009729 ./2008.csv


In [11]:
! head -3 2008.csv

Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
2008,1,3,4,2003,1955,2211,2225,WN,335,N712SW,128,150,116,-14,8,IAD,TPA,810,4,8,0,,0,NA,NA,NA,NA,NA
2008,1,3,4,754,735,1002,1000,WN,3231,N772SW,128,145,113,2,19,IAD,TPA,810,5,10,0,,0,NA,NA,NA,NA,NA


In [14]:
raw_df = spark.read.format('csv').\
option('header','true').option('mode','DROPMALFORMED')\
.load('2008.csv')

In [None]:
raw_df.take(4)

[Row(Year='2008', Month='1', DayofMonth='3', DayOfWeek='4', DepTime='2003', CRSDepTime='1955', ArrTime='2211', CRSArrTime='2225', UniqueCarrier='WN', FlightNum='335', TailNum='N712SW', ActualElapsedTime='128', CRSElapsedTime='150', AirTime='116', ArrDelay='-14', DepDelay='8', Origin='IAD', Dest='TPA', Distance='810', TaxiIn='4', TaxiOut='8', Cancelled='0', CancellationCode=None, Diverted='0', CarrierDelay='NA', WeatherDelay='NA', NASDelay='NA', SecurityDelay='NA', LateAircraftDelay='NA'),
 Row(Year='2008', Month='1', DayofMonth='3', DayOfWeek='4', DepTime='754', CRSDepTime='735', ArrTime='1002', CRSArrTime='1000', UniqueCarrier='WN', FlightNum='3231', TailNum='N772SW', ActualElapsedTime='128', CRSElapsedTime='145', AirTime='113', ArrDelay='2', DepDelay='19', Origin='IAD', Dest='TPA', Distance='810', TaxiIn='5', TaxiOut='10', Cancelled='0', CancellationCode=None, Diverted='0', CarrierDelay='NA', WeatherDelay='NA', NASDelay='NA', SecurityDelay='NA', LateAircraftDelay='NA'),
 Row(Year='20

In [None]:
raw_df.count()

7009728

In [None]:
raw_df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: string (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: string (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: string (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: string (nullable = true)
 |-- CarrierDelay:

In [None]:
raw_df.describe('DepTime','TaxiOut','TaxiIn','DayOfWeek','Distance','ArrDelay').show()

+-------+------------------+------------------+-----------------+------------------+-----------------+-----------------+
|summary|           DepTime|           TaxiOut|           TaxiIn|         DayOfWeek|         Distance|         ArrDelay|
+-------+------------------+------------------+-----------------+------------------+-----------------+-----------------+
|  count|           7009728|           7009728|          7009728|           7009728|          7009728|          7009728|
|   mean|1333.8300461105448|16.453045177492882|6.860851704974527|3.9241815088973495|726.3870294253928| 8.16845238729114|
| stddev|478.06889486629836|11.332798654232155|4.933649371300466|1.9882589459851212|562.1018034840403|38.50193694882867|
|    min|                 1|                 0|                0|                 1|              100|               -1|
|    max|                NA|                NA|               NA|                 7|              999|               NA|
+-------+------------------+----

In [None]:
raw_df.groupby("Month").count().orderBy(col('count').desc()).show()

+-----+------+
|Month| count|
+-----+------+
|    7|627931|
|    3|616090|
|    8|612279|
|    6|608665|
|    5|606293|
|    1|605765|
|    4|598126|
|    2|569236|
|   10|556205|
|   12|544958|
|    9|540908|
|   11|523272|
+-----+------+



In [None]:
raw_df.groupby("Month").count().withColumnRenamed("count","distinct_name").sort(desc("count")).show()

+-----+-------------+
|Month|distinct_name|
+-----+-------------+
|    7|       627931|
|    3|       616090|
|    8|       612279|
|    6|       608665|
|    5|       606293|
|    1|       605765|
|    4|       598126|
|    2|       569236|
|   10|       556205|
|   12|       544958|
|    9|       540908|
|   11|       523272|
+-----+-------------+



In [None]:
df=raw_df.groupBy('Month').count()
df.sort("count",ascending=False).show()

+-----+------+
|Month| count|
+-----+------+
|    7|627931|
|    3|616090|
|    8|612279|
|    6|608665|
|    5|606293|
|    1|605765|
|    4|598126|
|    2|569236|
|   10|556205|
|   12|544958|
|    9|540908|
|   11|523272|
+-----+------+



In [None]:
df.sort(df[1].desc()).show()

+-----+------+
|Month| count|
+-----+------+
|    7|627931|
|    3|616090|
|    8|612279|
|    6|608665|
|    5|606293|
|    1|605765|
|    4|598126|
|    2|569236|
|   10|556205|
|   12|544958|
|    9|540908|
|   11|523272|
+-----+------+



In [None]:
raw_df.groupBy("Month").agg(count("Month"),avg("ArrDelay")) \
.withColumnRenamed("count(Month)","numMonth").withColumnRenamed("avg(ArrDelay)","avgArrDelay") \
.sort(desc("numMonth")) \
.show()

+-----+--------+------------------+
|Month|numMonth|       avgArrDelay|
+-----+--------+------------------+
|    7|  627931| 9.975049681276131|
|    3|  616090| 11.19236458018227|
|    8|  612279|  6.91091468997087|
|    6|  608665|13.266756009659792|
|    5|  606293| 5.978448290248828|
|    1|  605765|10.188855960349496|
|    4|  598126| 6.807297481094145|
|    2|  569236|13.077836997760205|
|   10|  556205|0.4154954706912698|
|   12|  544958|16.680505081496417|
|    9|  540908|0.6977328787273043|
|   11|  523272| 2.015857969430839|
+-----+--------+------------------+



# Data Cleaning Processes

In [None]:
newdataSchema = StructType([
     StructField('Year', StringType(), True),
     StructField('Month', StringType(), True),
     StructField('DayofMonth', StringType(), True),
     StructField('DayofWeek', StringType(), True),
     StructField('DepTime', StringType(), True),
     StructField('CRSDepTime', StringType(), True),
     StructField('ArrTime', StringType(), True),
     StructField('CRSArrTime', StringType(), True),
     StructField('UniqueCarrier', StringType(), True),
     StructField('FlightNum', StringType(), True),
     StructField('TailNum', StringType(), True),
     StructField('ActualElapedTime', StringType(), True),
     StructField('AirTime', StringType(), True),
     StructField('ArrDelay', IntegerType(), True),
     StructField('DepDelay', IntegerType(), True),
     StructField('Origin', StringType(), True),
     StructField('Dest', StringType(), True),
     StructField('Distance', StringType(), True),
     StructField('TaxiIn', StringType(), True),
     StructField('TaxiOut', StringType(), True),
     StructField('Cancelled', StringType(), True),
     StructField('CancellationCode', StringType(), True),
     StructField('Diverted', StringType(), True),
     StructField('CarrierDelay', StringType(), True),
     StructField('WeatherDelay', StringType(), True),
     StructField('NASDelay', StringType(), True),
     StructField('SecurityDelay', StringType(), True),
     StructField('LateAircraftDelay', StringType(), True)    
])

In [None]:
newraw_df = spark.read.format('csv').\
option('header','true').option('mode','DROPMALFORMED')\
.load('2008.csv', schema=newdataSchema)

In [None]:
newraw_df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayofWeek: string (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: string (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: string (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: string (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: string (nullable = true)
 |-- CarrierDelay: string (nullable = true)
 |-- WeatherDelay: 

In [None]:
newraw_df.count()

7009728

In [None]:
newraw_df.describe('DepTime','TaxiOut','TaxiIn','DayOfWeek','Distance','ArrDelay').show()

+-------+------------------+-----------------+-----------------+-----------------+--------+-----------------+
|summary|           DepTime|          TaxiOut|           TaxiIn|        DayOfWeek|Distance|         ArrDelay|
+-------+------------------+-----------------+-----------------+-----------------+--------+-----------------+
|  count|           6855029|          6855029|          6855029|          6855029| 6855029|          6855029|
|   mean|1333.7753760049739|6.860150263405158|728.7438337605865| 3.92580191856227|    null|104.0185891263188|
| stddev|478.05737977980044|4.931223682301741|563.2447305232778|1.988459406764337|    null|67.43979594730754|
|    min|                 1|                0|              100|                1|     ABE|                0|
|    max|               959|               99|              999|                7|     YUM|             1350|
+-------+------------------+-----------------+-----------------+-----------------+--------+-----------------+



In [None]:
newraw_df.groupBy('Month').count().sort(desc('count')).show()

+-----+------+
|Month| count|
+-----+------+
|    7|627931|
|    3|616090|
|    8|612279|
|    6|608665|
|    5|606293|
|    1|605765|
|    4|598126|
|    2|569236|
|   10|556205|
|   12|544958|
|    9|540908|
|   11|523272|
+-----+------+



In [None]:
newraw_df.groupBy("Month").agg(count("Month"),avg("ArrDelay")) \
.withColumnRenamed("count(Month)","numMonth").withColumnRenamed("avg(ArrDelay)","avgArrDelay") \
.sort(desc("numMonth")) \
.show()

+-----+--------+------------------+
|Month|numMonth|       avgArrDelay|
+-----+--------+------------------+
|    7|  615423|104.22379079104941|
|    8|  600750|103.78106533499792|
|    5|  599210| 104.3720331770164|
|    3|  598343|106.19393725672398|
|    6|  595458|104.76035757349804|
|    1|  587130|104.61805732972255|
|    4|  586723|104.26938265586998|
|   10|  552071|101.26398235009627|
|    2|  546925|105.31851350733648|
|    9|  530276|  100.878649986045|
|   12|  524747|105.08918393054176|
|   11|  517973|103.00546939705352|
+-----+--------+------------------+



In [None]:
newraw_df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

# For Reference Only