In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.1-bin-hadoop2.7"

In [4]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [5]:
from pyspark.sql import SparkSession

In [6]:
spark = SparkSession.builder \
   .appName("Neural Network Model") \
   .config("spark.executor.memory", "3gb") \
   .getOrCreate()
   
sc = spark.sparkContext

In [7]:
sc

**2. Data Understanding using SparkSQL**

In [8]:
! wget https://storage.googleapis.com/class25jan2022/share/2008.csv

--2022-01-30 13:22:07--  https://storage.googleapis.com/class25jan2022/share/2008.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.195.128, 74.125.199.128, 74.125.20.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.195.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 689413344 (657M) [text/csv]
Saving to: ‘2008.csv’


2022-01-30 13:22:10 (212 MB/s) - ‘2008.csv’ saved [689413344/689413344]



In [9]:
! wc -l ./2008.csv

7009729 ./2008.csv


In [10]:
! head -3 2008.csv

Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
2008,1,3,4,2003,1955,2211,2225,WN,335,N712SW,128,150,116,-14,8,IAD,TPA,810,4,8,0,,0,NA,NA,NA,NA,NA
2008,1,3,4,754,735,1002,1000,WN,3231,N772SW,128,145,113,2,19,IAD,TPA,810,5,10,0,,0,NA,NA,NA,NA,NA


In [11]:
raw_df = spark.read.format('csv').\
option('header','true').option('mode','DROPMALFORMED')\
.load('2008.csv')

In [12]:
raw_df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: string (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: string (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: string (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: string (nullable = true)
 |-- CarrierDelay:

In [13]:
from pyspark.sql.functions import *

In [14]:
raw_df.count()

7009728

In [15]:
raw_df.show(5)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|2008|    1|         3|        4|   2003|      1955|   2211|      2225|           WN|      335

In [16]:
raw_df.head

<bound method DataFrame.head of DataFrame[Year: string, Month: string, DayofMonth: string, DayOfWeek: string, DepTime: string, CRSDepTime: string, ArrTime: string, CRSArrTime: string, UniqueCarrier: string, FlightNum: string, TailNum: string, ActualElapsedTime: string, CRSElapsedTime: string, AirTime: string, ArrDelay: string, DepDelay: string, Origin: string, Dest: string, Distance: string, TaxiIn: string, TaxiOut: string, Cancelled: string, CancellationCode: string, Diverted: string, CarrierDelay: string, WeatherDelay: string, NASDelay: string, SecurityDelay: string, LateAircraftDelay: string]>

In [17]:
raw_df.drop('Year', 'Month', 'DayofMonth', 'CRSDepTime', 'UniqueCarrier','FlightNum','TailNum','ActualElapsedTime','CRSElapsedTime','AirTime','DepDelay','Origin','Dest','Distance','Cancelled','CancellationCode','Diverted',
            'CarrierDelay','WeatherDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay','CRSArrTime','ArrTime').show(5) 

+---------+-------+--------+------+-------+
|DayOfWeek|DepTime|ArrDelay|TaxiIn|TaxiOut|
+---------+-------+--------+------+-------+
|        4|   2003|     -14|     4|      8|
|        4|    754|       2|     5|     10|
|        4|    628|      14|     3|     17|
|        4|    926|      -6|     3|      7|
|        4|   1829|      34|     3|     10|
+---------+-------+--------+------+-------+
only showing top 5 rows



In [18]:
raw_df_drop = raw_df.drop('Year', 'Month', 'DayofMonth', 'CRSDepTime', 'UniqueCarrier','FlightNum','TailNum','ActualElapsedTime','CRSElapsedTime','AirTime','DepDelay','Origin','Dest','Distance','Cancelled','CancellationCode','Diverted',
            'CarrierDelay','WeatherDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay','CRSArrTime','ArrTime')

In [19]:
raw_df_drop.count()

7009728

In [20]:
raw_df_drop.columns

['DayOfWeek', 'DepTime', 'ArrDelay', 'TaxiIn', 'TaxiOut']

In [21]:
raw_df_drop.describe().show()

+-------+------------------+------------------+-----------------+-----------------+------------------+
|summary|         DayOfWeek|           DepTime|         ArrDelay|           TaxiIn|           TaxiOut|
+-------+------------------+------------------+-----------------+-----------------+------------------+
|  count|           7009728|           7009728|          7009728|          7009728|           7009728|
|   mean|3.9241815088973495|1333.8300461105448| 8.16845238729114|6.860851704974527|16.453045177492882|
| stddev|1.9882589459851212|478.06889486629836|38.50193694882867|4.933649371300466|11.332798654232155|
|    min|                 1|                 1|               -1|                0|                 0|
|    max|                 7|                NA|               NA|               NA|                NA|
+-------+------------------+------------------+-----------------+-----------------+------------------+



In [22]:
raw_df.groupBy('Month').count().show()

+-----+------+
|Month| count|
+-----+------+
|    7|627931|
|   11|523272|
|    3|616090|
|    8|612279|
|    5|606293|
|    6|608665|
|    9|540908|
|    1|605765|
|   10|556205|
|    4|598126|
|   12|544958|
|    2|569236|
+-----+------+



In [23]:
raw_df_sort = raw_df.groupBy('Month').count()

In [29]:
raw_df_sort.sort(raw_df_sort.Month.desc()).show()

+-----+------+
|Month| count|
+-----+------+
|    9|540908|
|    8|612279|
|    7|627931|
|    6|608665|
|    5|606293|
|    4|598126|
|    3|616090|
|    2|569236|
|   12|544958|
|   11|523272|
|   10|556205|
|    1|605765|
+-----+------+

