<a href="https://colab.research.google.com/github/aekanun2020/2022-PUB_COC-Data-Science-for-Tourism/blob/main/inClass5Oct_WORKSHOP_Spark_Missing_EDAandDataPrep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.1-bin-hadoop2.7"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
   .appName("Neural Network Model") \
   .config("spark.executor.memory", "3gb") \
   .getOrCreate()
   
sc = spark.sparkContext

sc

In [2]:
! wget https://storage.googleapis.com/5-7sep2022/data/2008.csv

--2022-10-05 07:03:00--  https://storage.googleapis.com/5-7sep2022/data/2008.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.125.128, 142.250.136.128, 142.250.148.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.125.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 689413344 (657M) [text/csv]
Saving to: ‘2008.csv’


2022-10-05 07:03:06 (114 MB/s) - ‘2008.csv’ saved [689413344/689413344]



In [25]:
# Load the CSV with its first line as the header and let Spark infer each
# column's type from the data.
raw_df = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('./2008.csv')
)

# **2. Data Understanding**

ดู Missing, Dispersion, Outlier

In [26]:
# Print the column names and inferred types of raw_df.
raw_df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Car

In [27]:
# Total number of rows in raw_df.
raw_df.count()

7009728

In [28]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# converted to pandas and flipped so that each source column is one row.
raw_df.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Year,7009728,2008.0,1.6928719673098015E-13,2008,2008
Month,7009728,6.375129676928976,3.4067366032507986,1,12
DayofMonth,7009728,15.72801483880687,8.797067919493633,1,31
DayOfWeek,7009728,3.9241815088973495,1.9882589459851212,1,7
DepTime,7009728,1333.8300461105448,478.06889486629836,1,
CRSDepTime,7009728,1326.0856632382884,464.2509106889191,0,2359
ArrTime,7009728,1481.258226684178,505.22512933801556,1,
CRSArrTime,7009728,1494.8011536253618,482.67282151078666,0,2400
UniqueCarrier,7009728,,,9E,YV


ข้างบนเห็นว่า มี Null อยู่ที่ CancellationCode และ TailNum

In [29]:
from pyspark.sql import functions as sparkf

In [30]:
# Count true nulls per column: when() yields a non-null literal only for null
# cells, and count() tallies those. The result is one row, transposed so each
# column name becomes a row label.
raw_df.select(*(
    sparkf.count(sparkf.when(sparkf.col(name).isNull(), name)).alias(name)
    for name in raw_df.columns
)).toPandas().T



Unnamed: 0,0
Year,0
Month,0
DayofMonth,0
DayOfWeek,0
DepTime,0
CRSDepTime,0
ArrTime,0
CRSArrTime,0
UniqueCarrier,0
FlightNum,0


In [31]:
# Frequency of each distinct ArrDelay value, sorted by the value itself in
# descending order (ArrDelay is a string column here, so the sort is textual).
(
    raw_df
    .groupBy('ArrDelay')
    .count()
    .orderBy(sparkf.desc('ArrDelay'))
    .show()
)

+--------+------+
|ArrDelay| count|
+--------+------+
|      NA|154699|
|     998|     2|
|     994|     2|
|     993|     1|
|     992|     1|
|     991|     1|
|     990|     2|
|      99|  3900|
|     989|     1|
|     988|     3|
|     984|     2|
|     983|     2|
|     982|     2|
|     981|     1|
|     980|     1|
|      98|  3877|
|     979|     1|
|     978|     1|
|     977|     1|
|     976|     2|
+--------+------+
only showing top 20 rows



In [41]:
from pyspark.sql.types import *

In [49]:
# Re-type the two delay columns from string to double, keeping their names
# and positions in the frame.
correctedType_df = (
    raw_df
    .withColumn('DepDelay', sparkf.col('DepDelay').cast('double'))
    .withColumn('ArrDelay', sparkf.col('ArrDelay').cast('double'))
)

In [50]:
# Number of rows where both delay columns are non-null after the cast.
(
    correctedType_df
    .select('ArrDelay', 'DepDelay')
    .na.drop()
    .count()
)

6855029

In [51]:
# Correlation between arrival and departure delay, computed on the rows where
# both values are present.
(
    correctedType_df
    .select('ArrDelay', 'DepDelay')
    .na.drop()
    .stat.corr('ArrDelay', 'DepDelay')
)

0.9313907801110134

In [None]:
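# Columns to carry into data preparation (the numbers are presumably correlation coefficients - TODO confirm): CarrierDelay: 0.5, WeatherDelay: 0.2, UniqueCarrier, DayofMonth, DayofWeek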

# **3. Data Preparation**

In [53]:
# Keep only the columns used in the rest of the preparation steps.
selectedCol_df = raw_df.select([
    'ArrDelay',
    'DepDelay',
    'CarrierDelay',
    'WeatherDelay',
    'UniqueCarrier',
    'DayofMonth',
    'DayofWeek',
])

In [54]:
# Print the column names and types of the selected columns (the delay columns
# come straight from raw_df, so they are still strings at this point).
selectedCol_df.printSchema()

root
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- CarrierDelay: string (nullable = true)
 |-- WeatherDelay: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayofWeek: integer (nullable = true)



In [55]:
# Print summary statistics (count, mean, stddev, min, max) for the selected
# columns as a text table.
selectedCol_df.describe().show()

+-------+-----------------+------------------+------------------+-----------------+-------------+-----------------+------------------+
|summary|         ArrDelay|          DepDelay|      CarrierDelay|     WeatherDelay|UniqueCarrier|       DayofMonth|         DayofWeek|
+-------+-----------------+------------------+------------------+-----------------+-------------+-----------------+------------------+
|  count|          7009728|           7009728|           7009728|          7009728|      7009728|          7009728|           7009728|
|   mean| 8.16845238729114| 9.972570088930182|15.772063342154539|3.039031044738922|         null|15.72801483880687|3.9241815088973495|
| stddev|38.50193694882867|35.311270777552785| 40.09911594635346|19.50287413039825|         null|8.797067919493633|1.9882589459851212|
|    min|               -1|                -1|                 0|                0|           9E|                1|                 1|
|    max|               NA|                NA|         

In [56]:
# Preview only (nothing is assigned): cast the four delay columns to double
# and print summary statistics, so the counts reflect numeric values only.
(
    selectedCol_df
    .withColumn('DepDelay', sparkf.col('DepDelay').cast('double'))
    .withColumn('ArrDelay', sparkf.col('ArrDelay').cast('double'))
    .withColumn('CarrierDelay', sparkf.col('CarrierDelay').cast('double'))
    .withColumn('WeatherDelay', sparkf.col('WeatherDelay').cast('double'))
    .describe()
    .show()
)

+-------+-----------------+------------------+------------------+-----------------+-------------+-----------------+------------------+
|summary|         ArrDelay|          DepDelay|      CarrierDelay|     WeatherDelay|UniqueCarrier|       DayofMonth|         DayofWeek|
+-------+-----------------+------------------+------------------+-----------------+-------------+-----------------+------------------+
|  count|          6855029|           6873482|           1524735|          1524735|      7009728|          7009728|           7009728|
|   mean| 8.16845238729114| 9.972570088930182|15.772063342154539|3.039031044738922|         null|15.72801483880687|3.9241815088973495|
| stddev|38.50193694882867|35.311270777552785| 40.09911594635346|19.50287413039825|         null|8.797067919493633|1.9882589459851212|
|    min|           -519.0|            -534.0|               0.0|              0.0|           9E|                1|                 1|
|    max|           2461.0|            2467.0|         

In [58]:
# Build the analytics base table: cast the four delay columns to double, then
# drop every row that has a null in any column.
# NOTE(review): the recorded describe() output for ABT_df shows 1,524,735 rows
# with a minimum ArrDelay of 15.0, so this dropna() keeps only a subset of
# flights - confirm that restriction is intended.
ABT_df = (
    selectedCol_df
    .withColumn('DepDelay', sparkf.col('DepDelay').cast('double'))
    .withColumn('ArrDelay', sparkf.col('ArrDelay').cast('double'))
    .withColumn('CarrierDelay', sparkf.col('CarrierDelay').cast('double'))
    .withColumn('WeatherDelay', sparkf.col('WeatherDelay').cast('double'))
    .na.drop()
)

In [59]:
# Summary statistics of the analytics base table, one row per column.
ABT_df.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
ArrDelay,1524735,56.821672618520594,57.163519606845306,15.0,2461.0
DepDelay,1524735,48.53711431822579,59.10622675425502,-61.0,2467.0
CarrierDelay,1524735,15.772063342154539,40.09911594635346,0.0,2436.0
WeatherDelay,1524735,3.039031044738922,19.50287413039825,0.0,1352.0
UniqueCarrier,1524735,,,9E,YV
DayofMonth,1524735,15.700510580527109,8.781591233385774,1,31
DayofWeek,1524735,3.9639150409743333,1.9844368422404373,1,7


In [60]:
# Print the final column names and types of the analytics base table.
ABT_df.printSchema()

root
 |-- ArrDelay: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- CarrierDelay: double (nullable = true)
 |-- WeatherDelay: double (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayofWeek: integer (nullable = true)

