In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.1-bin-hadoop2.7"

In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder \
   .appName("HW3") \
   .config("spark.executor.memory", "3gb") \
   .getOrCreate()
   
sc = spark.sparkContext

In [6]:
sc

In [7]:
### Data understanding

In [8]:
! wget https://storage.googleapis.com/class25jan2022/share/2008.csv

--2022-02-06 06:12:09--  https://storage.googleapis.com/class25jan2022/share/2008.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.148.128, 172.217.212.128, 172.217.214.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.148.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 689413344 (657M) [text/csv]
Saving to: ‘2008.csv’


2022-02-06 06:12:13 (202 MB/s) - ‘2008.csv’ saved [689413344/689413344]



In [9]:
! wc -l ./2008.csv

7009729 ./2008.csv


In [10]:
! head -3 2008.csv

Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
2008,1,3,4,2003,1955,2211,2225,WN,335,N712SW,128,150,116,-14,8,IAD,TPA,810,4,8,0,,0,NA,NA,NA,NA,NA
2008,1,3,4,754,735,1002,1000,WN,3231,N772SW,128,145,113,2,19,IAD,TPA,810,5,10,0,,0,NA,NA,NA,NA,NA


In [11]:
raw_df = spark.read.format('csv').\
option('header','true').option('mode','DROPMALFORMED')\
.load('2008.csv')

### Pic1

In [12]:
raw_df.count()

7009728

### Pic2

In [13]:
raw_df.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: string (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: string (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: string (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: string (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: string (nullable = true)
 |-- CarrierDelay:

### Pic3

In [14]:
raw_df.select(['DepTime', 'TaxiOut', 'TaxiIn', 'DayOfWeek', 'Distance', 'ArrDelay']).describe().show()

+-------+------------------+------------------+-----------------+------------------+-----------------+-----------------+
|summary|           DepTime|           TaxiOut|           TaxiIn|         DayOfWeek|         Distance|         ArrDelay|
+-------+------------------+------------------+-----------------+------------------+-----------------+-----------------+
|  count|           7009728|           7009728|          7009728|           7009728|          7009728|          7009728|
|   mean|1333.8300461105448|16.453045177492882|6.860851704974527|3.9241815088973495|726.3870294253928| 8.16845238729114|
| stddev|478.06889486629836|11.332798654232155|4.933649371300466|1.9882589459851212|562.1018034840403|38.50193694882867|
|    min|                 1|                 0|                0|                 1|              100|               -1|
|    max|                NA|                NA|               NA|                 7|              999|               NA|
+-------+------------------+----

### Pic4

In [15]:
raw_df.groupBy('month').count().orderBy('count',ascending=False).show()

+-----+------+
|month| count|
+-----+------+
|    7|627931|
|    3|616090|
|    8|612279|
|    6|608665|
|    5|606293|
|    1|605765|
|    4|598126|
|    2|569236|
|   10|556205|
|   12|544958|
|    9|540908|
|   11|523272|
+-----+------+



### Pic5

In [16]:
from pyspark.sql import functions as F

In [17]:
raw_df.groupBy('month').agg(F.count('month'),F.avg('ArrDelay')) \
.withColumnRenamed('count(month)','numMonth') \
.withColumnRenamed('avg(ArrDelay)','avgArrDelay') \
.orderBy('numMonth',ascending=False).show()

+-----+--------+------------------+
|month|numMonth|       avgArrDelay|
+-----+--------+------------------+
|    7|  627931| 9.975049681276131|
|    3|  616090| 11.19236458018227|
|    8|  612279|  6.91091468997087|
|    6|  608665|13.266756009659792|
|    5|  606293| 5.978448290248828|
|    1|  605765|10.188855960349496|
|    4|  598126| 6.807297481094145|
|    2|  569236|13.077836997760205|
|   10|  556205|0.4154954706912698|
|   12|  544958|16.680505081496417|
|    9|  540908|0.6977328787273043|
|   11|  523272| 2.015857969430839|
+-----+--------+------------------+



# Home work 4

### แนวคิด
จากโจทย์ใช้ columns DepTime มาทำการหา period โดยดูจากข้อมูล แล้วนำมาเป็นช่วงเวลา เช่น

735 = 07.35

ซึ่งจาก ผลลัพธ์ของโจ่ทย์ที่ได้รับมาจะสนใจเพียงแค่ จำนวนชั่วโมงเลยทำการ เติม 0 เพื่อให้ split ข้อความได้เป็น 2 ตำแหน่งแรก

**ทำไมถึงต้องทำ**

ค่า min ของ Deptime = 1 ซึ่งอาจจะหมายความว่ามีค่าเป็นเลขตัวเดียวทำให้ไม่สามารถนำ จำนวนชั่วโมงออกมาได้ เลยได้ทำการเติม 0 เข้าไปเพิ่อให้ครบ 4 ตำแหน่ง เช่น

1 = 0001 เพื่อให้ตัดออกมาได้เป็น 00 ตามชั่วโมง

จากนั้นเมื่อได้ ชั่วโมงมาแล้วค่อยนำไปหา period ตามเงื่อนไข ของการ groupby

**แนวคิดแรกติดปัญหา**

ตัดตัว ชั่วโมงมา แต่มันมีค่าที่เป็น "NA" มีค่า 2 ตำแหน่ง เลยทำค่า Minute ขึ้นมาด้วย แล้วเช็คจาก Minute ที่เป็น "NA" ก่อนแล้วค่อย หา period ต่อ 

In [87]:
from pyspark.sql.functions import *

time_df = raw_df.select('DepTime')

In [89]:
add_zero = lambda x : '000'+x if(len(x) == 1) else '00'+x if(len(x) == 2) else '0'+x if(len(x) ==3) else x
split_hr = udf(lambda x: add_zero(x)[0:2])
split_min = udf(lambda x: add_zero(x)[2:])
time_df = time_df.withColumn('Hr',split_hr(col('Deptime'))).withColumn('Mn',split_min(col('Deptime')))

In [124]:
check_time = lambda x, y : 'null' if(y == 'NA') \
                    else '00.01 - 05.59' if((int(x) >= 0) and (int(x) <=5)) \
                    else '06.00 - 11.59' if((int(x) >= 6) and (int(x) <=11)) \
                    else '12.00 - 17.59' if((int(x) >= 12) and (int(x) <=17)) \
                    else '18.00 - 24.00' if((int(x) >= 18) and (int(x) <=24)) \
                    else x
time_period = udf(lambda x,y : check_time(x,y))

result_df = time_df.withColumn('DepTime', time_period(col('Hr'),col('Mn'))).select('DepTime')
# result_df.show()

In [123]:
result_df.groupBy('DepTime').count().orderBy(col('DepTime')).show()

+-------------+-------+
|      DepTime|  count|
+-------------+-------+
|00.01 - 05.59| 179949|
|06.00 - 11.59|2643673|
|12.00 - 17.59|2554672|
|18.00 - 24.00|1495188|
|         null| 136246|
+-------------+-------+

