In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

### 미국 항공사 데이터 예제

**데이터 불러오기**

In [8]:
# SparkSession 생성
spark = (SparkSession
         .builder
         .appName("SparkSQLExampleApp")
         .getOrCreate())

In [11]:
# 데이터 불러오기
schema = "date STRING, delay INT, distance INT, origin STRING, destination STRING"

df = (spark.read.format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load("departuredelays.csv", schema=schema))

df.createOrReplaceTempView("us_delay_flights_tbl")

In [12]:
df.show(5)

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|01020600|   -8|     369|   ABE|        DTW|
|01021245|   -2|     602|   ABE|        ATL|
|01020605|   -4|     602|   ABE|        ATL|
|01031245|   -4|     602|   ABE|        ATL|
+--------+-----+--------+------+-----------+
only showing top 5 rows



In [13]:
# 비행거리가 1000마일 이상인 항공편
spark.sql("""SELECT distance, origin, destination
             FROM us_delay_flights_tbl WHERE distance > 1000
             ORDER BY distance DESC""").show(10)



+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



                                                                                

In [14]:
# 샌프란시스코(SFO)와 시카고(ORD)간 2시간 이상 지연된 항공편
spark.sql("""SELECT date, delay, origin, destination
             FROM us_delay_flights_tbl
             WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD'
             ORDER by delay DESC""").show(10)



+--------+-----+------+-----------+
|    date|delay|origin|destination|
+--------+-----+------+-----------+
|02190925| 1638|   SFO|        ORD|
|01031755|  396|   SFO|        ORD|
|01022330|  326|   SFO|        ORD|
|01051205|  320|   SFO|        ORD|
|01190925|  297|   SFO|        ORD|
|02171115|  296|   SFO|        ORD|
|01071040|  279|   SFO|        ORD|
|01051550|  274|   SFO|        ORD|
|03120730|  266|   SFO|        ORD|
|01261104|  258|   SFO|        ORD|
+--------+-----+------+-----------+
only showing top 10 rows



                                                                                

In [34]:
# date data type 변경
from pyspark.sql.functions import *

add_df = (df.withColumn("newdate", to_timestamp(col("date"), "MMddhhmm"))
             .withColumn("month", month(col("newdate")))
             .withColumn("day", dayofmonth(col("newdate"))))

add_df.createOrReplaceTempView("us_delay_flights_tbl_add_ver")

In [36]:
add_df.show(5)

+--------+-----+--------+------+-----------+-------------------+-----+---+
|    date|delay|distance|origin|destination|            newdate|month|day|
+--------+-----+--------+------+-----------+-------------------+-----+---+
|01011245|    6|     602|   ABE|        ATL|1970-01-01 00:45:00|    1|  1|
|01020600|   -8|     369|   ABE|        DTW|1970-01-02 06:00:00|    1|  2|
|01021245|   -2|     602|   ABE|        ATL|1970-01-02 00:45:00|    1|  2|
|01020605|   -4|     602|   ABE|        ATL|1970-01-02 06:05:00|    1|  2|
|01031245|   -4|     602|   ABE|        ATL|1970-01-03 00:45:00|    1|  3|
+--------+-----+--------+------+-----------+-------------------+-----+---+
only showing top 5 rows



In [49]:
spark.sql("""SELECT month, day, count(*), avg(delay)
             FROM us_delay_flights_tbl_add_ver
             WHERE ORIGIN = 'SFO' AND DESTINATION = 'ORD'
             GROUP BY month, day
             ORDER by month, day DESC""").show()



+-----+----+--------+-------------------+
|month| day|count(1)|         avg(delay)|
+-----+----+--------+-------------------+
| null|null|     741| 17.994601889338732|
|    1|  31|      13| 2.4615384615384617|
|    1|  30|      13|  36.15384615384615|
|    1|  29|      12|              -1.75|
|    1|  28|      12|  4.416666666666667|
|    1|  27|      13|  4.230769230769231|
|    1|  26|      11| 20.636363636363637|
|    1|  25|       8|              -3.75|
|    1|  24|      13|  18.46153846153846|
|    1|  23|      13| 3.1538461538461537|
|    1|  22|      12|              19.25|
|    1|  21|      12|              13.25|
|    1|  20|      13|                6.0|
|    1|  19|      11|  23.90909090909091|
|    1|  18|       8|              5.875|
|    1|  17|      13| 0.6923076923076923|
|    1|  16|      13| 15.461538461538462|
|    1|  15|      12|0.08333333333333333|
|    1|  14|      12|-1.5833333333333333|
|    1|  13|      13|  2.230769230769231|
+-----+----+--------+-------------

                                                                                

In [7]:
spark.stop()