In [11]:
from pyspark.sql import SparkSession

In [12]:
# Create the Spark session for this notebook, or reuse one that already exists.
ss = (
    SparkSession.builder
    .appName('SparkSQL')
    .getOrCreate()
)
# Bare last expression so the notebook displays the session object.
ss

In [13]:
# Read the trip CSV: the first line is treated as the header and column types
# are inferred by Spark from the data.
df = (
    ss.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/fhvhv_tripdata_2020-03.csv')
)

# Register the DataFrame as a temp view so the SQL cells can query it by name.
df.createOrReplaceTempView('mobility_data')

                                                                                

### 서브쿼리
두 번에 나누어 실행해야 하는 두 개의 쿼리를 하나로 합쳐서 실행하는 방법이 서브쿼리다.  
다만 때로는 성능 문제를 일으킬 수 있다.

- 다중행 서브쿼리: 여러 행을 반환하며, FROM 절에서 사용한다
- 단일행 서브쿼리: 한 행만 반환하며, 단일행 비교 연산자(=)와 같이 쓴다


In [14]:
# Preview the first five rows of the 'mobility_data' view.
# The SQL text deliberately has no trailing ';': spark.sql() takes exactly one
# statement, and older (2.x) Spark parsers raise ParseException on the terminator.
ss.sql('''
    SELECT
        *
    FROM mobility_data
    LIMIT 5
''').show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   null|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   null|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   null|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   null|
+-----------------+--------------------+-------------------+-------------------+

In [15]:
# Trips per pickup date, written as one flat query: the text before the first
# space of pickup_datetime is aliased as pickup_date and grouped on directly.
daily_trips_sql = '''
    SELECT
        SPLIT(pickup_datetime, ' ')[0] as pickup_date
    , COUNT(*) as trips
    
    FROM mobility_data
    
    GROUP BY pickup_date


'''
daily_trips = ss.sql(daily_trips_sql)
daily_trips.show()

                                                                                

+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-16|391518|
| 2020-03-03|697880|
| 2020-03-06|872012|
| 2020-03-26|141607|
| 2020-03-05|731165|
| 2020-03-02|648986|
| 2020-03-25|141088|
| 2020-03-20|261900|
| 2020-03-24|141686|
| 2020-03-04|707879|
| 2020-03-10|626474|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-13|660914|
| 2020-03-27|159339|
| 2020-03-22|162165|
| 2020-03-28|138456|
| 2020-03-01|784246|
| 2020-03-19|252773|
| 2020-03-09|628940|
+-----------+------+
only showing top 20 rows



In [18]:
# Flat (no subquery) trips-per-pickup-date query; only its plans are printed here.
flat_plan_df = ss.sql('''
    SELECT
        SPLIT(pickup_datetime, ' ')[0] as pickup_date
    , COUNT(*) as trips
    
    FROM mobility_data
    
    GROUP BY pickup_date


'''
)
# extended=True prints the logical plans in addition to the physical plan.
flat_plan_df.explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['SPLIT('pickup_datetime,  )[0] AS pickup_date#271, 'COUNT(1) AS trips#272]
+- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [split(pickup_datetime#173,  , -1)[0]], [split(pickup_datetime#173,  , -1)[0] AS pickup_date#271, count(1) AS trips#272L]
+- SubqueryAlias mobility_data
   +- Relation[hvfhs_license_num#171,dispatching_base_num#172,pickup_datetime#173,dropoff_datetime#174,PULocationID#175,DOLocationID#176,SR_Flag#177] csv

== Optimized Logical Plan ==
Aggregate [split(pickup_datetime#173,  , -1)[0]], [split(pickup_datetime#173,  , -1)[0] AS pickup_date#271, count(1) AS trips#272L]
+- Project [pickup_datetime#173]
   +- Relation[hvfhs_license_num#171,dispatching_base_num#172,pickup_datetime#173,dropoff_datetime#174,PULocationID#175,DOLocationID#176,SR_Flag#177] csv

== Physical Plan ==
*(2) HashAggregate(keys=[split(pickup_datetime#173,  , -1)[0]#276], fu

In [16]:
# Trips per pickup date, written with a subquery: the inner SELECT derives
# pickup_date, the outer SELECT groups and counts on it.
nested_trips_sql = '''
    SELECT 
        pickup_date, 
        COUNT(*) AS trips
    FROM
        (
        SELECT
            SPLIT(pickup_datetime, ' ')[0] as pickup_date
        
        FROM mobility_data
        )

    GROUP BY pickup_date
'''
nested_trips = ss.sql(nested_trips_sql)
nested_trips.show()

                                                                                

+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-16|391518|
| 2020-03-03|697880|
| 2020-03-06|872012|
| 2020-03-26|141607|
| 2020-03-05|731165|
| 2020-03-02|648986|
| 2020-03-25|141088|
| 2020-03-20|261900|
| 2020-03-24|141686|
| 2020-03-04|707879|
| 2020-03-10|626474|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-13|660914|
| 2020-03-27|159339|
| 2020-03-22|162165|
| 2020-03-28|138456|
| 2020-03-01|784246|
| 2020-03-19|252773|
| 2020-03-09|628940|
+-----------+------+
only showing top 20 rows



In [19]:
# Subquery form of the trips-per-pickup-date query; only its plans are printed here.
nested_plan_df = ss.sql('''
    SELECT 
        pickup_date, 
        COUNT(*) AS trips
    FROM
        (
        SELECT
            SPLIT(pickup_datetime, ' ')[0] as pickup_date
        
        FROM mobility_data
        )

    GROUP BY pickup_date
''')
# extended=True prints the logical plans in addition to the physical plan.
nested_plan_df.explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['pickup_date, 'COUNT(1) AS trips#280]
+- 'SubqueryAlias __auto_generated_subquery_name
   +- 'Project ['SPLIT('pickup_datetime,  )[0] AS pickup_date#279]
      +- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [pickup_date#279], [pickup_date#279, count(1) AS trips#280L]
+- SubqueryAlias __auto_generated_subquery_name
   +- Project [split(pickup_datetime#173,  , -1)[0] AS pickup_date#279]
      +- SubqueryAlias mobility_data
         +- Relation[hvfhs_license_num#171,dispatching_base_num#172,pickup_datetime#173,dropoff_datetime#174,PULocationID#175,DOLocationID#176,SR_Flag#177] csv

== Optimized Logical Plan ==
Aggregate [pickup_date#279], [pickup_date#279, count(1) AS trips#280L]
+- Project [split(pickup_datetime#173,  , -1)[0] AS pickup_date#279]
   +- Relation[hvfhs_license_num#171,dispatching_base_num#172,pickup_datetime#173,dropoff_datetime#174,PULocationI

## 결론 및 책에서 하고자 하는 말
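Spark SQL의 카탈리스트(Catalyst) 최적화 엔진에 의해 최적화된 물리 실행 계획이 만들어진다. 위에서 일반 쿼리와 서브쿼리의 Optimized Logical Plan이 모두 Aggregate → Project → Relation 형태로 정리된 것이 그 예다.  
실제로는 이 실행 계획이 바이트코드로 변환되어 실행된다.  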
  
그럼에도 불구하고, GPT를 사용할 때와 마찬가지로, 사람이 최적의 코드를 고민하고 개선해 나갈 줄 알아야 한다.

In [20]:
# Stop the Spark session that was created at the top of the notebook.
ss.stop()