In [101]:
from pyspark.sql import SparkSession
import os

import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter 환경에서 대화형 그래프 활성화
%matplotlib widget

In [102]:
ss = SparkSession.builder\
.appName('YellowTaxi')\
.getOrCreate()

ss

In [103]:
trip_files = 'trips/*.csv'
zone_file = 'trips/code/lookup.csv'
directory = os.path.join(os.getcwd(), 'data')

In [126]:
df = ss.read.csv(f'file:///{directory}/{trip_files}', inferSchema = True, header = True)
df_zone = ss.read.csv(f'file:///{directory}/{zone_file}', inferSchema = True, header = True)
df, df_zone

                                                                                

(DataFrame[VendorID: int, tpep_pickup_datetime: string, tpep_dropoff_datetime: string, passenger_count: int, trip_distance: double, RatecodeID: int, store_and_fwd_flag: string, PULocationID: int, DOLocationID: int, payment_type: int, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double],
 DataFrame[LocationID: int, Borough: string, Zone: string, service_zone: string])

In [105]:
df.show(1)
df.printSchema()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|       2| 2021-03-01 00:22:02|  2021-03-01 00:23:22|              1|          0.0|         1|                 N|         264|         264|           2|        3.0|  0.5|    0.5|       0.0|         0.0|                  0.3

## 방향
1. 쿼리 작성
2. 데이터 프레임으로 구현

## 전처리

### 임시 테이블 만들기

In [127]:
df.createOrReplaceTempView('table1')
df_zone.createOrReplaceTempView('table2')

### 음수는 뭐지

In [139]:
ss.sql('''
    SELECT
        payment_type
        , min(tip_amount)
        , max(tip_amount)
        , min(total_amount)
        , max(total_amount)
    FROM table1 as A
    where tip_amount < 0
        or total_amount < 0

    GROUP BY payment_type
    
''').show()

                                                                                

+------------+---------------+---------------+-----------------+-----------------+
|payment_type|min(tip_amount)|max(tip_amount)|min(total_amount)|max(total_amount)|
+------------+---------------+---------------+-----------------+-----------------+
|        null|          -6.33|          15.13|           -95.88|            50.02|
|           1|         -83.95|            0.0|          -133.35|             -3.3|
|           3|         -111.1|          32.06|           -647.8|            -0.31|
|           4|        -333.32|          40.06|          -554.05|             -0.3|
|           2|            0.0|          31.26|           -634.8|            -0.35|
+------------+---------------+---------------+-----------------+-----------------+



### 1. 운행 거리(trip_distance)와 요금(total_amount)의 상관관계 분석

In [107]:
ss.sql('''
    SELECT
        
        FLOOR(trip_distance/10) * 10 as range_distance
        , AVG(trip_distance)
        , ROUND(AVG(total_amount), 2) as avg_total_amount_by_range
        
    FROM table1
    
    GROUP BY range_distance    

    ORDER BY range_distance
''').show()



+--------------+------------------+-------------------------+
|range_distance|avg(trip_distance)|avg_total_amount_by_range|
+--------------+------------------+-------------------------+
|             0| 2.249990081277878|                    16.28|
|            10|14.511748083268245|                    55.74|
|            20| 22.35556516479053|                    73.88|
|            30| 34.01497395485508|                   117.13|
|            40| 44.20758012820514|                   134.92|
|            50|54.312714592274666|                   150.59|
|            60| 64.59808888888888|                   118.68|
|            70|  74.7025724637681|                   127.48|
|            80| 84.21412322274884|                   125.06|
|            90|  94.9186013986014|                    97.53|
|           100|105.33242990654206|                    82.72|
|           110|115.17875000000001|                    76.69|
|           120|124.55017857142857|                    45.14|
|       

                                                                                

In [108]:
df1 = ss.sql('''
    SELECT
        
        FLOOR(trip_distance/10) * 10 as range_distance
        , AVG(trip_distance)
        , ROUND(AVG(total_amount), 2) as avg_total_amount_by_range
        
    FROM table1
    
    GROUP BY range_distance    

    ORDER BY range_distance
''').toPandas()

                                                                                

In [109]:
df1

Unnamed: 0,range_distance,avg(trip_distance),avg_total_amount_by_range
0,0,2.249990,16.28
1,10,14.511748,55.74
2,20,22.355565,73.88
3,30,34.014974,117.13
4,40,44.207580,134.92
...,...,...,...
747,300930,300936.530000,35.30
748,305390,305397.570000,36.67
749,327490,327491.240000,19.88
750,332530,332532.180000,38.86


### 2. 피크 시간대 요금 분석

In [114]:
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [123]:
ss.sql('''

    SELECT
        HOUR(tpep_pickup_datetime) as time_hour
        , COUNT(*) as cnt_HOUR
        , AVG(total_amount) as AVG_total_amount_by_HOUR
    
    FROM table1
    
    GROUP BY time_hour

    ORDER BY time_hour

''').show()

                                                                                

+---------+--------+------------------------+
|time_hour|cnt_HOUR|AVG_total_amount_by_HOUR|
+---------+--------+------------------------+
|        0|  273553|      21.428823335879855|
|        1|  160424|      20.843909639453052|
|        2|   97218|       20.36273066715926|
|        3|   57806|      21.065730892986632|
|        4|   45410|       25.54333032371765|
|        5|   82073|      26.551492208156205|
|        6|  241020|      21.328937308104027|
|        7|  438938|       18.83996026773829|
|        8|  641530|      17.790024067469492|
|        9|  712912|      17.586356997785252|
|       10|  794621|      17.535767101557763|
|       11|  877952|      17.466696368376446|
|       12|  970018|      17.963276970129126|
|       13| 1001522|       18.14557307778716|
|       14| 1084111|      18.177726579671038|
|       15| 1091871|        18.6318919176479|
|       16| 1043498|      19.708538415994695|
|       17| 1085226|      19.291921083731793|
|       18| 1087217|      18.58196

### 3. 지불 유형별 요금, 팁 분석

In [132]:
ss.sql('''
    SELECT
        payment_type
        , AVG(tip_amount)
        , AVG(total_amount)
    FROM table1 as A

    GROUP BY payment_type
    ORDER BY payment_type
    
''').show()

                                                                                

+------------+--------------------+-------------------+
|payment_type|     avg(tip_amount)|  avg(total_amount)|
+------------+--------------------+-------------------+
|        null|  1.6014479849597305|  31.85317404211856|
|           1|  2.8802551464734534|  18.89057563281757|
|           3|-0.01341663187366456| 18.068943070461756|
|           5|                 0.0|               17.8|
|           4|0.005909593724859213|-2.7711340171627312|
|           2|3.441382791272625...|  15.42127102120272|
+------------+--------------------+-------------------+



In [None]:
ss.sql('''
select * from 

### 4. 승차 지역 / 하차 지역별 평균거리, 요금

In [172]:
ss.sql('''
    SELECT
        A.PULocationID
        , MAX(B.Zone) as PU_Zone
        , A.DOLocationID
        , MAX(C.Zone) as DO_Zone
        , ROUND(AVG(A.trip_distance), 2) as AVG_trip_distance
        , ROUND(AVG(A.total_amount), 2) as AVG_total_amount
        , COUNT(*) as cnt -- 건수 체크
    FROM table1 as A

    INNER JOIN table2 as B
    ON A.PULocationID = B.LocationID

    INNER JOIN table2 as C
    on A.DOLocationID = C.LocationID

    GROUP BY A.PULocationID, A.DOLocationID
''').show()



+------------+--------------------+------------+--------------------+-----------------+----------------+---+
|PULocationID|             PU_Zone|DOLocationID|             DO_Zone|AVG_trip_distance|AVG_total_amount|cnt|
+------------+--------------------+------------+--------------------+-----------------+----------------+---+
|           3|Allerton/Pelham G...|          57|              Corona|              4.8|           39.81|  2|
|           3|Allerton/Pelham G...|          89|Flatbush/Ditmas Park|            23.01|           74.73|  2|
|           4|       Alphabet City|         185|      Pelham Parkway|            10.91|           42.06| 11|
|           7|             Astoria|          55|        Coney Island|            14.82|           59.38| 23|
|           7|             Astoria|         132|         JFK Airport|            14.19|           49.42| 77|
|          10|        Baisley Park|          85|             Erasmus|            11.19|           49.95| 29|
|          10|     

                                                                                

In [174]:
ss.sql('''
    SELECT
        A.PULocationID
        , MAX(B.Zone) as PU_Zone
        , A.DOLocationID
        , MAX(C.Zone) as DO_Zone
        , ROUND(AVG(A.trip_distance), 2) as AVG_trip_distance
        , ROUND(AVG(A.total_amount), 2) as AVG_total_amount
        , COUNT(*) as cnt -- 건수 체크
    FROM table1 as A

    INNER JOIN table2 as B
    ON A.PULocationID = B.LocationID

    INNER JOIN table2 as C
    on A.DOLocationID = C.LocationID

    GROUP BY A.PULocationID, A.DOLocationID
''').toPandas()

                                                                                

Unnamed: 0,PULocationID,PU_Zone,DOLocationID,DO_Zone,AVG_trip_distance,AVG_total_amount,cnt
0,3,Allerton/Pelham Gardens,57,Corona,4.80,39.81,2
1,3,Allerton/Pelham Gardens,89,Flatbush/Ditmas Park,23.01,74.73,2
2,4,Alphabet City,185,Pelham Parkway,10.91,42.06,11
3,7,Astoria,55,Coney Island,14.82,59.38,23
4,7,Astoria,132,JFK Airport,14.19,49.42,77
...,...,...,...,...,...,...,...
46178,264,NV,191,Queens Village,11.14,39.56,31
46179,265,,94,Fordham South,7.15,34.97,117
46180,265,,116,Hamilton Heights,7.07,36.21,476
46181,265,,133,Kensington,6.22,35.51,154


### 5. 팁의 비율에 따른 거리, 여행 건수 서비스 관련 분석

In [181]:
df_zone.printSchema()
df.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [190]:
ss.sql('''
        SELECT
        
            FLOOR(FLOOR(((tip_amount / total_amount) * 100))/10) * 10 as tip_ratio
            
        FROM table1
        where FLOOR(FLOOR(((tip_amount / total_amount) * 100))/10) * 10 < 0

''').show()

+---------+
|tip_ratio|
+---------+
|      -30|
|      -40|
|      -30|
|      -30|
|      -30|
|      -20|
|      -30|
|      -30|
|      -30|
|      -20|
|      -40|
|      -30|
|      -30|
|      -30|
|      -30|
|      -30|
|      -20|
|      -30|
|      -40|
|      -30|
+---------+
only showing top 20 rows



                                                                                

In [187]:
ss.sql('''
        SELECT
        
            FLOOR(FLOOR(((tip_amount / total_amount) * 100))/10) * 10 as tip_ratio
            , AVG(trip_distance) as AVG_trip_distance
            , AVG(tip_amount) as AVG_tip_amount
            , AVG(total_amount) as AVG_total_amount
            
        FROM table1

        GROUP BY tip_ratio

''').show()

                                                                                

+---------+--------------------+--------------------+-------------------+
|tip_ratio|   AVG_trip_distance|      AVG_tip_amount|   AVG_total_amount|
+---------+--------------------+--------------------+-------------------+
|        0|  11.098819481423762|  0.3377800550066132|  18.48767277735558|
|      -30|   2.932072243346008|   3.546254752851711|-14.252034220532323|
|      -60|  1.7759999999999998|                 6.2|            -12.018|
|      -90|                 0.0|               20.02|             -22.78|
|       50|   7.006356766968022|   14.73387801916071|  27.51974092565084|
|     -440|                1.71|               10.01|              -2.29|
|      -50|   3.112857142857143|   5.046904761904762|-11.773333333333333|
|    -5670|                 0.0|                17.0|               -0.3|
|     null|  0.9797847600197922|0.003537852548243444|                0.0|
|     -110|               2.715|                 7.5| -7.300000000000001|
|       10|   3.985048523294628|   3.0

In [191]:
ss.stop()