# New York City Taxi Analysis
In this example we show some analysis for yellow and green taxi trips originating in New York City in 2019. 

Orignal example can be found [here](https://github.com/toddwschneider/nyc-taxi-data)

The queries are originally in SQL format and here they are implemented using the pandas API. Dataset size is ~8GB.

In [1]:
import bodo
import pandas as pd
import time

<a id="loading_data"></a>
## Loading data

In this section, we load the yellow and green taxi trips and weather in central park area in 2019 in pandas DataFrame.

In [2]:
@bodo.jit(spawn=True, cache=True)
def get_trips():
    start = time.time()
    yellow_taxi = pd.read_csv(
        "s3://bodo-example-data/nyc-taxi/yellow_tripdata_2019.csv",
        usecols=[0, 1, 4, 7, 8],
        parse_dates=["tpep_pickup_datetime"],
    )
    
    yellow_taxi["cab_type_id"] = 0

    green_taxi = pd.read_csv(
        "s3://bodo-example-data/nyc-taxi/green_tripdata_2019.csv",
        usecols=[0, 1, 5, 6, 8],
        parse_dates=["lpep_pickup_datetime"],
    )
    green_taxi["cab_type_id"] = 1

    # Rename yellow_taxi column to match green_taxi
    yellow_taxi = yellow_taxi.rename(
        columns={"tpep_pickup_datetime": "lpep_pickup_datetime"}, copy=False
    )

    # concat
    trips = pd.concat([green_taxi, yellow_taxi])
    end = time.time()
    print("Time: ", end - start)
    return trips

trips = get_trips()
print(trips.head())

Time:  16.84070279799994
   VendorID lpep_pickup_datetime  PULocationID  DOLocationID  trip_distance  \
0         2  2018-12-21 15:17:29           264           264            0.0   
1         2  2019-01-01 00:10:16            97            49           0.86   
2         2  2019-01-01 00:27:11            49           189           0.66   
3         2  2019-01-01 00:46:20           189            17           2.68   
4         2  2019-01-01 00:19:06            82           258           4.53   

   cab_type_id  
0            1  
1            1  
2            1  
3            1  
4            1  


In [3]:
@bodo.jit(spawn=True, cache=True)
def get_cp_weather():
    start = time.time()
    central_park_weather_observations = pd.read_csv(
        "s3://bodo-example-data/nyc-taxi/central_park_weather.csv", 
        parse_dates=["date"]
    )
    central_park_weather_observations["date"] = central_park_weather_observations[
        "date"
    ].dt.date
    end = time.time()
    print("Time: ", end - start)
    return central_park_weather_observations

central_park_weather_observations = get_cp_weather()
print(central_park_weather_observations.head())

Time:  0.1697279620000245
    station_id                 station_name        date  average_wind_speed  \
0  USW00094728  NY CITY CENTRAL PARK, NY US  2009-01-01               11.18   
1  USW00094728  NY CITY CENTRAL PARK, NY US  2009-01-02                6.26   
2  USW00094728  NY CITY CENTRAL PARK, NY US  2009-01-03               10.07   
3  USW00094728  NY CITY CENTRAL PARK, NY US  2009-01-04                7.61   
4  USW00094728  NY CITY CENTRAL PARK, NY US  2009-01-05                6.93   

   precipitation  snowfall  snow_depth  max_temperature  min_temperature  
0            0.0       0.0         0.0               26               15  
1            0.0       0.0         0.0               34               23  
2            0.0       0.0         0.0               38               29  
3            0.0       0.0         0.0               42               25  
4            0.0       0.0         0.0               43               38  


## Query Definitions

This section includes some of the queries using Python (Pandas)

### Q1: Display pickups by geography
This query reports the number of trips for each pickup location.

In [4]:
@bodo.jit(spawn=True, cache=True)
def get_daily_pickups(trips):
    start = time.time() 
    daily_pickups_taxi = trips.loc[:, ["cab_type_id", "PULocationID", "lpep_pickup_datetime"]]
    daily_pickups_taxi["pickup_date"] = daily_pickups_taxi["lpep_pickup_datetime"].dt.date
    daily_pickups_taxi = daily_pickups_taxi.groupby(                          
        ["cab_type_id", "PULocationID", "pickup_date"], as_index=False
    )["lpep_pickup_datetime"].count()                            
    daily_pickups_taxi = daily_pickups_taxi.rename(              
        columns={
            "PULocationID": "pickup_location_id",
            "pickup_date": "date",
            "lpep_pickup_datetime": "trips",
        },
        copy=False,
    )
    daily_pickups_taxi = daily_pickups_taxi.sort_values(
        by=["cab_type_id", "pickup_location_id", "date", "trips"],
        ascending=[True, True, True, False]
    )

    end = time.time()
    print("Time: ", end - start)
    return daily_pickups_taxi

daily_pickups = get_daily_pickups(trips)
print(daily_pickups.head())

Time:  0.44367357700002685
        cab_type_id  pickup_location_id        date  trips
154367            0                   1  2019-01-01     22
130275            0                   1  2019-01-02     26
106377            0                   1  2019-01-03     11
17150             0                   1  2019-01-04     17
116471            0                   1  2019-01-05     22


### Q2: JFK Hourly Pickups
This query reports the number of hourly pickups at JFK airport for each car type.

In [5]:
@bodo.jit(spawn=True, cache=True)
def get_jfk_hourly_pickups(trips):
    start = time.time()  
    jfk_hourly = trips.loc[:, ["cab_type_id", "PULocationID", "lpep_pickup_datetime"]]    
    jfk_hourly["pickup_hour"] = jfk_hourly["lpep_pickup_datetime"].dt.hour
    jfk_hourly = jfk_hourly.loc[jfk_hourly["PULocationID"] == 132]
    jfk_hourly = jfk_hourly.groupby(
        ["cab_type_id", "pickup_hour", "PULocationID"], as_index=False
    )["lpep_pickup_datetime"].count()
    jfk_hourly = jfk_hourly.rename(
        columns={
            "lpep_pickup_datetime": "trips",
            "PULocationID": "pickup_location_id",
        },
        copy=False,
    )    
    jfk_hourly = jfk_hourly.sort_values(
        by=["cab_type_id", "pickup_hour", "pickup_location_id", "trips"],
        ascending=[True, True, True, False]
    )

    end = time.time()
    print("Time: ", end - start)
    return jfk_hourly

jfk_hourly = get_jfk_hourly_pickups(trips)
print(jfk_hourly.head())    

Time:  0.3872183109999696
    cab_type_id  pickup_hour  pickup_location_id   trips
17            0            0                 132  116751
20            0            1                 132   61826
13            0            2                 132   22066
24            0            3                 132   11652
5             0            4                 132   16670


### Q3: Weekday trips
This query reports how many trips are done during weekdays.

In [6]:
@bodo.jit(spawn=True, cache=True)
def get_weekday_trips(trips):
    start = time.time()
    trips_weekdays = trips.loc[:, ["cab_type_id", "lpep_pickup_datetime", "PULocationID", "DOLocationID" ]]    
    trips_weekdays["pickup_dow"] = trips_weekdays["lpep_pickup_datetime"].dt.dayofweek
    trips_weekdays = trips_weekdays[
        (trips_weekdays["cab_type_id"].isin([0, 1]))
        & (trips_weekdays["lpep_pickup_datetime"] >= pd.to_datetime("2018-07-01"))
        & (trips_weekdays["lpep_pickup_datetime"] < pd.to_datetime("2020-07-01"))
        & (trips_weekdays["pickup_dow"].isin([1, 2, 3, 4, 5]))
    ]
    trips_weekdays = trips_weekdays.groupby(
        ["PULocationID", "DOLocationID"], as_index=False
    ).count()
    trips_weekdays = trips_weekdays.loc[:,
        ["PULocationID", "DOLocationID", "lpep_pickup_datetime"]
    ]
    trips_weekdays = trips_weekdays.rename(
        columns={
            "PULocationID": "pickup_location_id",
            "DOLocationID": "dropoff_location_id",
            "lpep_pickup_datetime": "trips"
        },
        copy=False,        
    )
    trips_weekdays = trips_weekdays.sort_values(by=["pickup_location_id", "dropoff_location_id", "trips"],
                                                ascending=[True, True, False])
    end = time.time()
    print("Time: ", end - start)
    return trips_weekdays

wd_trips = get_weekday_trips(trips)
print(wd_trips.head())

Time:  0.4398949880001055
       pickup_location_id  dropoff_location_id  trips
28577                   1                    1   5003
21536                   1                    4      1
5639                    1                    6      1
18030                   1                   20      1
28315                   1                   21      1


### Q4: Monthly Trips and Weather in Central Park
This query reports monthly travel times; the average distance, number of trips over and its corresponding weather.


In [7]:
@bodo.jit(spawn=True, cache=True)
def get_monthly_travels_weather(trips, central_park_weather_observations):
    start = time.time()
    trips = trips.loc[:, ["VendorID", "lpep_pickup_datetime", "PULocationID", "DOLocationID", "trip_distance" ]]    
    trips["date"] = trips["lpep_pickup_datetime"].dt.date
    trips["month"] = trips["lpep_pickup_datetime"].dt.month
    trips["hour"] = trips["lpep_pickup_datetime"].dt.hour
    trips["weekday"] = trips["lpep_pickup_datetime"].dt.dayofweek
    monthly_trips_weather = trips.merge(
        central_park_weather_observations, on="date", how="inner"
    )
    monthly_trips_weather = monthly_trips_weather[
        (monthly_trips_weather["weekday"].isin([1, 2, 3, 4, 5]))
        & (monthly_trips_weather["precipitation"] > 0.1)
    ]
    #0: morning, 1:midday, 2:afternoon, 3:evening, 4:other
    monthly_trips_weather["time_bucket"] = monthly_trips_weather.hour.replace({8: 0, 9:0, 10:0, 11:1, 12:1, 13:1, 14:1, 15:1, 16:2, 17:2, 18:2, 18:2, 19:3, 20:3, 21:3, 22:4, 23:4, 0:4, 1:4, 2:4, 3:4, 4:4, 5:4, 6:4, 7:4 })
    monthly_trips_weather = monthly_trips_weather.groupby(
        [
            "PULocationID",
            "DOLocationID",
            "month",
            "weekday",
            "precipitation",
            "time_bucket",
        ],
        as_index=False
    ).agg({"VendorID": "count", "trip_distance": "mean"})
    monthly_trips_weather = monthly_trips_weather.sort_values(
        by=[
            "PULocationID",
            "DOLocationID",
            "month",
            "weekday",
            "precipitation",
            "time_bucket",
            "VendorID",
        ],
        ascending=[True, True, True, True, True, True, False]
    )
    monthly_trips_weather = monthly_trips_weather.rename(
        columns={
            "VendorID": "trips",
            "trip_distance": "avg_distance",
            "precipitation": "date_with_precipitation",
        },
        copy=False
    )
    end = time.time()
    print("Time: ", end - start)
    return monthly_trips_weather

monthly_trips_weather = get_monthly_travels_weather(trips, central_park_weather_observations)
print(monthly_trips_weather.head())

Time:  1.5221514489999208
         PULocationID  DOLocationID  month  weekday  date_with_precipitation  \
1787189             1             1      1        1                     0.17   
1368222             1             1      1        1                     0.17   
1560639             1             1      1        1                     0.23   
421163              1             1      1        1                     0.23   
914969              1             1      1        1                     0.23   

         time_bucket  trips  avg_distance  
1787189            1      7      2.671429  
1368222            2      1           0.0  
1560639            0      1          34.0  
421163             1      8       0.18875  
914969             2      3           4.6  
