# Analysis of NYC Taxi Cab Data
## Import libraries and connect to database

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import os
import datetime as dt
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# set working directory up a level from this file
os.chdir('..')

# Connect to database:
con = sqlite3.connect("NYC-Taxi.db")

## Extract observations and clean data

The analysis will focus on sample of a spedified number of observations of green and yellow cab data. Data from each of these sources will be stacked together, features created, and then finally a train and test set created.

In [2]:
# need to set seed by randomly sampling indices in python then pass to sql

# specify number of rows with pull variable 
pull = "5000000"
# Store sample of green and yellow cab data into dataframes:
df1 = pd.read_sql(f"SELECT * FROM green_cabs ORDER BY random() LIMIT" + " " +  pull, con=con)
df2 = pd.read_sql(f"SELECT * from yellow_cabs ORDER BY random() LIMIT"+ " " + pull, con=con)

# Add labels for green and yellow cabs and rename pickup/dropoff datetime columns:
df1 = df1.rename(columns={"lpep_pickup_datetime": "pickup_datetime", "lpep_dropoff_datetime":"dropoff_datetime"})
df2 = df2.rename(columns={"tpep_pickup_datetime": "pickup_datetime", "tpep_dropoff_datetime":"dropoff_datetime"})
df1['cab'] = "green"
df2['cab'] = "yellow"

In [3]:
# retain only columns that are in both datasets
cols_to_keep = df1.columns.intersection(df2.columns)
df1 = df1[cols_to_keep]
df2 = df2[cols_to_keep]
del cols_to_keep

# Combine into one data frame:
df = pd.concat([df1, df2], sort=False)
del df1
del df2

# Change format to datetime where necessary:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

df.head()

Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,congestion_surcharge,Source_file,cab
0,1.0,2019-01-01 22:03:39,2019-01-01 22:10:50,N,1.0,42,127,4.0,3.1,11.0,0.5,0.5,0.0,0.0,0.3,12.3,2.0,,Data/green_tripdata_2019-01.csv,green
1,,2019-07-17 13:00:00,2019-07-17 13:55:00,,,61,117,,14.57,43.66,2.75,0.5,0.0,2.29,0.0,49.2,,,Data/green_tripdata_2019-07.csv,green
2,2.0,2019-12-23 15:39:36,2019-12-23 15:48:03,N,1.0,7,260,1.0,1.56,8.0,0.0,0.5,0.0,0.0,0.3,8.8,2.0,0.0,Data/green_tripdata_2019-12.csv,green
3,2.0,2019-07-14 01:24:36,2019-07-14 01:49:58,N,1.0,25,142,1.0,8.43,27.0,0.5,0.5,6.21,0.0,0.3,37.26,1.0,2.75,Data/green_tripdata_2019-07.csv,green
4,2.0,2019-02-19 11:49:32,2019-02-19 12:07:04,N,1.0,197,19,1.0,9.67,28.0,0.0,0.5,0.0,0.0,0.3,28.8,1.0,0.0,Data/green_tripdata_2019-02.csv,green


In [4]:
df.shape

(10000000, 20)

In [5]:
df.describe()

Unnamed: 0,VendorID,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,congestion_surcharge
count,9642746.0,9642746.0,10000000.0,10000000.0,9642746.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,9999998.0,10000000.0,9642746.0,8917880.0
mean,1.740138,1.122003,135.8082,145.2809,1.441076,3.156042,13.85901,0.8119563,0.4885956,1.587545,0.3180196,0.283815,18.16343,1.362231,1.344383
std,0.449762,0.8240927,74.16501,75.17733,1.107262,25.86621,13.14328,1.1017,0.08397701,2.514358,1.588999,0.071011,15.14469,0.505183,1.269028
min,1.0,1.0,1.0,1.0,0.0,-16183.31,-890.0,-39.2,-0.5,-98.76,-22.0,-0.3,-890.3,1.0,-2.75
25%,1.0,1.0,74.0,75.0,1.0,1.0,6.5,0.0,0.5,0.0,0.0,0.3,9.8,1.0,0.0
50%,2.0,1.0,137.0,145.0,1.0,1.78,10.0,0.5,0.5,1.0,0.0,0.3,13.8,1.0,2.5
75%,2.0,1.0,196.0,226.0,1.0,3.5,16.0,1.0,0.5,2.36,0.0,0.3,20.8,2.0,2.5
max,4.0,99.0,265.0,265.0,9.0,77843.76,9434.0,87.56,40.5,787.25,935.5,0.44,9435.8,5.0,2.75


### Remove outliers

In [6]:
df.quantile(q=np.array(range(0,11))*0.1)

Unnamed: 0,VendorID,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,congestion_surcharge
0.0,1.0,1.0,1.0,1.0,0.0,-16183.31,-890.0,-39.2,-0.5,-98.76,-22.0,-0.3,-890.3,1.0,-2.75
0.1,1.0,1.0,41.0,42.0,1.0,0.61,5.0,0.0,0.5,0.0,0.0,0.3,7.3,1.0,0.0
0.2,1.0,1.0,66.0,69.0,1.0,0.9,6.0,0.0,0.5,0.0,0.0,0.3,9.3,1.0,0.0
0.3,2.0,1.0,79.0,90.0,1.0,1.13,7.0,0.0,0.5,0.0,0.0,0.3,10.56,1.0,0.0
0.4,2.0,1.0,100.0,129.0,1.0,1.41,8.5,0.0,0.5,0.0,0.0,0.3,12.25,1.0,0.0
0.5,2.0,1.0,137.0,145.0,1.0,1.78,10.0,0.5,0.5,1.0,0.0,0.3,13.8,1.0,2.5
0.6,2.0,1.0,161.0,164.0,1.0,2.25,11.5,0.5,0.5,1.62,0.0,0.3,15.95,1.0,2.5
0.7,2.0,1.0,170.0,196.0,1.0,2.97,14.5,1.0,0.5,2.06,0.0,0.3,18.82,2.0,2.5
0.8,2.0,1.0,230.0,233.0,2.0,4.26,19.0,1.0,0.5,2.76,0.0,0.3,23.76,2.0,2.5
0.9,2.0,1.0,239.0,239.0,2.0,7.44,28.0,2.75,0.5,3.99,0.0,0.3,34.05,2.0,2.5


In [7]:
df.shape

(10000000, 20)

Based on the above quantiles, the following filters should be applied:
- trip_distance > 0
- passenget_count > 0
- fare_amount > 0
- extra >= 0
- mta_tax >= 0
- tip_amount >= 0
- tolls_amount >= 0
- improvement_surcharge >= 0
- total_amount > 0
- congestion_surcharge >= 0

The upper limits should also be capped at the 99.9th percentile for:
- trip_distance
- fare_amount
- trip_time

Additionally, we only want to look at credit transactions because cash transactions are less likely to have a tip registered:
- payment_type == 1

In [9]:
indices = (df['trip_distance'] > 0) &\
    (df['passenger_count'] > 0) &\
    (df['fare_amount'] > 0) &\
    (df['extra'] >= 0) &\
    (df['mta_tax'] >= 0) &\
    (df['tip_amount'] >= 0) &\
    (df['tolls_amount'] >= 0) &\
    (df['improvement_surcharge'] >= 0) &\
    (df['total_amount'] > 0) &\
    (df['congestion_surcharge'] >= 0) &\
    (df['payment_type'] == 1) &\
    (df['trip_distance'] <= df['trip_distance'].quantile(.999)) &\
    (df['fare_amount'] <= df['fare_amount'].quantile(.999))

df_cleaned = df[indices]
del indices

df_cleaned.shape

(5607244, 20)

### Check for NAs

In [10]:
df_cleaned.isna().sum()

VendorID                 0
pickup_datetime          0
dropoff_datetime         0
store_and_fwd_flag       0
RatecodeID               0
PULocationID             0
DOLocationID             0
passenger_count          0
trip_distance            0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
payment_type             0
congestion_surcharge     0
Source_file              0
cab                      0
dtype: int64

In [11]:
df = df_cleaned
del df_cleaned

## Feature engineering

### Add in borough information

In [12]:
# Store Pickup location borough names and merge to main data frame:
df_location = pd.read_csv("Data/taxi_zones.csv")
df_location['DOLocationID'] = df_location['LocationID']
df_location = df_location.rename(columns = {'LocationID':'PULocationID'})

# Add borough and Zone name:
df = df.merge(df_location[['PULocationID', 'borough', 'zone']], on = "PULocationID", how = 'left')

### Add weather information

In [13]:
# Store Weather informaiton and merge to main data frame

df_weather = pd.read_csv("Data/CP.weather.df.csv")

# convert Date to datetime
df_weather['Date'] = pd.to_datetime(df_weather['Date'])
df_weather = df_weather.rename(columns={"Date":"date"})

df['date'] = pd.to_datetime(df['pickup_datetime'].dt.date)

# merge with pickup date
df = df.merge(df_weather, how= 'left')


In [14]:
# Summary stats and structure of data:
df.head()

Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,borough,zone,date,Precipitation,Snow.depth,Snowfall,Max.temp,Min.temp,Avg.wind.speed,Gust.speed
0,2.0,2019-07-14 01:24:36,2019-07-14 01:49:58,N,1.0,25,142,1.0,8.43,27.0,...,Brooklyn,Boerum Hill,2019-07-14,0.0,0.0,0.0,89.0,76.0,4.9,13.0
1,2.0,2019-02-19 11:49:32,2019-02-19 12:07:04,N,1.0,197,19,1.0,9.67,28.0,...,Queens,Richmond Hill,2019-02-19,0.0,0.0,0.0,36.0,23.0,,
2,2.0,2019-03-07 16:31:49,2019-03-07 16:44:39,N,1.0,130,132,1.0,5.21,16.0,...,Queens,Jamaica,2019-03-07,0.0,1.2,0.0,32.0,18.0,,
3,1.0,2019-07-02 21:14:37,2019-07-02 21:38:04,N,1.0,116,3,1.0,9.8,30.0,...,Manhattan,Hamilton Heights,2019-07-02,0.02,0.0,0.0,85.0,71.0,2.5,8.9
4,2.0,2019-05-15 00:22:26,2019-05-15 00:27:13,N,1.0,145,226,1.0,1.36,6.0,...,Queens,Long Island City/Hunters Point,2019-05-15,0.01,0.0,0.0,69.0,44.0,3.8,10.1


### Add the following variables

- trip time
- average speed
- day of week
- month
- hour
- holiday dummy code

In [15]:
# Include trip time:
df['trip_time'] = (df['dropoff_datetime'] - df['pickup_datetime']).astype('timedelta64[m]')
# filter out impossible trip times
df = df[df['trip_time'] > 0]

# Create average speed
df['avg_speed'] = df["trip_distance"]/(df["trip_time"]/60)

# Create day of pickup
df['day'] = df['pickup_datetime'].dt.day_name()

# Create hour of pick up 
df['hour'] = df['pickup_datetime'].dt.hour

# Create month of pick up 
df['month'] = df['pickup_datetime'].dt.month_name()

In [16]:
# Create holiday
cal = calendar()
holidays = cal.holidays(start=df['date'].min(), end=df['date'].max(), return_name=True)
holidays = holidays.reset_index(name='holiday').rename(columns={'index':'date'})
holidays['date'] = pd.to_datetime(holidays['date'])
df = pd.merge(df, holidays, on = 'date', how='left')

# add nye
df['holiday_NYE'] = np.where((pd.to_datetime(df['date']).dt.month == 12) & \
                     (pd.to_datetime(df['date']).dt.day == 31), 1, 0)


In [17]:
df.head()

Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,Min.temp,Avg.wind.speed,Gust.speed,trip_time,avg_speed,day,hour,month,holiday,holiday_NYE
0,2.0,2019-07-14 01:24:36,2019-07-14 01:49:58,N,1.0,25,142,1.0,8.43,27.0,...,76.0,4.9,13.0,25.0,20.232,Sunday,1,July,,0
1,2.0,2019-02-19 11:49:32,2019-02-19 12:07:04,N,1.0,197,19,1.0,9.67,28.0,...,23.0,,,17.0,34.129412,Tuesday,11,February,,0
2,2.0,2019-03-07 16:31:49,2019-03-07 16:44:39,N,1.0,130,132,1.0,5.21,16.0,...,18.0,,,12.0,26.05,Thursday,16,March,,0
3,1.0,2019-07-02 21:14:37,2019-07-02 21:38:04,N,1.0,116,3,1.0,9.8,30.0,...,71.0,2.5,8.9,23.0,25.565217,Tuesday,21,July,,0
4,2.0,2019-05-15 00:22:26,2019-05-15 00:27:13,N,1.0,145,226,1.0,1.36,6.0,...,44.0,3.8,10.1,4.0,20.4,Wednesday,0,May,,0


## Varailbe Pre-Processing

* limit df to only features and outcome variable 
* one-hot encode all categorical variables
* remove reference classes when nessesary 
* Pull out final test set
* Create Train and Validation sets
* Create Scale function from training data
* apply scale function to train, validation and test sets

In [18]:
df.describe
df.isna().sum()

VendorID                       0
pickup_datetime                0
dropoff_datetime               0
store_and_fwd_flag             0
RatecodeID                     0
PULocationID                   0
DOLocationID                   0
passenger_count                0
trip_distance                  0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
improvement_surcharge          0
total_amount                   0
payment_type                   0
congestion_surcharge           0
Source_file                    0
cab                            0
borough                    26879
zone                       26879
date                           0
Precipitation                  3
Snow.depth                     3
Snowfall                   15547
Max.temp                       3
Min.temp                       3
Avg.wind.speed            956710
Gust.speed                956710
trip_time 

In [19]:
# retain only neccessary variables

cols = ['tip_amount', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tolls_amount', 'improvement_surcharge', 'congestion_surcharge',  'cab', 'borough',
        'trip_time', 'avg_speed','month', 'day', 'hour','holiday','holiday_NYE', 'zone',
        'Precipitation', 'Snow.depth', 'Snowfall', 'Max.temp', 'Min.temp','Avg.wind.speed', 'Gust.speed']

df = df.loc[:, cols]

del cols


In [20]:
# one-hot encode categorical variables

# create dummys without na class
df = pd.get_dummies(df, dummy_na= False,  \
                    columns= ['passenger_count','cab', 'day','hour','month','improvement_surcharge',
                              'congestion_surcharge', 'mta_tax', 'extra'])

# create dummies with na class
df = pd.get_dummies(df, dummy_na= True, columns= ['holiday','zone','borough'])

# clean up column names
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.replace('.0$', '')
df.columns = df.columns.str.replace('/', '_')
df.columns = df.columns.str.replace('.', '_')

### Split into train validation and test sets

In [29]:
# pull 10% of data to store as test
test_df = df.sample(frac=0.1, random_state=44)

# remove test set from data
df = df.drop(test_df.index)

# create train set with 70% of remaining cases
train_df = df.sample(frac=.7, random_state=44)

# create validation set
validation_df = df.drop(train_df.index)

### Scale Continous Values

In [30]:
# obtain scales from train set

from sklearn import preprocessing

continous = train_df[['trip_distance', 'fare_amount','tolls_amount', 
                      'trip_time','avg_speed','Precipitation', 
                      'Snow_depth', 'Snowfall','Max_temp',
                      'Min_temp','Avg_wind_speed','Gust_speed']]

scaler = preprocessing.MinMaxScaler().fit(continous)
continous = scaler.transform(continous)


train_df[['trip_distance', 'fare_amount','tolls_amount', 
                      'trip_time','avg_speed','Precipitation', 
                      'Snow_depth', 'Snowfall','Max_temp',
                      'Min_temp','Avg_wind_speed','Gust_speed']] = continous

In [31]:
# apply scale to validation and test set

# validation
from sklearn import preprocessing

apply_scale = validation_df[['trip_distance', 'fare_amount','tolls_amount', 
                      'trip_time','avg_speed','Precipitation', 
                      'Snow_depth', 'Snowfall','Max_temp',
                      'Min_temp','Avg_wind_speed','Gust_speed']]

apply_scale = scaler.transform(apply_scale)


validation_df[['trip_distance', 'fare_amount','tolls_amount', 
                      'trip_time','avg_speed','Precipitation', 
                      'Snow_depth', 'Snowfall','Max_temp',
                      'Min_temp','Avg_wind_speed','Gust_speed']] = apply_scale

# test 
apply_scale = test_df[['trip_distance', 'fare_amount','tolls_amount', 
                      'trip_time','avg_speed','Precipitation', 
                      'Snow_depth', 'Snowfall','Max_temp',
                      'Min_temp','Avg_wind_speed','Gust_speed']]

apply_scale = scaler.transform(apply_scale)


test_df[['trip_distance', 'fare_amount','tolls_amount', 
                      'trip_time','avg_speed','Precipitation', 
                      'Snow_depth', 'Snowfall','Max_temp',
                      'Min_temp','Avg_wind_speed','Gust_speed']] = apply_scale


In [32]:
validation_df

Unnamed: 0,tip_amount,trip_distance,fare_amount,tolls_amount,trip_time,avg_speed,holiday_NYE,Precipitation,Snow_depth,Snowfall,...,zone_Yorkville_East,zone_Yorkville_West,zone_nan,borough_Bronx,borough_Brooklyn,borough_EWR,borough_Manhattan,borough_Queens,borough_Staten_Island,borough_nan
1,0.00,0.343162,0.304272,0.000000,0.006390,0.024518,0,0.000000,0.000000,0.000,...,0,0,0,0,0,0,0,1,0,0
2,2.67,0.184725,0.173823,0.000000,0.004393,0.018714,0,0.000000,0.307692,0.000,...,0,0,0,0,0,0,0,1,0,0
9,5.48,0.230906,0.217306,0.098710,0.005192,0.020043,0,0.000000,0.000000,0.000,...,0,0,0,0,0,0,1,0,0,0
11,0.00,0.092007,0.135776,0.000000,0.005591,0.007471,0,0.000000,0.000000,0.000,...,0,0,0,0,0,0,0,1,0,0
12,0.00,0.500533,0.483640,0.000000,0.018371,0.012931,0,0.000000,0.000000,0.000,...,0,0,0,0,1,0,0,0,0,0
17,2.31,0.063588,0.086857,0.000000,0.002396,0.011083,0,0.000000,0.000000,0.000,...,0,0,0,0,0,0,1,0,0,0
23,10.10,0.493428,0.510816,0.000000,0.019569,0.011982,0,0.000000,0.000000,0.000,...,0,0,0,0,0,0,1,0,0,0
28,3.06,0.100888,0.157517,0.000000,0.007188,0.006465,0,0.000000,0.000000,0.000,...,0,0,0,0,1,0,0,0,0,0
33,5.21,0.192895,0.244483,0.000000,0.011981,0.007564,0,0.295082,0.000000,0.000,...,0,0,0,0,0,0,1,0,0,0
37,1.00,0.040142,0.081422,0.000000,0.003195,0.005459,0,0.000000,0.000000,0.000,...,0,0,0,0,0,0,1,0,0,0


### Write dfs to Data Directory 

In [None]:
test_df.to_csv('Data/test.csv')
validation_df.to_csv('Data/validation.csv')
train_df.to_csv('Data/train.csv')