# New York Green Cab 2015/09 data

### data source: http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml

In [None]:
import pickle
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib
from dateutil import parser
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV 
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
%matplotlib inline

# Marker output: reaching this line means every import above succeeded.
print('done')

In [None]:
# Read the raw trip records and report the frame's (rows, columns) shape.

raw_csv_path = '../data/green_tripdata_2015-09.csv'
df = pd.read_csv(raw_csv_path)
print('The shape of the dataframe is', df.shape)

In [None]:
# Extract time details from the original dataset.
#
# Each timestamp column is parsed exactly once with pd.to_datetime and its
# parts are read through the vectorised .dt accessor, rather than re-parsing
# every row with dateutil for each derived column.

pickup_time = pd.to_datetime(df['lpep_pickup_datetime'])
dropoff_time = pd.to_datetime(df['Lpep_dropoff_datetime'])

# Derived columns are created in the order day, hour, min, sec for the pickup
# and then for the drop-off, so the column layout of the cleaned file is stable.
for prefix, timestamps in (('lpep_pickup', pickup_time), ('lpep_dropoff', dropoff_time)):
    df[prefix + '_day'] = timestamps.dt.day
    df[prefix + '_hour'] = timestamps.dt.hour
    df[prefix + '_min'] = timestamps.dt.minute
    df[prefix + '_sec'] = timestamps.dt.second

# Days of September 2015 that are a Saturday or Sunday, plus Labor Day
# (Monday, 7 September). The 1st was a Tuesday, so weekends fall on
# 5-6, 12-13, 19-20 and 26-27.
weekend_holiday_days = [5, 6, 7, 12, 13, 19, 20, 26, 27]
df['weekend_holiday_flag'] = df['lpep_pickup_day'].isin(weekend_holiday_days)

# save cleaned csv file
df.to_csv('../data/green_cleaned.csv', index=False)

In [None]:
# Reload the cleaned dataset (raw columns plus the derived time columns).
cleaned_csv_path = '../data/green_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

**Summary:**
* The original dataset has 1494926 rows and 21 columns.
* I parsed two columns, lpep_pickup_datetime and Lpep_dropoff_datetime, to extract the day and time (hour, minute, second) of every pick-up and drop-off, and derived a weekend_holiday_flag from the pick-up day (9 new columns were added to the cleaned dataset).

In [None]:
# Histograms of Trip_distance: first the full range on log-log axes, then a
# linear view restricted to trips of at most 20 miles.

# Quick look at the extremes, and at how many records already hold the value
# 0.001 that is used below as the stand-in for zero.
distances = df['Trip_distance']
print(distances.max())
print(distances.min())
print(int((distances == 0.001).sum()))

# Work on a copy so the dataframe itself is left untouched; zeros become 0.001
# so that they survive the log10 transform and show up in the leftmost bin.
trip_distance = np.array(distances)
zero_mask = trip_distance == 0
trip_distance[zero_mask] = 0.001

n_bins = 50

# --- all records -----------------------------------------------------------
# A few trips are extremely long, so both the distance (log10 of the values)
# and the counts (log=True) are shown on a log scale to keep the trend visible.
fig_all, ax_all = plt.subplots(figsize=(8, 5))
n, bins, patches = ax_all.hist(
    np.log10(trip_distance),
    n_bins,
    alpha=0.5,
    edgecolor='k',
    color='green',
    log=True,
)
ax_all.set_xlabel('Trip distance (Miles)')
ax_all.set_ylabel('count')
# Tick positions are in log10 units; the labels show the matching distances.
ax_all.set_xticks([-3, -2, -1, 0, 1, 2, 3])
ax_all.set_xticklabels(['0.001(0)', '0.01', '0.1', '1', '10', '100', '1000'])
ax_all.set_title('Histogram: trip distance of new york green cab')
fig_all.savefig('../figs_reports/trip_distance_contributions.png', dpi=200)

# --- trips of at most 20 miles ---------------------------------------------
short_trips = trip_distance[trip_distance <= 20]
fig_short, ax_short = plt.subplots(figsize=(8, 5))
n, bins, patches = ax_short.hist(
    short_trips,
    n_bins,
    alpha=0.5,
    edgecolor='k',
    color='green',
)
ax_short.set_xlabel('Trip distance (Miles)')
ax_short.set_ylabel('count')
ax_short.set_title('Histogram: trip distance (shorter than 20 miles) of new york green cab')
fig_short.savefig('../figs_reports/trip_distance_contributions_without_outlier.png', dpi=200)

**Summary:**

* Most of the trip distance records (~98% of the total) fall roughly between 0.3 and 15 miles.
* There are ~20000 records with a trip distance of 0 miles. These records might come from situations in which people asked for a taxi but eventually did not take it.
* Only a couple of records have very long trip distances (longer than 100 miles). The maximum trip distance is 603.1 miles.
* The hypothesis: the trip distances are not random (long trips excluded). The distribution is skewed to the right; if the distances were random, a Gaussian distribution would be expected. This observation might be related to the fact that the majority of the rides stay within local areas (i.e. Manhattan).

In [None]:
# Mean and median trip distance for each hour of the day.
# Trips are assigned to an hour by their pick-up time.

hour_mean = df.groupby('lpep_pickup_hour', as_index=False)['Trip_distance'].mean().round(2)
hour_median = df.groupby('lpep_pickup_hour', as_index=False)['Trip_distance'].median()

# Tick positions are pinned explicitly so each label is guaranteed to sit on
# the hour it names, independent of the locations matplotlib's automatic
# locator would otherwise pick.
hour_ticks = [0, 5, 10, 15, 20]
hour_tick_labels = ['0:00', '5:00', '10:00', '15:00', '20:00']

# Hour ranges shaded in both panels (midnight, early morning, late evening).
highlighted_hours = [(-0.5, 0.5), (2.5, 7.5), (21.5, 23.5)]

fig, ax = plt.subplots(1, 2, figsize=(14, 4))

# One panel per statistic: (axes, hourly table, label used in texts, colour).
panels = [
    (ax[0], hour_mean, 'Mean', 'red'),
    (ax[1], hour_median, 'Median', 'green'),
]
for panel, hourly, statistic, colour in panels:
    panel.scatter(hourly['lpep_pickup_hour'], hourly['Trip_distance'], c=colour)
    for band_start, band_end in highlighted_hours:
        panel.axvspan(band_start, band_end, alpha=0.3, color=colour)
    panel.set_xticks(hour_ticks)
    panel.set_xticklabels(hour_tick_labels)
    panel.set_xlabel('Hour of the day')
    panel.set_ylabel(statistic + ' trip distance (Mile)')
    panel.set_title(statistic + ' trip distance in different hours of the day during sept. 2015')

# The output file name is kept exactly as before so existing references to the
# saved figure keep working.
plt.savefig('../figs_reports/trip_distance_hour_ditribution.png', dpi=200)

In [None]:
# Trips from/to NYC area airports.

# Airport trips are picked out through RateCodeID (code 2 for JFK, code 3 for Newark).
airport_rate_codes = [2, 3]
airport_trips = df[df['RateCodeID'].isin(airport_rate_codes)]
print("Number of trips to/from NYC airports: ", len(airport_trips))

# Average metered fare and average total charge of the selected trips.
mean_fare = round(airport_trips['Fare_amount'].mean(), 2)
mean_total = round(airport_trips['Total_amount'].mean(), 2)
print("Average time-distance fare (calculated by the meter) of these trips: $", mean_fare,"per trip.")
print("Average total charged amount (without cash tip) of these trips: $", mean_total,"per trip.")

In [None]:
# Explore other features of the trips from/to NYC area airports: how airport
# and non-airport trips are distributed over the hours of the day (left panel)
# and over the days of the month (right panel).

fig, ax = plt.subplots(1, 2, figsize=(14, 4))

# Boolean mask of the rows that are NOT airport trips; computed once and
# reused for both panels.
not_airport = ~df.index.isin(airport_trips.index)

# Each panel shows the normalised trip counts of airport trips first and
# non-airport trips second, which is the order the legend entries rely on.
for panel, column in ((ax[0], 'lpep_pickup_hour'), (ax[1], 'lpep_pickup_day')):
    airport_trips[column].value_counts(normalize=True).sort_index().plot(ax=panel)
    df.loc[not_airport, column].value_counts(normalize=True).sort_index().plot(ax=panel)
    panel.set_ylabel('Normalized trips count')
    panel.legend(['Airport trips', 'Non-airport trips'], loc='best')

# Left panel: hours of the day. Tick positions are set explicitly so that each
# label is guaranteed to sit on the hour it names.
ax[0].set_xlabel('Hour of the day')
ax[0].set_xticks([0, 5, 10, 15, 20])
ax[0].set_xticklabels(['0:00', '5:00', '10:00', '15:00', '20:00'])
ax[0].set_title('Trip counts distribution during the day')

# Right panel: days of the month, with the Saturday/Sunday pairs shaded
# (5-6, 12-13, 19-20 and 26-27 September 2015).
ax[1].set_xlabel('Days of September 2015')
ax[1].set_title('Trip counts distribution during the month')
weekend_spans = [(4.5, 6.5), (11.5, 13.5), (18.5, 20.5), (25.5, 27.5)]
for span_start, span_end in weekend_spans:
    ax[1].axvspan(span_start, span_end, alpha=0.3, color='green')

plt.savefig('../figs_reports/airport_travel_features.png', dpi=200)


**Summary:**

**Mean and median of the day:** 
* Long-range trips occur in the early morning and around midnight (as shown by the red and green bands in the figure). The morning peak may be closely related to the morning rush-hour commute. Interestingly, the evening peak is lower than the morning peak, and it shows up around midnight (not during the evening rush hour). Two possible explanations:
* 1) People take cabs in the morning to avoid being late for work, and they are fine with taking public transportation after work.
* 2) The late-night peak might be caused by people going home from happy hour, parties, or other evening entertainment. These people probably live close to where they enjoy their evening life.

**Trips that originate or terminate at one of the NYC area airports:**
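* According to the dictionary of variables, RateCodeID indicates the final rate that was applied. Among its values are JFK (2) and Newark (3), two of the major airports serving the New York area. I used these two codes to select the trips from/to the airports. Note that trips to other airports (e.g. LaGuardia) have no dedicated rate code and are therefore not captured by this selection.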
* Number of trips to/from NYC airports:  5552
* Average time-distance fare (calculated by the meter) of these trips: 48.98 USD/trip.
* Average total charged amount (without cash tip) of these trips: 57.21 USD/trip.

**Interesting findings:**
* 1) The hourly distribution shows that the number of airport trips peaks around 3 PM (15:00), while the number of non-airport trips peaks ~2 hours later (rush hour). Also, the lowest count of airport rides occurs at about 2 AM, around the time the airports shut down. Non-airport rides, on the other hand, drop to their lowest level at about 5 AM.
* 2) The day-by-day distribution during the month shows **a very strong weekly seasonality**. The peaks of both airport and non-airport rides fall roughly on Fridays and Saturdays, and the counts of both drop dramatically on Sundays.
* 3) The count of airport rides reaches its maximum on Sept. 4th (the Friday before the Labor Day long weekend). People like to travel around during the long weekend!