# Problem 1: When should I take a taxi to reach my destination fastest?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
import os
# Directory holding the yellow-taxi trip CSV files. The original hard-coded
# location stays the default; set the NYTAXI_DATA_DIR environment variable to
# point the notebook at another copy of the data without editing this cell.
nytaxi_directory = os.environ.get('NYTAXI_DATA_DIR', '/srv/taxi-data-csv')

In [None]:
# Quick first look: read only the first 100000 rows of the January 2017 file
sample_csv = os.path.join(nytaxi_directory, 'yellow_tripdata_2017-01.csv')
df = pd.read_csv(sample_csv, nrows=100000)

In [None]:
# Let's have a look at the column types pandas inferred while reading the CSV
df.dtypes

In [None]:
# Re-read the file, this time asking pandas to parse the pickup and drop-off
# timestamp columns as datetimes, and taking 1000000 rows.
datetime_columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']
df = pd.read_csv(
    os.path.join(nytaxi_directory, 'yellow_tripdata_2017-01.csv'),
    nrows=1000000,
    parse_dates=datetime_columns,
)

In [None]:
# Trip duration in seconds: drop-off time minus pickup time, stored as a new column
elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['trip_duration'] = elapsed.dt.total_seconds()

In [None]:
# Histogram of the raw trip durations (in seconds), using 100 bins
df['trip_duration'].hist(bins=100)

In [None]:
# Restricted to trips shorter than 4000 s, the histogram reveals a very nice distribution
short_durations = df.loc[df.trip_duration < 4000, 'trip_duration']
short_durations.hist(bins=100)

In [None]:
# The distance distribution (trips with trip_distance below 40) shows an interesting structure
short_distances = df.loc[df.trip_distance < 40, 'trip_distance']
short_distances.hist(bins=100)

In [None]:
# Median trip duration, converted from seconds to minutes
median_seconds = df.trip_duration.median()
median_seconds / 60.0

In [None]:
# Cut away the outliers again: keep trips with a positive duration below
# 4000 s and a trip_distance below 25
plausible_duration = (df.trip_duration > 0) & (df.trip_duration < 4000)
plausible_distance = df.trip_distance < 25
df_cut = df[plausible_duration & plausible_distance]

In [None]:
# Distance against duration for the selected trips; tiny markers (s=1) for the dense cloud
fig, ax = plt.subplots()
ax.scatter(df_cut.trip_duration, df_cut.trip_distance, s=1)

In [None]:
from matplotlib.colors import LogNorm
import seaborn as sns

# 2D histogram of trip distance vs. trip duration: cut both quantities into
# 100 intervals and count the trips in each (distance, duration) cell.
# tip_amount is only used as a column to count.
heatmap_df = df_cut.groupby([pd.cut(df_cut.trip_distance, 100), pd.cut(df_cut.trip_duration, 100)]).tip_amount.count()

# Depending on the pandas version, empty cells come back as 0 or as NaN.
# A zero must not reach LogNorm (log(0) is undefined, vmin must be positive),
# so blank out the empty cells -- seaborn masks NaN cells automatically --
# and take the colour limits from the populated cells only.
heatmap_grid = heatmap_df.unstack()
heatmap_grid = heatmap_grid.where(heatmap_grid > 0)
populated = heatmap_df[heatmap_df > 0]
ax = sns.heatmap(heatmap_grid, norm=LogNorm(vmin=populated.min(), vmax=populated.max()))

In [None]:
# Calculate and store the average speed in distance units per hour
# (trip_duration is in seconds, hence the factor 3600).
# Trips with a zero or negative recorded duration have no meaningful speed and
# would otherwise produce inf, NaN or negative values, so their speed is NaN.
valid_duration = df.trip_duration.where(df.trip_duration > 0)
df['speed'] = df.trip_distance / valid_duration * 3600

In [None]:
# "Zoom in" on that interesting structure: positive durations below 4000 s,
# trip_distance strictly between 8 and 25, and speed below 70
in_duration_window = (df.trip_duration > 0) & (df.trip_duration < 4000)
in_distance_window = (df.trip_distance > 8) & (df.trip_distance < 25)
df_cut = df[in_duration_window & in_distance_window & (df.speed < 70)]

In [None]:
# Is the speed distribution different for longer and shorter rides?
# Both groups share the same 100 bin edges so their bars are directly
# comparable, and rides with a distance of exactly 15 are counted as long
# rides so that no ride is left out of the comparison.
fig, ax = plt.subplots()
speed_bins = np.linspace(df_cut.speed.min(), df_cut.speed.max(), 101)
df_cut[df_cut.trip_distance < 15].speed.hist(ax=ax, bins=speed_bins, alpha=0.5, label='trip_distance < 15')
df_cut[df_cut.trip_distance >= 15].speed.hist(ax=ax, bins=speed_bins, alpha=0.5, label='trip_distance >= 15')
ax.set_xlabel('speed')
ax.set_ylabel('number of trips')
# Uncomment for a logarithmic count axis:
#ax.set_yscale('log')
ax.legend();

In [None]:
# Now let's look for traffic jams: is there a speed dependency on the time of day?
# Select the plausible trips first (positive duration below 4000 s,
# trip_distance below 25, speed below 70) ...
df_cut = df[(df['trip_duration'] > 0) & (df['trip_duration'] < 4000) & (df.trip_distance < 25)]
df_cut = df_cut[df_cut.speed < 70]
# ... then check how many of them fall into each hour of the day.
# .dt.hour only takes the integer values 0..23, so use one bin per hour
# (edges at -0.5, 0.5, ..., 23.5) rather than 100 mostly empty bins.
# NOTE(review): this uses the drop-off hour, while the regplot and heatmap
# cells further down use the pickup hour -- confirm which one is intended.
df_cut.tpep_dropoff_datetime.dt.hour.hist(bins=np.arange(25) - 0.5)

In [None]:
# Speed of every selected trip against the hour of day of its drop-off
dropoff_hour = df_cut.tpep_dropoff_datetime.dt.hour
plt.scatter(dropoff_hour, df_cut.speed, s=1)

In [None]:
# Mean speed (with confidence interval) for every pickup hour.
# The hour is a discrete variable, so estimate per unique value with
# x_estimator instead of x_bins=50: percentile-based binning of a variable
# with only 24 distinct values can merge sparsely populated hours into their
# neighbours. fit_reg is a boolean switch; False disables the regression line.
pickup_hour = df_cut.tpep_pickup_datetime.dt.hour
sns.regplot(x=pickup_hour, y=df_cut.speed, x_estimator=np.mean, fit_reg=False)

In [None]:
# Heatmap of trip counts per (pickup hour, speed interval) on a logarithmic colour scale.
# The hour is already discrete (0..23), so group by it directly; cutting it
# into 100 intervals would leave most rows of the heatmap empty.
pickup_hour = df_cut.tpep_pickup_datetime.dt.hour.rename('pickup_hour')
heatmap_df = df_cut.groupby([pickup_hour, pd.cut(df_cut.speed, 100)]).speed.count()

# Depending on the pandas version, empty cells come back as 0 or as NaN.
# A zero must not reach LogNorm (log(0) is undefined, vmin must be positive),
# so blank out the empty cells -- seaborn masks NaN cells automatically --
# and take the colour limits from the populated cells only.
heatmap_grid = heatmap_df.unstack()
heatmap_grid = heatmap_grid.where(heatmap_grid > 0)
populated = heatmap_df[heatmap_df > 0]
ax = sns.heatmap(heatmap_grid, norm=LogNorm(vmin=populated.min(), vmax=populated.max()))

# Now it's your turn!
Explore a bit further:
* Can you spot interesting correlations with the speed?
* How is the distance correlated with the cost of a ride?
* Are tips higher in high-speed taxis?
