In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
%%time
#read the original taxi trip data csv file into chunklist
chunklist = []
for chunk in pd.read_csv('2016_Yellow_Taxi_Trip_Data.csv', chunksize=100000):
     chunklist.append(chunk)

CPU times: user 4min 7s, sys: 49 s, total: 4min 56s
Wall time: 5min 6s


In [3]:
%%time
#combine the chunks into a dataframe
df_taxi = pd.concat(chunklist)

CPU times: user 49.2 s, sys: 3min 3s, total: 3min 53s
Wall time: 7min 59s


In [4]:
# First five rows of the full trip table.
df_taxi.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,...,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,PULocationID,DOLocationID
0,1,02/22/2016 06:14:21 PM,02/22/2016 06:27:21 PM,1,4.4,-74.012749,40.70261,1,N,-73.987106,...,1,15.0,1.0,0.5,1.0,0.0,0.3,17.8,,
1,1,02/10/2016 05:51:14 PM,02/10/2016 05:58:34 PM,1,1.1,-73.986183,40.757881,1,N,-73.98172,...,1,6.5,1.0,0.5,1.65,0.0,0.3,9.95,,
2,1,02/13/2016 02:56:14 PM,02/13/2016 03:13:35 PM,1,6.6,-73.98114,40.782425,1,N,-73.930641,...,2,22.0,0.0,0.5,0.0,0.0,0.3,22.8,,
3,1,02/15/2016 01:29:39 PM,02/15/2016 01:45:53 PM,2,2.2,-73.998619,40.764027,1,N,-73.98175,...,2,12.0,0.0,0.5,0.0,0.0,0.3,12.8,,
4,1,02/27/2016 11:59:52 PM,02/28/2016 12:04:18 AM,1,1.2,-73.959244,40.763435,1,N,-73.946564,...,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,,


In [5]:
# Column labels of the concatenated trip table.
df_taxi.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'PULocationID',
       'DOLocationID'],
      dtype='object')

In [6]:
# Discard the columns that are not kept: what remains is the pickup/dropoff
# timestamps, the trip distance and the two location IDs.
unused_columns = [
    'VendorID', 'passenger_count', 'RatecodeID', 'store_and_fwd_flag',
    'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
    'tolls_amount', 'improvement_surcharge', 'total_amount',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
df_taxi = df_taxi.drop(labels=unused_columns, axis='columns')

In [7]:
# First five rows of df_taxi.
df_taxi.head(n=5)

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,PULocationID,DOLocationID
0,02/22/2016 06:14:21 PM,02/22/2016 06:27:21 PM,4.4,,
1,02/10/2016 05:51:14 PM,02/10/2016 05:58:34 PM,1.1,,
2,02/13/2016 02:56:14 PM,02/13/2016 03:13:35 PM,6.6,,
3,02/15/2016 01:29:39 PM,02/15/2016 01:45:53 PM,2.2,,
4,02/27/2016 11:59:52 PM,02/28/2016 12:04:18 AM,1.2,,


In [8]:
%%time
#create a list of the differences between dropoff and pickup datetimes
#converts datetime string to datetime 
trip_times = pd.to_datetime(df_taxi['tpep_dropoff_datetime']) - pd.to_datetime(df_taxi['tpep_pickup_datetime'])

CPU times: user 4h 43min 27s, sys: 1min 22s, total: 4h 44min 50s
Wall time: 4h 46min 23s


In [9]:
# Peek at the first five trip durations.
trip_times.head(5)

0   00:13:00
1   00:07:20
2   00:17:21
3   00:16:14
4   00:04:26
dtype: timedelta64[ns]

In [10]:
# Speed of every trip: trip_distance / (trip time in seconds / 3600),
# i.e. distance covered per hour.
# The division is done on whole columns rather than in a Python loop over
# every row; the two Series are matched on their row labels, which is the
# same pairing the label-based per-row lookup produced.
# A zero-length trip yields inf (NaN when the distance is 0 as well) instead
# of raising, and a negative duration yields a negative speed.
# The result is kept as a plain list so that `speed_list` can still be
# sliced and assigned as a column exactly as before.
speed_list = (
    df_taxi['trip_distance'] / (trip_times.dt.total_seconds() / 3600)
).tolist()


  """
  """


In [11]:
# First five computed speeds.
speed_list[:5]

[20.30769230769231,
 9.000000000000002,
 22.824207492795388,
 8.131416837782341,
 16.240601503759397]

In [12]:
%%time
#create speed column in df_taxi dataframe
df_taxi['speed'] = speed_list

CPU times: user 12 s, sys: 17.9 s, total: 29.9 s
Wall time: 36.8 s


In [13]:
# First five rows of df_taxi, including the speed column.
df_taxi.head(n=5)

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,PULocationID,DOLocationID,speed
0,02/22/2016 06:14:21 PM,02/22/2016 06:27:21 PM,4.4,,,20.307692
1,02/10/2016 05:51:14 PM,02/10/2016 05:58:34 PM,1.1,,,9.0
2,02/13/2016 02:56:14 PM,02/13/2016 03:13:35 PM,6.6,,,22.824207
3,02/15/2016 01:29:39 PM,02/15/2016 01:45:53 PM,2.2,,,8.131417
4,02/27/2016 11:59:52 PM,02/28/2016 12:04:18 AM,1.2,,,16.240602


In [15]:
# Summary statistics of the speed column.
# The minimum and maximum are not plausible: a speed below 0 makes no sense,
# and anything over 60 is extremely fast for a city like New York.
df_taxi['speed'].describe()

count    6.094906e+07
mean              inf
std               NaN
min     -3.698819e+08
25%      7.309645e+00
50%      1.005587e+01
75%      1.382199e+01
max               inf
Name: speed, dtype: float64

In [16]:
# Remove the rows whose speed is above 60 by keeping everything else.
# A NaN speed compares False against 60, so such a row (if any) is kept.
over_limit = df_taxi['speed'] > 60
df_taxi = df_taxi.loc[~over_limit]

In [17]:
# Remove the rows whose speed is below 0 by keeping everything else.
# A NaN speed compares False against 0, so such a row (if any) is kept.
negative_speed = df_taxi['speed'] < 0
df_taxi = df_taxi.loc[~negative_speed]

In [18]:
# Summary statistics of the speed column.
df_taxi['speed'].describe()

count    6.083116e+07
mean     1.138239e+01
std      6.206513e+00
min      0.000000e+00
25%      7.305524e+00
50%      1.004651e+01
75%      1.378896e+01
max      6.000000e+01
Name: speed, dtype: float64

In [19]:
%%time
#write the dataframe to a csv file
df_taxi.to_csv('taxi_edit.csv', index = False)

CPU times: user 8min 43s, sys: 1min 22s, total: 10min 6s
Wall time: 11min 51s
