In [1]:
import pandas as pd
import numpy as np
from scipy import stats 
from datetime import datetime

## Read in the raw taxi data. Since the data is quite large, we need to load it in chunks.

In [2]:
%%time
#read the original taxi trip data csv file into chunklist
chunklist = []
for chunk in pd.read_csv('2016_Yellow_Taxi_Trip_Data.csv', chunksize=100000):
     chunklist.append(chunk)

CPU times: user 4min 2s, sys: 45.1 s, total: 4min 47s
Wall time: 4min 50s


In [3]:
%%time
for chunk in chunklist:
    chunk['tpep_pickup_datetime'] = pd.to_datetime(chunk['tpep_pickup_datetime'], infer_datetime_format = True)
    chunk['tpep_dropoff_datetime'] = pd.to_datetime(chunk['tpep_dropoff_datetime'], infer_datetime_format = True)

CPU times: user 4h 31min 13s, sys: 49.1 s, total: 4h 32min 2s
Wall time: 4h 32min 38s


In [4]:
%%time
#combine the chunks into a dataframe
df_taxi = pd.concat(chunklist)

CPU times: user 25.6 s, sys: 1min 20s, total: 1min 45s
Wall time: 3min 18s


## Read in weather data 

In [6]:
# Load the weather data that is later merged onto the taxi trips.
weather_csv_path = 'weather_edit.csv'
df_weather = pd.read_csv(weather_csv_path)

### Rename the pickup datetime column in both datasets to `pickup` so that the taxi and weather data can be merged on it

In [8]:
# Give the weather timestamp column the name 'pickup', which is used as the merge key.
df_weather = df_weather.rename(columns={'pickup_datetime': 'pickup'})

In [9]:
# Convert the weather 'pickup' column to datetimes.
weather_pickup_times = pd.to_datetime(df_weather['pickup'])
df_weather['pickup'] = weather_pickup_times

In [10]:
# Rename the taxi pickup timestamp column to 'pickup', modifying df_taxi in place.
df_taxi.rename({'tpep_pickup_datetime': 'pickup'}, axis='columns', inplace=True)

### Sort the taxi data by pickup time (`merge_asof` requires its inputs to be sorted on the merge key)

In [11]:
%%time
# Order the taxi trips by their 'pickup' timestamp, modifying df_taxi in place.
df_taxi.sort_values('pickup', inplace=True)

CPU times: user 1min 20s, sys: 1min 7s, total: 2min 28s
Wall time: 3min 25s


## Merge the Taxi and Weather data on the pickup column

In [12]:
%%time 
df_merged = pd.merge_asof(df_taxi, df_weather, on = 'pickup', direction = 'nearest')

CPU times: user 1min 9s, sys: 3min 33s, total: 4min 42s
Wall time: 7min 31s


## Write the merged data to a csv file

In [14]:
# Save the merged taxi + weather table as a CSV file without the index column.
merged_csv_path = 'data_merged_all.csv'
df_merged.to_csv(merged_csv_path, index=False)