In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

## Read in weather data

In [2]:
# Load the weather observations from the edited CSV file into a DataFrame.
df_weather = pd.read_csv(filepath_or_buffer='weather_edit.csv')

In [3]:
# Show the last five weather rows as a quick sanity check of the load.
df_weather.tail(n=5)

Unnamed: 0,pickup_datetime,tempm,precipm,precip_type
535,2016-12-17 20:51:00,2.8,,clear
536,2016-12-17 21:51:00,2.8,,clear
537,2016-12-17 22:17:00,3.3,,clear
538,2016-12-17 22:51:00,3.9,,clear
539,2016-12-17 23:51:00,3.9,,clear


In [4]:
# Missing precipitation readings come through as NaN; replace them with 0.
df_weather['precipm'] = df_weather['precipm'].fillna(value=0)

In [5]:
%%time
# Parse the timestamp strings into datetime values.
df_weather['pickup_datetime'] = pd.to_datetime(df_weather['pickup_datetime'])

CPU times: user 3.13 ms, sys: 989 µs, total: 4.11 ms
Wall time: 3.27 ms


## Read in taxi data

In [6]:
# Read the edited taxi data in pieces of 100,000 rows each and collect the
# pieces in a list (read_csv with `chunksize` returns an iterator of DataFrames).
chunklist = list(pd.read_csv('taxi_edit.csv', chunksize=100000))

In [7]:
# Stack the chunks row-wise into a single taxi DataFrame.
df_taxi = pd.concat(objs=chunklist, axis=0)

In [8]:
# Show the last five taxi rows as a quick sanity check of the load.
df_taxi.tail(n=5)

Unnamed: 0,pickup,dropoff,trip_distance,trip_time,speed
1688275,2016-12-17 23:59:58,2016-12-18 00:30:51,4.4,0 days 00:30:53.000000000,8.5483
1688276,2016-12-17 23:59:58,2016-12-18 00:22:15,10.2,0 days 00:22:17.000000000,27.464473
1688277,2016-12-17 23:59:58,2016-12-18 00:14:03,1.4,0 days 00:14:05.000000000,5.964497
1688278,2016-12-17 23:59:59,2016-12-18 00:13:56,2.4,0 days 00:13:57.000000000,10.322581
1688279,2016-12-17 23:59:59,2016-12-18 00:06:58,1.6,0 days 00:06:59.000000000,13.747017


In [9]:
%%time
# Parse the pickup and dropoff timestamp strings into datetime values.
for time_col in ('pickup', 'dropoff'):
    df_taxi[time_col] = pd.to_datetime(df_taxi[time_col])

CPU times: user 1.78 s, sys: 173 ms, total: 1.95 s
Wall time: 1.6 s


In [10]:
# Summarize column dtypes and non-null counts to see which columns have missing values.
df_taxi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1688280 entries, 0 to 1688279
Data columns (total 5 columns):
pickup           1688280 non-null datetime64[ns]
dropoff          1688280 non-null datetime64[ns]
trip_distance    1688280 non-null float64
trip_time        1688280 non-null object
speed            1684583 non-null float64
dtypes: datetime64[ns](2), float64(2), object(1)
memory usage: 64.4+ MB


In [11]:
# Keep only the taxi rows that have no missing value in any column.
df_taxi = df_taxi.dropna(axis=0, how='any')

## Merge the two dataframes on the nearest pickup datetime.

In [12]:
# Show the first five taxi rows before merging.
df_taxi.head(n=5)

Unnamed: 0,pickup,dropoff,trip_distance,trip_time,speed
0,2016-02-05 00:00:00,2016-02-05 00:08:02,1.5,0 days 00:08:02.000000000,11.20332
1,2016-02-05 00:00:01,2016-02-05 00:14:36,3.0,0 days 00:14:35.000000000,12.342857
2,2016-02-05 00:00:02,2016-02-05 00:21:48,4.0,0 days 00:21:46.000000000,11.026034
3,2016-02-05 00:00:02,2016-02-05 00:02:20,0.9,0 days 00:02:18.000000000,23.478261
4,2016-02-05 00:00:03,2016-02-05 00:05:17,0.6,0 days 00:05:14.000000000,6.878981


In [13]:
# Show the first five weather rows before merging.
df_weather.head(n=5)

Unnamed: 0,pickup_datetime,tempm,precipm,precip_type
0,2016-02-04 23:51:00,6.7,0.0,clear
1,2016-02-05 00:51:00,5.0,0.3,clear
2,2016-02-05 01:51:00,4.4,0.0,rain
3,2016-02-05 02:51:00,2.8,0.5,snow
4,2016-02-05 03:40:00,1.7,1.0,snow


In [14]:
# Give the weather timestamp column the same name as the taxi pickup column
# so both frames share the key 'pickup'.
df_weather = df_weather.rename(columns={'pickup_datetime': 'pickup'})

In [15]:
# Confirm the timestamp column is now called 'pickup'.
df_weather.head(n=5)

Unnamed: 0,pickup,tempm,precipm,precip_type
0,2016-02-04 23:51:00,6.7,0.0,clear
1,2016-02-05 00:51:00,5.0,0.3,clear
2,2016-02-05 01:51:00,4.4,0.0,rain
3,2016-02-05 02:51:00,2.8,0.5,snow
4,2016-02-05 03:40:00,1.7,1.0,snow


In [16]:
%%time 
# Attach to every taxi trip the weather observation whose timestamp is closest
# to the trip's pickup time, whether earlier or later (direction='nearest').
# merge_asof raises a ValueError unless both frames are sorted on the key, so
# sort explicitly instead of relying on the row order of the CSV files.
# mergesort is stable, so rows that share a timestamp keep their original
# relative order and already-sorted input passes through unchanged.
# NOTE(review): no `tolerance` is set, so a trip is matched to the closest
# observation however far away in time it is -- confirm the weather file
# covers every period present in the taxi data.
df_merged = pd.merge_asof(
    df_taxi.sort_values('pickup', kind='mergesort'),
    df_weather.sort_values('pickup', kind='mergesort'),
    on='pickup',
    direction='nearest',
)

CPU times: user 307 ms, sys: 76.1 ms, total: 383 ms
Wall time: 415 ms


In [17]:
# Show the first five merged rows: trip columns followed by the matched weather columns.
df_merged.head(n=5)

Unnamed: 0,pickup,dropoff,trip_distance,trip_time,speed,tempm,precipm,precip_type
0,2016-02-05 00:00:00,2016-02-05 00:08:02,1.5,0 days 00:08:02.000000000,11.20332,6.7,0.0,clear
1,2016-02-05 00:00:01,2016-02-05 00:14:36,3.0,0 days 00:14:35.000000000,12.342857,6.7,0.0,clear
2,2016-02-05 00:00:02,2016-02-05 00:21:48,4.0,0 days 00:21:46.000000000,11.026034,6.7,0.0,clear
3,2016-02-05 00:00:02,2016-02-05 00:02:20,0.9,0 days 00:02:18.000000000,23.478261,6.7,0.0,clear
4,2016-02-05 00:00:03,2016-02-05 00:05:17,0.6,0 days 00:05:14.000000000,6.878981,6.7,0.0,clear


In [18]:
# Save the merged table to CSV; index=False leaves the row index out of the file.
df_merged.to_csv(path_or_buf='data_merged.csv', index=False)