QUICK NOTES: convert month, day, hour, etc. to object dtype??

 # This notebook is to join both datasets: Taxis and Weather

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import date
import matplotlib.pyplot as plt
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Let´s start by joining just one year: 2019

## Taxis Dataset: Import & Sanity check:
- Confirm that the year is correct.
- Confirm that hourly periods count is correct

In [35]:
year = 2017
dftax = pd.read_csv('../data/Data_Taxis_'+str(year)+'_Cleaned.csv', sep=',',
                        dtype = {"PULocationID" : "object"},
                        parse_dates={'datetime':['pickup_datetime']})
print('Year should be unique: ', dftax.year.unique())

dftax['hourlyperiods'] = 1
h = dftax.groupby(['month','day','hour'])['hourlyperiods'].sum()
print('There should be 8760 hourly periods in a year: ', h.shape[0])

dftax.head(1000)

Year should be unique:  [2017]
There should be 8760 hourly periods in a year:  8760


Unnamed: 0,datetime,PULocationID,NoOfPickups,year,month,day,hour,week,dayofweek,isweekend,IsHoliday,hourlyperiods
0,2017-01-01 00:00:00,4,136,2017,1,1,0,52,6,1,0,1
1,2017-01-01 00:00:00,7,77,2017,1,1,0,52,6,1,0,1
2,2017-01-01 00:00:00,12,3,2017,1,1,0,52,6,1,0,1
3,2017-01-01 00:00:00,13,103,2017,1,1,0,52,6,1,0,1
4,2017-01-01 00:00:00,14,3,2017,1,1,0,52,6,1,0,1
5,2017-01-01 00:00:00,17,16,2017,1,1,0,52,6,1,0,1
6,2017-01-01 00:00:00,20,2,2017,1,1,0,52,6,1,0,1
7,2017-01-01 00:00:00,24,94,2017,1,1,0,52,6,1,0,1
8,2017-01-01 00:00:00,25,31,2017,1,1,0,52,6,1,0,1
9,2017-01-01 00:00:00,33,31,2017,1,1,0,52,6,1,0,1


## Weather Dataset: Import one year and Sanity check

In [21]:
# Filter one year data

In [10]:
# Import WEATHER DATASET to dataframe.
dfwea = pd.read_csv('../data/Data_Weather_Cleaned.csv', sep=',',
                        parse_dates={'datetime':['DATE']})

# Filter one year data
dfwea.drop(dfwea[dfwea['datetime'] < pd.Timestamp(date(year,1,1))].index, inplace=True)
dfwea.drop(dfwea[dfwea['datetime'] >= pd.Timestamp(date(year+1,1,1))].index, inplace=True)

# Sanity check
print('Year should be unique: ', dfwea.datetime.dt.year.unique())
print('There should be 8760 hourly periods in a year: ', dfwea.shape[0])

dfwea.head()

Year should be unique:  [2017]
There should be 8760 hourly periods in a year:  8760


Unnamed: 0,datetime,HourlyPrecipitation
0,2017-01-01 00:00:00,0.0
1,2017-01-01 01:00:00,0.0
2,2017-01-01 02:00:00,0.0
3,2017-01-01 03:00:00,0.0
4,2017-01-01 04:00:00,0.0


## Weather vs Taxis Danity Check

In [20]:
tyear = dftax.year.unique()
wyear = dfwea.datetime.dt.year.unique()
print('Taxis year ({0}) equals Weather year({1})? {2}'.format(tyear,wyear,tyear==wyear))
print('Taxis rows ({0}) equals Weather rows ({1})? {2}'.format(h.shape[0],dfwea.shape[0],h.shape[0]==dfwea.shape[0]))

Taxis year ([2017]) equals Weather year([2017])? [ True]
Taxis rows (8760) equals Weather rows (8760)? True


## Merge Taxis and Weather datasets

In [29]:
dftax.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1036854 entries, 0 to 1036853
Data columns (total 12 columns):
pickup_datetime    1036854 non-null datetime64[ns]
PULocationID       1036854 non-null object
NoOfPickups        1036854 non-null int64
year               1036854 non-null int64
month              1036854 non-null int64
day                1036854 non-null int64
hour               1036854 non-null int64
week               1036854 non-null int64
dayofweek          1036854 non-null int64
isweekend          1036854 non-null int64
IsHoliday          1036854 non-null int64
hourlyperiods      1036854 non-null int64
dtypes: datetime64[ns](1), int64(10), object(1)
memory usage: 94.9+ MB


In [30]:
dfwea.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8760 entries, 0 to 8759
Data columns (total 2 columns):
datetime               8760 non-null datetime64[ns]
HourlyPrecipitation    8760 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 205.3 KB


In [41]:
df_merge = pd.merge(dftax, dfwea, on='datetime')
# taxis dataframe and merged dataframe should have same number of rows
print(df_merge.shape[0], dftax.shape[0])
df_merge.head(100)

1036854 1036854


Unnamed: 0,datetime,PULocationID,NoOfPickups,year,month,day,hour,week,dayofweek,isweekend,IsHoliday,hourlyperiods,HourlyPrecipitation
0,2017-01-01,4,136,2017,1,1,0,52,6,1,0,1,0.0
1,2017-01-01,7,77,2017,1,1,0,52,6,1,0,1,0.0
2,2017-01-01,12,3,2017,1,1,0,52,6,1,0,1,0.0
3,2017-01-01,13,103,2017,1,1,0,52,6,1,0,1,0.0
4,2017-01-01,14,3,2017,1,1,0,52,6,1,0,1,0.0
5,2017-01-01,17,16,2017,1,1,0,52,6,1,0,1,0.0
6,2017-01-01,20,2,2017,1,1,0,52,6,1,0,1,0.0
7,2017-01-01,24,94,2017,1,1,0,52,6,1,0,1,0.0
8,2017-01-01,25,31,2017,1,1,0,52,6,1,0,1,0.0
9,2017-01-01,33,31,2017,1,1,0,52,6,1,0,1,0.0


In [44]:
# save csv
df_merge.to_csv('../data/Data_Cleaned_'+str(year)+'_To_Model.csv', index = False, header=True)