QUICK NOTES: convert month, day, hour, etc. to object dtype??

 # This notebook is to join both datasets: Taxis and Weather

In [2]:
import pandas as pd
import numpy as np
import random
from datetime import date
import matplotlib.pyplot as plt
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Let´s start by joining just one year: 2017

### Get Manhatan Zones DataFrame

In [17]:
# 1. Import Location and Borough columns form NY TAXI ZONES dataset
dfzones = pd.read_csv('../data/NY_taxi_zones.csv', sep=',',
                      usecols=['LocationID', 'borough'])

dfzones.rename(columns={'LocationID': 'PULocationID'}, inplace=True)

# 2. Filter Manhattan zones
dfzones = dfzones[dfzones['borough']=='Manhattan']\
                .drop(['borough'], axis=1)\
                .sort_values(by='PULocationID')\
                .drop_duplicates('PULocationID').reset_index(drop=True)

#dfzones.PULocationID = dfzones.PULocationID.astype(str)

dfzones = pd.concat([dfzones]*8760).reset_index(drop=True)
print(dfzones.shape)
dfzones.head()
#dfManZon_array = dfManZon.iloc[:,0].values
#dfManZon_array
# There are 67 zones in Manhattan

(586920, 1)


Unnamed: 0,PULocationID
0,4
1,12
2,13
3,24
4,41


### Create Data Frame index and Attach Zones

In [18]:
a = pd.period_range(start='2017-01-01', end='2017-12-31T23:00', freq='H')
df_index = pd.DataFrame({'datetime':a})

df_index['month'] = df_index['datetime'].dt.month
df_index['day'] = df_index['datetime'].dt.day
df_index['hour'] = df_index['datetime'].dt.hour
df_index = df_index.drop(columns=['datetime'],inplace=False)
df_index = df_index.iloc[np.arange(len(df_index)).repeat(67)].reset_index(drop=True)
df_index['LocationID'] = dfzones['PULocationID']
print(df_index.shape)
df_index.head(10)

(586920, 4)


Unnamed: 0,month,day,hour,LocationID
0,1,1,0,4
1,1,1,0,12
2,1,1,0,13
3,1,1,0,24
4,1,1,0,41
5,1,1,0,42
6,1,1,0,43
7,1,1,0,45
8,1,1,0,48
9,1,1,0,50


## Taxis Dataset: Import & Sanity check:
- Confirm that the year is correct.
- Confirm that hourly periods count is correct

In [19]:
year = 2017
dftax = pd.read_csv('../data/Data_Taxis_'+str(year)+'_Cleaned.csv', sep=',',
                        #dtype = {"PULocationID" : "object"},
                        parse_dates={'datetime':['pickup_datetime']})
print('Year should be unique: ', dftax.year.unique())
dftax = dftax.rename(columns={'PULocationID':'LocationID'}) 
dftax['hourlyperiods'] = 1
h = dftax.groupby(['month','day','hour'])['hourlyperiods'].sum()
print('There should be 8760 hourly periods in a year: ', h.shape[0])
print(dftax.shape)
dftax.head()

Year should be unique:  [2017]
There should be 8760 hourly periods in a year:  8760
(416922, 12)


Unnamed: 0,datetime,LocationID,NoOfPickups,year,month,day,hour,week,dayofweek,isweekend,IsHoliday,hourlyperiods
0,2017-01-01,12,3,2017,1,1,0,52,6,1,0,1
1,2017-01-01,13,103,2017,1,1,0,52,6,1,0,1
2,2017-01-01,40,10,2017,1,1,0,52,6,1,0,1
3,2017-01-01,41,136,2017,1,1,0,52,6,1,0,1
4,2017-01-01,43,401,2017,1,1,0,52,6,1,0,1


### Taxis  Prepare data frame for JOIN

In [20]:
dftax = dftax.drop(columns=['datetime', 'year', 'hourlyperiods'],inplace=False)

dftax.head()

Unnamed: 0,LocationID,NoOfPickups,month,day,hour,week,dayofweek,isweekend,IsHoliday
0,12,3,1,1,0,52,6,1,0
1,13,103,1,1,0,52,6,1,0
2,40,10,1,1,0,52,6,1,0
3,41,136,1,1,0,52,6,1,0
4,43,401,1,1,0,52,6,1,0


### Check that both indexes are the same

In [31]:
tax_m = dftax.groupby(['month']).count()
tax_d = dftax.groupby(['month', 'day']).count()
tax_h = dftax.groupby(['month', 'day','hour']).count()
ind_m = df_index.groupby(['month']).count()
ind_d = df_index.groupby(['month','day']).count()
ind_h = df_index.groupby(['month','day','hour']).count()
ind_z = df_index.groupby(['month','day','hour','LocationID']).count()


print('12 MONTHS:',tax_m.shape[0], ind_m.shape[0])
print('365 DAYS:',tax_d.shape[0], ind_d.shape[0])
print('8760 HOURS:',tax_h.shape[0], ind_h.shape[0])
print('LocationID:', ind_z.shape[0])

12 MONTHS: 12 12
365 DAYS: 365 365
8760 HOURS: 8760 8760
LocationID: 586920


In [35]:
dftax_g = dftax.groupby(['month','day','hour','LocationID']).sum()
df_index_g = df_index.groupby(['month','day','hour','LocationID']).sum()
print(dftax_g.shape)
print(df_index_g.shape)

(416922, 5)
(586920, 0)


In [39]:
print(dftax_g.shape)
display(dftax_g.head())
print(df_index_g.shape)
display(df_index_g.head())

(416922, 5)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NoOfPickups,week,dayofweek,isweekend,IsHoliday
month,day,hour,LocationID,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,12,3,52,6,1,0
1,1,0,13,103,52,6,1,0
1,1,0,40,10,52,6,1,0
1,1,0,41,136,52,6,1,0
1,1,0,43,401,52,6,1,0


(586920, 0)


month,day,hour,LocationID
1,1,0,4
1,1,0,12
1,1,0,13
1,1,0,24
1,1,0,41


In [40]:
taxis_d_final = dftax_g.join(df_index_g, how='outer').reset_index()

In [50]:
pd.unique(taxis_d_final['LocationID']).shape

(93,)

In [46]:
taxis_d_final_g = taxis_d_final.groupby(['month','day','hour','LocationID']).count()
taxis_d_final_g.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,NoOfPickups,week,dayofweek,isweekend,IsHoliday
month,day,hour,LocationID,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,4,0,0,0,0,0
1,1,0,12,1,1,1,1,1
1,1,0,13,1,1,1,1,1
1,1,0,24,0,0,0,0,0
1,1,0,40,1,1,1,1,1


In [43]:
print(taxis_d_final.shape)
taxis_d_final.head()

(671772, 9)


Unnamed: 0,month,day,hour,LocationID,NoOfPickups,week,dayofweek,isweekend,IsHoliday
0,1,1,0,4,,,,,
1,1,1,0,12,3.0,52.0,6.0,1.0,0.0
2,1,1,0,13,103.0,52.0,6.0,1.0,0.0
3,1,1,0,24,,,,,
4,1,1,0,40,10.0,52.0,6.0,1.0,0.0


In [40]:
taxis_d_final.shape

(671772, 9)

In [1]:
taxis_d_final = taxis_d_final.groupby(['month','day','hour','LocationID']).sum()
taxis_d_final.head()

NameError: name 'taxis_d_final' is not defined

## Taxis Dataset: Resample LocationIDs
#### For each hour there should be as many rows as LocationIDs

Looking at the Taxis Dataset the hourly periods are missing some LocationIDs

In [3]:
test = dftax.groupby(['hour', 'PULocationID']).sum()
test['countLoc']=1
test.groupby(['hour'])['countLoc'].sum()
# There should be 67 LocationIDs per hour so I need to resample.

hour
0     66
1     65
2     66
3     66
4     65
5     65
6     66
7     64
8     64
9     66
10    65
11    65
12    64
13    65
14    65
15    66
16    65
17    65
18    65
19    66
20    65
21    65
22    65
23    65
Name: countLoc, dtype: int64

## Weather Dataset: Import one year and Sanity check

In [21]:
# Filter one year data

In [4]:
# Import WEATHER DATASET to dataframe.
dfwea = pd.read_csv('../data/Data_Weather_Cleaned.csv', sep=',',
                        parse_dates={'datetime':['DATE']})

# Filter one year data
dfwea.drop(dfwea[dfwea['datetime'] < pd.Timestamp(date(year,1,1))].index, inplace=True)
dfwea.drop(dfwea[dfwea['datetime'] >= pd.Timestamp(date(year+1,1,1))].index, inplace=True)

# Sanity check
print('Year should be unique: ', dfwea.datetime.dt.year.unique())
print('There should be 8760 hourly periods in a year: ', dfwea.shape[0])

dfwea.head()

Year should be unique:  [2017]
There should be 8760 hourly periods in a year:  8760


Unnamed: 0,datetime,HourlyPrecipitation
0,2017-01-01 00:00:00,0.0
1,2017-01-01 01:00:00,0.0
2,2017-01-01 02:00:00,0.0
3,2017-01-01 03:00:00,0.0
4,2017-01-01 04:00:00,0.0


## Weather vs Taxis Danity Check

In [5]:
tyear = dftax.year.unique()
wyear = dfwea.datetime.dt.year.unique()
print('Taxis year ({0}) equals Weather year({1})? {2}'.format(tyear,wyear,tyear==wyear))
print('Taxis rows ({0}) equals Weather rows ({1})? {2}'.format(h.shape[0],dfwea.shape[0],h.shape[0]==dfwea.shape[0]))

Taxis year ([2017]) equals Weather year([2017])? [ True]
Taxis rows (8760) equals Weather rows (8760)? True


## Merge Taxis and Weather datasets

In [29]:
dftax.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1036854 entries, 0 to 1036853
Data columns (total 12 columns):
pickup_datetime    1036854 non-null datetime64[ns]
PULocationID       1036854 non-null object
NoOfPickups        1036854 non-null int64
year               1036854 non-null int64
month              1036854 non-null int64
day                1036854 non-null int64
hour               1036854 non-null int64
week               1036854 non-null int64
dayofweek          1036854 non-null int64
isweekend          1036854 non-null int64
IsHoliday          1036854 non-null int64
hourlyperiods      1036854 non-null int64
dtypes: datetime64[ns](1), int64(10), object(1)
memory usage: 94.9+ MB


In [30]:
dfwea.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8760 entries, 0 to 8759
Data columns (total 2 columns):
datetime               8760 non-null datetime64[ns]
HourlyPrecipitation    8760 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 205.3 KB


In [41]:
df_merge = pd.merge(dftax, dfwea, on='datetime')
# taxis dataframe and merged dataframe should have same number of rows
print(df_merge.shape[0], dftax.shape[0])
df_merge.head(100)

1036854 1036854


Unnamed: 0,datetime,PULocationID,NoOfPickups,year,month,day,hour,week,dayofweek,isweekend,IsHoliday,hourlyperiods,HourlyPrecipitation
0,2017-01-01,4,136,2017,1,1,0,52,6,1,0,1,0.0
1,2017-01-01,7,77,2017,1,1,0,52,6,1,0,1,0.0
2,2017-01-01,12,3,2017,1,1,0,52,6,1,0,1,0.0
3,2017-01-01,13,103,2017,1,1,0,52,6,1,0,1,0.0
4,2017-01-01,14,3,2017,1,1,0,52,6,1,0,1,0.0
5,2017-01-01,17,16,2017,1,1,0,52,6,1,0,1,0.0
6,2017-01-01,20,2,2017,1,1,0,52,6,1,0,1,0.0
7,2017-01-01,24,94,2017,1,1,0,52,6,1,0,1,0.0
8,2017-01-01,25,31,2017,1,1,0,52,6,1,0,1,0.0
9,2017-01-01,33,31,2017,1,1,0,52,6,1,0,1,0.0


In [44]:
# save csv
df_merge.to_csv('../data/Data_Cleaned_'+str(year)+'_To_Model.csv', index = False, header=True)