### Uber Ride Location Data 
**Apr - Sep 2014**
[(Source)](https://data.world/data-society/uber-pickups-in-nyc)

In [2]:
# Import module
import pandas as pd
import numpy as np

In [62]:
# Load datasets
apr = pd.read_csv('./uber_data/uber-raw-data-apr14.csv')
may = pd.read_csv('./uber_data/uber-raw-data-may14.csv')
jun = pd.read_csv('./uber_data/uber-raw-data-jun14.csv')
jul = pd.read_csv('./uber_data/uber-raw-data-jul14.csv')
aug = pd.read_csv('./uber_data/uber-raw-data-aug14.csv')
sep = pd.read_csv('./uber_data/uber-raw-data-sep14.csv')

In [63]:
nyc_geo = pd.read_csv('./clean_data/nyc_geo_latlon.csv') # neighbourhoods
nyc_geo.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,ll,Zipcode,ll_round
0,Bronx,Wakefield,40.894705,-73.847201,"40.89470517661,-73.84720052054902",10466.0,"40.895,-73.847"
1,Bronx,Co-op City,40.874294,-73.829939,"40.87429419303012,-73.82993910812398",10475.0,"40.874,-73.83"
2,Bronx,Eastchester,40.887556,-73.827806,"40.887555677350775,-73.82780644716412",10466.0,"40.888,-73.828"
3,Bronx,Fieldston,40.895437,-73.905643,"40.89543742690383,-73.90564259591682",10471.0,"40.895,-73.906"
4,Bronx,Riverdale,40.890834,-73.912585,"40.890834493891305,-73.9125854610857",10463.0,"40.891,-73.913"


In [64]:
# Calculate total rows
apr.shape[0] + may.shape[0] + jun.shape[0] + jul.shape[0] + aug.shape[0] + sep.shape[0]

4534327

In [65]:
# Join DataFrames together
months = [apr, may, jun, jul, aug, sep]
uber = pd.concat(months)

# Confirm same number of rows
uber.shape

(4534327, 4)

In [66]:
uber.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4534327 entries, 0 to 1028135
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Date/Time  object 
 1   Lat        float64
 2   Lon        float64
 3   Base       object 
dtypes: float64(2), object(2)
memory usage: 173.0+ MB


In [67]:
# checking for duplicates
uber[uber.duplicated()]

Unnamed: 0,Date/Time,Lat,Lon,Base
44,4/1/2014 5:44:00,40.7430,-74.0301,B02512
128,4/1/2014 7:25:00,40.7805,-73.9481,B02512
185,4/1/2014 8:25:00,40.7620,-73.9787,B02512
190,4/1/2014 8:29:00,40.6904,-74.1778,B02512
238,4/1/2014 9:49:00,40.7195,-74.0367,B02512
...,...,...,...,...
1027993,9/30/2014 22:25:00,40.6484,-73.7829,B02764
1028029,9/30/2014 22:32:00,40.7489,-73.9759,B02764
1028065,9/30/2014 22:39:00,40.7714,-73.9502,B02764
1028095,9/30/2014 22:46:00,40.7277,-73.9893,B02764


In [68]:
uber.drop_duplicates(inplace=True)

In [69]:
# Round lat and longitude columns
uber['Lat'] = round(uber['Lat'], 3)
uber['Lon'] = round(uber['Lon'], 3)
uber['LatLon'] = uber['Lat'].astype(str) + ',' + uber['Lon'].astype(str)

In [70]:
uber.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4451746 entries, 0 to 1028135
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Date/Time  object 
 1   Lat        float64
 2   Lon        float64
 3   Base       object 
 4   LatLon     object 
dtypes: float64(2), object(3)
memory usage: 203.8+ MB


In [90]:
# taking a sample based off of time
uber_sample = uber.groupby(['Date/Time']).sample(frac=.25, random_state=1)

In [91]:
# assigning correct Borough & Neighbourhood labels
num_rows = nyc_geo.shape[0]
row_count = 0

while row_count < num_rows:
    #uber_sample.loc[uber_sample['Zipcode'] ==nyc_geo['Zipcode'][row_count], 'Neighbourhood'] = nyc_geo['Neighbourhood'][row_count]
    uber_sample.loc[uber_sample['LatLon'] ==nyc_geo['ll_round'][row_count], 'Neighbourhood'] = nyc_geo['Neighbourhood'][row_count]
    #uber_sample.loc[uber_sample['Zipcode'] ==nyc_geo['Zipcode'][row_count], 'Borough'] = nyc_geo['Borough'][row_count]
    uber_sample.loc[uber_sample['LatLon'] ==nyc_geo['ll_round'][row_count], 'Borough'] = nyc_geo['Borough'][row_count]
    row_count +=1

In [94]:
# taking another sample based off of location
uber_sample = uber_sample.groupby(['Neighbourhood']).sample(frac=.25, random_state=1)

In [95]:
len(uber_sample['Neighbourhood'].unique())

83

In [96]:
uber_sample.drop(columns=['Lat','Lon','Base','LatLon'], inplace=True)
uber_sample.head()

Unnamed: 0,Date/Time,Neighbourhood,Borough
537109,7/27/2014 14:45:00,Astoria,Queens
618312,9/27/2014 21:38:00,Astoria,Queens
334271,5/10/2014 20:38:00,Battery Park City,Manhattan
492538,5/10/2014 11:48:00,Battery Park City,Manhattan
606179,9/26/2014 22:42:00,Battery Park City,Manhattan


In [97]:
# converting date to datetime object
uber_sample['Date/Time'] = pd.to_datetime(uber_sample['Date/Time'])

In [98]:
# all of our data is from 2014 so not creating a years column
uber_sample['Month'] = uber_sample['Date/Time'].dt.strftime('%m')
uber_sample['Day'] = uber_sample['Date/Time'].dt.strftime('%d')

In [99]:
uber_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2372 entries, 537109 to 46432
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date/Time      2372 non-null   datetime64[ns]
 1   Neighbourhood  2372 non-null   object        
 2   Borough        2372 non-null   object        
 3   Month          2372 non-null   object        
 4   Day            2372 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 111.2+ KB


In [100]:
# dropping null values and date/time column
uber_sample.drop(columns=['Date/Time'], inplace=True)
uber_sample.dropna(inplace=True)

In [101]:
uber_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2372 entries, 537109 to 46432
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Neighbourhood  2372 non-null   object
 1   Borough        2372 non-null   object
 2   Month          2372 non-null   object
 3   Day            2372 non-null   object
dtypes: object(4)
memory usage: 92.7+ KB


In [102]:
# making a new variable that just counts every call as a single ride
uber_sample['Ride Count'] = 1

In [103]:
borough_ubers = uber_sample.groupby(['Borough','Month']).sum()

In [104]:
neighbourhood_ubers = uber_sample.groupby(['Borough', 'Neighbourhood','Month']).sum()

In [105]:
borough_ubers

Unnamed: 0_level_0,Unnamed: 1_level_0,Ride Count
Borough,Month,Unnamed: 2_level_1
Bronx,4,2
Bronx,7,2
Brooklyn,4,23
Brooklyn,5,33
Brooklyn,6,27
Brooklyn,7,47
Brooklyn,8,59
Brooklyn,9,68
Manhattan,4,249
Manhattan,5,303


In [107]:
apr_days = 30
may_days = 31
jun_days = 30
jul_days = 31
aug_days = 31
sep_days = 30

In [108]:
# creating an array with the number of days per month
month_days = np.array([30, 31, 30, 31, 31, 30])

In [109]:
borough_ubers['Days/Mo'] = np.resize(month_days, borough_ubers.shape[0])

In [110]:
borough_ubers['Avg Rides/Day'] = (borough_ubers['Ride Count'] / borough_ubers['Days/Mo'])

In [111]:
borough_ubers

Unnamed: 0_level_0,Unnamed: 1_level_0,Ride Count,Days/Mo,Avg Rides/Day
Borough,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bronx,4,2,30,0.066667
Bronx,7,2,31,0.064516
Brooklyn,4,23,30,0.766667
Brooklyn,5,33,31,1.064516
Brooklyn,6,27,31,0.870968
Brooklyn,7,47,30,1.566667
Brooklyn,8,59,30,1.966667
Brooklyn,9,68,31,2.193548
Manhattan,4,249,30,8.3
Manhattan,5,303,31,9.774194


In [112]:
neighbourhood_ubers['Days/Mo'] = np.resize(month_days, neighbourhood_ubers.shape[0])
neighbourhood_ubers['Avg Rides/Day'] = (neighbourhood_ubers['Ride Count'] / neighbourhood_ubers['Days/Mo'])

In [113]:
borough_ubers

Unnamed: 0_level_0,Unnamed: 1_level_0,Ride Count,Days/Mo,Avg Rides/Day
Borough,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bronx,4,2,30,0.066667
Bronx,7,2,31,0.064516
Brooklyn,4,23,30,0.766667
Brooklyn,5,33,31,1.064516
Brooklyn,6,27,31,0.870968
Brooklyn,7,47,30,1.566667
Brooklyn,8,59,30,1.966667
Brooklyn,9,68,31,2.193548
Manhattan,4,249,30,8.3
Manhattan,5,303,31,9.774194


In [114]:
neighbourhood_ubers

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Ride Count,Days/Mo,Avg Rides/Day
Borough,Neighbourhood,Month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bronx,Belmont,07,2,30,0.066667
Bronx,Mott Haven,04,1,31,0.032258
Bronx,Parkchester,04,1,30,0.033333
Brooklyn,Boerum Hill,04,3,31,0.096774
Brooklyn,Boerum Hill,05,1,31,0.032258
...,...,...,...,...,...
Queens,Sunnyside Gardens,04,1,31,0.032258
Queens,Sunnyside Gardens,06,1,30,0.033333
Queens,Sunnyside Gardens,07,2,31,0.064516
Queens,Woodside,09,1,31,0.032258


In [115]:
uber_sample

Unnamed: 0,Neighbourhood,Borough,Month,Day,Ride Count
537109,Astoria,Queens,07,27,1
618312,Astoria,Queens,09,27,1
334271,Battery Park City,Manhattan,05,10,1
492538,Battery Park City,Manhattan,05,10,1
606179,Battery Park City,Manhattan,09,26,1
...,...,...,...,...,...
860623,Yorkville,Manhattan,09,04,1
713248,Yorkville,Manhattan,08,19,1
591598,Yorkville,Manhattan,05,23,1
370908,Yorkville,Manhattan,06,18,1


In [116]:
# exporting csv
uber_sample.to_csv('uber_sample.csv', index=False)
borough_ubers.to_csv('uber_boroughs.csv', index=True)
neighbourhood_ubers.to_csv('uber_neighbourhoods.csv', index=True)