### Uber Ride Location Data 
**Apr - Sep 2014**
[(Source)](https://data.world/data-society/uber-pickups-in-nyc)

In [374]:
# Import module
import pandas as pd
import numpy as np

In [375]:
# Load datasets
apr = pd.read_csv('./uber_data/uber-raw-data-apr14.csv')
may = pd.read_csv('./uber_data/uber-raw-data-may14.csv')
jun = pd.read_csv('./uber_data/uber-raw-data-jun14.csv')
jul = pd.read_csv('./uber_data/uber-raw-data-jul14.csv')
aug = pd.read_csv('./uber_data/uber-raw-data-aug14.csv')
sep = pd.read_csv('./uber_data/uber-raw-data-sep14.csv')

In [147]:
# importing borough & neighbourhood dataframes
nyc_bo = pd.read_csv('./clean_data/nyc_boroughs.csv') # boroughs
nyc_nb = pd.read_csv('./clean_data/nyc_neighbs.csv',encoding='cp1252') # neighbourhoods

In [148]:
# converting numbers to strings so we can search using them as our substring value
nyc_bo = nyc_bo.astype(str)
# removing trailing '.0'
nyc_bo = nyc_bo.stack().str.replace(r'\.0','').unstack()
# converting numbers to strings so we can search using them as our substring value
nyc_nb = nyc_nb.astype(str)
# removing trailing '.0'
nyc_nb = nyc_nb.stack().str.replace(r'\.0','').unstack()

  nyc_bo = nyc_bo.stack().str.replace(r'\.0','').unstack()
  nyc_nb = nyc_nb.stack().str.replace(r'\.0','').unstack()


In [376]:
# Calculate total rows
apr.shape[0] + may.shape[0] + jun.shape[0] + jul.shape[0] + aug.shape[0] + sep.shape[0]

4534327

In [377]:
# Join DataFrames together
months = [apr, may, jun, jul, aug, sep]
uber = pd.concat(months)

# Confirm same number of rows
uber.shape

(4534327, 4)

In [378]:
uber.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4534327 entries, 0 to 1028135
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Date/Time  object 
 1   Lat        float64
 2   Lon        float64
 3   Base       object 
dtypes: float64(2), object(2)
memory usage: 173.0+ MB


In [379]:
# checking for duplicates
uber[uber.duplicated()]

Unnamed: 0,Date/Time,Lat,Lon,Base
44,4/1/2014 5:44:00,40.7430,-74.0301,B02512
128,4/1/2014 7:25:00,40.7805,-73.9481,B02512
185,4/1/2014 8:25:00,40.7620,-73.9787,B02512
190,4/1/2014 8:29:00,40.6904,-74.1778,B02512
238,4/1/2014 9:49:00,40.7195,-74.0367,B02512
...,...,...,...,...
1027993,9/30/2014 22:25:00,40.6484,-73.7829,B02764
1028029,9/30/2014 22:32:00,40.7489,-73.9759,B02764
1028065,9/30/2014 22:39:00,40.7714,-73.9502,B02764
1028095,9/30/2014 22:46:00,40.7277,-73.9893,B02764


In [380]:
uber.drop_duplicates(inplace=True)

In [381]:
# Round lat and longitude columns
uber['Lat'] = round(uber['Lat'], 3)
uber['Lon'] = round(uber['Lon'], 3)
uber['LatLon'] = uber['Lat'].astype(str) + ',' + uber['Lon'].astype(str)

In [382]:
uber.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4451746 entries, 0 to 1028135
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Date/Time  object 
 1   Lat        float64
 2   Lon        float64
 3   Base       object 
 4   LatLon     object 
dtypes: float64(2), object(3)
memory usage: 203.8+ MB


In [383]:
# taking a sample based off of time
uber_sample = uber.groupby(['Date/Time']).sample(frac=.05, random_state=1)

In [384]:
# taking another sample based off of location
uber_sample = uber_sample.groupby(['LatLon']).sample(frac=.1, random_state=1)

In [385]:
uber_sample.groupby(['LatLon']).count()

Unnamed: 0_level_0,Date/Time,Lat,Lon,Base
LatLon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"40.575,-73.984",1,1,1,1
"40.576,-73.981",1,1,1,1
"40.617,-74.021",1,1,1,1
"40.618,-74.03",1,1,1,1
"40.622,-74.032",1,1,1,1
...,...,...,...,...
"40.89,-73.906",2,2,2,2
"40.893,-73.9",3,3,3,3
"40.9,-73.901",2,2,2,2
"41.023,-73.713",1,1,1,1


In [386]:
uber_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19499 entries, 118400 to 222681
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date/Time  19499 non-null  object 
 1   Lat        19499 non-null  float64
 2   Lon        19499 non-null  float64
 3   Base       19499 non-null  object 
 4   LatLon     19499 non-null  object 
dtypes: float64(2), object(3)
memory usage: 914.0+ KB


In [387]:
latlon = uber_sample.groupby(['LatLon']).count()

In [388]:
latlon.reset_index(drop = False, inplace=True)

In [389]:
latlon

Unnamed: 0,LatLon,Date/Time,Lat,Lon,Base
0,"40.575,-73.984",1,1,1,1
1,"40.576,-73.981",1,1,1,1
2,"40.617,-74.021",1,1,1,1
3,"40.618,-74.03",1,1,1,1
4,"40.622,-74.032",1,1,1,1
...,...,...,...,...,...
4793,"40.89,-73.906",2,2,2,2
4794,"40.893,-73.9",3,3,3,3
4795,"40.9,-73.901",2,2,2,2
4796,"41.023,-73.713",1,1,1,1


In [390]:
latlon = latlon['LatLon'].str.split(',', expand=True)

In [391]:
latlon.rename(columns={0:'Latitude', 1:'Longitude'}, inplace=True)

In [392]:
latlon

Unnamed: 0,Latitude,Longitude
0,40.575,-73.984
1,40.576,-73.981
2,40.617,-74.021
3,40.618,-74.03
4,40.622,-74.032
...,...,...
4793,40.89,-73.906
4794,40.893,-73.9
4795,40.9,-73.901
4796,41.023,-73.713


In [166]:
# saving to csv
latlon.to_csv('uber_latlon.csv', index=False)

In [167]:
zipcodes = pd.read_csv('uber_addresses.csv')

In [393]:
zipcodes

Unnamed: 0,Latitude,Longitude,Zipcode,LatLon,Borough,Neighbourhood
0,40.575,-73.984,11224,"40.575,-73.984",Brooklyn,Southern Brooklyn
1,40.576,-73.981,11224,"40.576,-73.981",Brooklyn,Southern Brooklyn
2,40.617,-74.021,11228,"40.617,-74.021",Brooklyn,Southwest Brooklyn
3,40.618,-74.030,11209,"40.618,-74.03",Brooklyn,Southwest Brooklyn
4,40.622,-74.032,11209,"40.622,-74.032",Brooklyn,Southwest Brooklyn
...,...,...,...,...,...,...
4793,40.890,-73.906,10471,"40.89,-73.906",Bronx,Kingsbridge and Riverdale
4794,40.893,-73.900,10471,"40.893,-73.9",Bronx,Kingsbridge and Riverdale
4795,40.900,-73.901,11201,"40.9,-73.901",Brooklyn,Northwest Brooklyn
4796,41.023,-73.713,0,"41.023,-73.713",,


In [394]:
zipcodes

Unnamed: 0,Latitude,Longitude,Zipcode,LatLon,Borough,Neighbourhood
0,40.575,-73.984,11224,"40.575,-73.984",Brooklyn,Southern Brooklyn
1,40.576,-73.981,11224,"40.576,-73.981",Brooklyn,Southern Brooklyn
2,40.617,-74.021,11228,"40.617,-74.021",Brooklyn,Southwest Brooklyn
3,40.618,-74.030,11209,"40.618,-74.03",Brooklyn,Southwest Brooklyn
4,40.622,-74.032,11209,"40.622,-74.032",Brooklyn,Southwest Brooklyn
...,...,...,...,...,...,...
4793,40.890,-73.906,10471,"40.89,-73.906",Bronx,Kingsbridge and Riverdale
4794,40.893,-73.900,10471,"40.893,-73.9",Bronx,Kingsbridge and Riverdale
4795,40.900,-73.901,11201,"40.9,-73.901",Brooklyn,Northwest Brooklyn
4796,41.023,-73.713,0,"41.023,-73.713",,


In [395]:
zipcodes['Zipcode'].fillna(0, inplace=True)

In [396]:
pd.to_numeric(zipcodes['Zipcode'], downcast="float")

ValueError: Unable to parse string "FK14 7DG" at position 950

In [398]:
# running into a lot of floats, so just going to change our function to look for instances where a string contains our zipcode

In [399]:
zipcodes['Zipcode'] = zipcodes['Zipcode'].astype(str)

In [400]:
nyc_bo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 453 entries, 0 to 452
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Manhattan      453 non-null    object
 1   Staten Island  453 non-null    object
 2   Bronx          453 non-null    object
 3   Queens         453 non-null    object
 4   Brooklyn       453 non-null    object
dtypes: object(5)
memory usage: 37.4+ KB


In [401]:
# assigning neighbourhoods & boroughs to every zipcode

In [402]:
# creating lists of borough & neighbourhood names
bo_list = nyc_bo.columns.tolist()
nb_list = nyc_nb.columns.tolist()

In [403]:
# want to make sure we have consistent borough & neighbourhood info
def assign_borough(borough):
    """ Finds borough name for address dataframe.
    
    Loops through the nyc_bo (nyc boroughs) dataframe to see if one of its zipcodes 
    is within the address string of our addresses dataframe. Then fills in borough name
    
    Params:
        borough: desired borough (column from nyc_bo dataframe)
        
    Returns:
        fills 'Borough' value with column name from nyc_bo
    
    """
    num_rows = nyc_bo.shape[0]
    row_count = 0

    while row_count < num_rows:
        zipcodes.loc[zipcodes['Zipcode'].str.contains(nyc_bo[borough][row_count]), 'Borough'] = borough        
        row_count +=1

In [404]:
# loop to run borough assignment
"""
iterates through list of borough names & inserts them runs assign_borough function every value
"""
for i in range(len(bo_list)):
    assign_borough(bo_list[i])

In [405]:
# want to make sure we have consistent borough & neighbourhood info
def assign_neighbourhood(neighbourhood):
    """ Finds neighbourhood name for address dataframe.
    
    Loops through the nyc_bo (nyc boroughs) dataframe to see if one of its zipcodes 
    is within the address string of our addresses dataframe. Then fills in borough name
    
    Params:
        borough: desired neighbourhood (column from nyc_bo dataframe)
        
    Returns:
        fills 'Neighbourhood' value with column name from nyc_nb
    
    """
    num_rows = nyc_nb.shape[0]
    row_count = 0

    while row_count < num_rows:
        zipcodes.loc[zipcodes['Zipcode'].str.contains(nyc_nb[neighbourhood][row_count]), 'Neighbourhood'] = neighbourhood  
                        
        row_count +=1

In [406]:
# loop to run neighbourhood assignment
"""
iterates through list of neighbourhood names & inserts them runs assign_borough function every value
"""
for i in range(len(nb_list)):
    assign_neighbourhood(nb_list[i])

In [407]:
zipcodes['LatLon'] = zipcodes['Latitude'].astype(str) + ',' + zipcodes['Longitude'].astype(str)

In [408]:
zipcodes.head()

Unnamed: 0,Latitude,Longitude,Zipcode,LatLon,Borough,Neighbourhood
0,40.575,-73.984,11224,"40.575,-73.984",Brooklyn,Southern Brooklyn
1,40.576,-73.981,11224,"40.576,-73.981",Brooklyn,Southern Brooklyn
2,40.617,-74.021,11228,"40.617,-74.021",Brooklyn,Southwest Brooklyn
3,40.618,-74.03,11209,"40.618,-74.03",Brooklyn,Southwest Brooklyn
4,40.622,-74.032,11209,"40.622,-74.032",Brooklyn,Southwest Brooklyn


In [409]:
# time to fill in the missing geographic locations in our uber data!

In [410]:
# trying to see if i can do this without a loop

In [411]:
uber_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19499 entries, 118400 to 222681
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date/Time  19499 non-null  object 
 1   Lat        19499 non-null  float64
 2   Lon        19499 non-null  float64
 3   Base       19499 non-null  object 
 4   LatLon     19499 non-null  object 
dtypes: float64(2), object(3)
memory usage: 914.0+ KB


In [412]:
# doing a left merge on 'LatLon'
uber_sample = uber_sample.merge(zipcodes, how='left', on='LatLon')

In [413]:
uber_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,LatLon,Latitude,Longitude,Zipcode,Borough,Neighbourhood
0,7/12/2014 19:27:00,40.575,-73.984,B02598,"40.575,-73.984",40.575,-73.984,11224,Brooklyn,Southern Brooklyn
1,9/7/2014 12:37:00,40.576,-73.981,B02617,"40.576,-73.981",40.576,-73.981,11224,Brooklyn,Southern Brooklyn
2,8/28/2014 15:20:00,40.617,-74.021,B02598,"40.617,-74.021",40.617,-74.021,11228,Brooklyn,Southwest Brooklyn
3,9/26/2014 11:06:00,40.618,-74.03,B02598,"40.618,-74.03",40.618,-74.03,11209,Brooklyn,Southwest Brooklyn
4,4/12/2014 1:13:00,40.622,-74.032,B02682,"40.622,-74.032",40.622,-74.032,11209,Brooklyn,Southwest Brooklyn


In [414]:
uber_sample.drop(columns=['Lat','Lon','Base','LatLon'], inplace=True)
uber_sample.head()

Unnamed: 0,Date/Time,Latitude,Longitude,Zipcode,Borough,Neighbourhood
0,7/12/2014 19:27:00,40.575,-73.984,11224,Brooklyn,Southern Brooklyn
1,9/7/2014 12:37:00,40.576,-73.981,11224,Brooklyn,Southern Brooklyn
2,8/28/2014 15:20:00,40.617,-74.021,11228,Brooklyn,Southwest Brooklyn
3,9/26/2014 11:06:00,40.618,-74.03,11209,Brooklyn,Southwest Brooklyn
4,4/12/2014 1:13:00,40.622,-74.032,11209,Brooklyn,Southwest Brooklyn


In [415]:
# converting date to datetime object
uber_sample['Date/Time'] = pd.to_datetime(uber_sample['Date/Time'])

In [416]:
# all of our data is from 2014 so not creating a years column
uber_sample['Month'] = uber_sample['Date/Time'].dt.strftime('%m')
uber_sample['Day'] = uber_sample['Date/Time'].dt.strftime('%d')

In [417]:
uber_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19499 entries, 0 to 19498
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date/Time      19499 non-null  datetime64[ns]
 1   Latitude       19499 non-null  float64       
 2   Longitude      19499 non-null  float64       
 3   Zipcode        19499 non-null  object        
 4   Borough        19185 non-null  object        
 5   Neighbourhood  17581 non-null  object        
 6   Month          19499 non-null  object        
 7   Day            19499 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(5)
memory usage: 1.3+ MB


In [418]:
# dropping null values and date/time column
uber_sample.drop(columns=['Date/Time'], inplace=True)
uber_sample.dropna(inplace=True)

In [419]:
uber_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17581 entries, 0 to 19495
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Latitude       17581 non-null  float64
 1   Longitude      17581 non-null  float64
 2   Zipcode        17581 non-null  object 
 3   Borough        17581 non-null  object 
 4   Neighbourhood  17581 non-null  object 
 5   Month          17581 non-null  object 
 6   Day            17581 non-null  object 
dtypes: float64(2), object(5)
memory usage: 1.1+ MB


In [420]:
# making a new variable that just counts every call as a single ride
uber_sample['Ride Count'] = 1

In [421]:
borough_ubers = uber_sample.groupby(['Borough','Month']).sum()

In [422]:
neighbourhood_ubers = uber_sample.groupby(['Borough', 'Neighbourhood','Month']).sum()

In [423]:
borough_ubers

Unnamed: 0_level_0,Unnamed: 1_level_0,Latitude,Longitude,Ride Count
Borough,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bronx,4,40.828,-73.926,1
Bronx,5,245.07,-443.479,6
Bronx,6,204.269,-369.589,5
Bronx,7,285.906,-517.379,7
Bronx,8,122.488,-221.78,3
Bronx,9,490.207,-886.997,12
Brooklyn,4,7529.351,-13684.142,185
Brooklyn,5,8261.208,-15016.151,203
Brooklyn,6,9685.21,-17604.744,238
Brooklyn,7,13958.555,-25371.346,343


In [424]:
# these are just counts, so every value is the same. just chose Latitude because it's first
borough_ubers.drop(columns=['Latitude','Longitude'], inplace=True)
neighbourhood_ubers.drop(columns=['Latitude','Longitude'], inplace=True)

In [425]:
apr_days = 30
may_days = 31
jun_days = 30
jul_days = 31
aug_days = 31
sep_days = 30

In [426]:
# creating an array with the number of days per month
month_days = np.array([30, 31, 30, 31, 31, 30])

In [427]:
borough_ubers['Days/Mo'] = np.resize(month_days, borough_ubers.shape[0])

In [428]:
borough_ubers['Avg Rides/Day'] = (borough_ubers['Ride Count'] / borough_ubers['Days/Mo'])

In [429]:
borough_ubers

Unnamed: 0_level_0,Unnamed: 1_level_0,Ride Count,Days/Mo,Avg Rides/Day
Borough,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bronx,4,1,30,0.033333
Bronx,5,6,31,0.193548
Bronx,6,5,30,0.166667
Bronx,7,7,31,0.225806
Bronx,8,3,31,0.096774
Bronx,9,12,30,0.4
Brooklyn,4,185,30,6.166667
Brooklyn,5,203,31,6.548387
Brooklyn,6,238,30,7.933333
Brooklyn,7,343,31,11.064516


In [430]:
neighbourhood_ubers['Days/Mo'] = np.resize(month_days, neighbourhood_ubers.shape[0])
neighbourhood_ubers['Avg Rides/Day'] = (neighbourhood_ubers['Ride Count'] / neighbourhood_ubers['Days/Mo'])

In [433]:
borough_ubers

Unnamed: 0_level_0,Unnamed: 1_level_0,Ride Count,Days/Mo,Avg Rides/Day
Borough,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bronx,4,1,30,0.033333
Bronx,5,6,31,0.193548
Bronx,6,5,30,0.166667
Bronx,7,7,31,0.225806
Bronx,8,3,31,0.096774
Bronx,9,12,30,0.4
Brooklyn,4,185,30,6.166667
Brooklyn,5,203,31,6.548387
Brooklyn,6,238,30,7.933333
Brooklyn,7,343,31,11.064516


In [431]:
neighbourhood_ubers

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Ride Count,Days/Mo,Avg Rides/Day
Borough,Neighbourhood,Month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bronx,Bronx Park and Fordham,07,1,30,0.033333
Bronx,Bronx Park and Fordham,09,1,31,0.032258
Bronx,Crotona and Tremont,07,1,30,0.033333
Bronx,HighBridge and Morrisania,04,1,31,0.032258
Bronx,HighBridge and Morrisania,05,3,31,0.096774
...,...,...,...,...,...
Staten Island,Stapleton and St. George,05,2,31,0.064516
Staten Island,Stapleton and St. George,06,2,30,0.066667
Staten Island,Stapleton and St. George,07,2,31,0.064516
Staten Island,Stapleton and St. George,08,5,31,0.161290


In [432]:
uber_sample

Unnamed: 0,Latitude,Longitude,Zipcode,Borough,Neighbourhood,Month,Day,Ride Count
0,40.575,-73.984,11224,Brooklyn,Southern Brooklyn,07,12,1
1,40.576,-73.981,11224,Brooklyn,Southern Brooklyn,09,07,1
2,40.617,-74.021,11228,Brooklyn,Southwest Brooklyn,08,28,1
3,40.618,-74.030,11209,Brooklyn,Southwest Brooklyn,09,26,1
4,40.622,-74.032,11209,Brooklyn,Southwest Brooklyn,04,12,1
...,...,...,...,...,...,...,...,...
19491,40.893,-73.900,10471,Bronx,Kingsbridge and Riverdale,07,01,1
19492,40.893,-73.900,10471,Bronx,Kingsbridge and Riverdale,05,15,1
19493,40.893,-73.900,10471,Bronx,Kingsbridge and Riverdale,09,08,1
19494,40.900,-73.901,11201,Brooklyn,Northwest Brooklyn,04,22,1


In [436]:
# exporting csv
uber_sample.to_csv('uber_sample.csv', index=False)
borough_ubers.to_csv('uber_boroughs.csv', index=True)
neighbourhood_ubers.to_csv('uber_neighbourhoods.csv', index=True)