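### Uber Ride Location Data
**Apr – Sep 2014**
[(Source)](https://data.world/data-society/uber-pickups-in-nyc)

This notebook combines the six monthly NYC Uber pickup files, rounds each pickup's latitude and longitude to two decimal places, and exports the total and average monthly ride counts per rounded coordinate to `uber_rides.csv`.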

In [29]:
# Import module
import pandas as pd
import numpy as np

In [15]:
# Read the six monthly pickup files (April through September 2014),
# one DataFrame per month, in calendar order.
csv_paths = (
    'uber_data/uber-raw-data-apr14.csv',
    'uber_data/uber-raw-data-may14.csv',
    'uber_data/uber-raw-data-jun14.csv',
    'uber_data/uber-raw-data-jul14.csv',
    'uber_data/uber-raw-data-aug14.csv',
    'uber_data/uber-raw-data-sep14.csv',
)
apr, may, jun, jul, aug, sep = map(pd.read_csv, csv_paths)

In [16]:
# Total number of rows across the six monthly frames, used as the
# reference figure for the row-count checks further down.
sum(frame.shape[0] for frame in (apr, may, jun, jul, aug, sep))

4534327

In [17]:
# Join DataFrames together.
# `months` stays a named list because the averaging step later divides
# by len(months).
months = [apr, may, jun, jul, aug, sep]
uber = pd.concat(months)

# Confirm same number of rows: fail loudly if concatenation gained or lost
# rows, instead of relying on comparing two printed numbers by eye.
assert len(uber) == sum(len(month) for month in months), (
    "row count changed during concat"
)
uber.shape

(4534327, 4)

In [18]:
# Print the index range, column names, dtypes and memory usage of the combined frame
uber.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4534327 entries, 0 to 1028135
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Date/Time  object 
 1   Lat        float64
 2   Lon        float64
 3   Base       object 
dtypes: float64(2), object(2)
memory usage: 173.0+ MB


In [19]:
# Drop the timestamp column, which the location aggregation below does not use.
# Rebinding (rather than inplace=True) together with errors='ignore' makes this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
uber = uber.drop(columns=['Date/Time'], errors='ignore')

In [20]:
# Round latitude and longitude to two decimal places so nearby pickups
# share the same value
uber['lat'] = uber['Lat'].round(2)
uber['lon'] = uber['Lon'].round(2)

# Build a "lat, lon" text key from the rounded values
lat_text = uber['lat'].astype(str)
lon_text = uber['lon'].astype(str)
uber['coords'] = lat_text + ', ' + lon_text

In [21]:
# Count how many rows fall on each rounded coordinate key
uber_count = uber['coords'].value_counts()

# Turn the counts into a two-column table: the key and its ride total
uber_rides = uber_count.rename_axis('Coordinates').reset_index(name='Sum_Rides')

# Average rides per month for each coordinate (total divided by the number
# of monthly frames), rounded to two decimals
uber_rides['Avg_Rides'] = (uber_rides['Sum_Rides'] / len(months)).round(2)

In [22]:
# Validate that the aggregation kept every ride: the per-coordinate totals
# must add back up to the number of rows in the combined frame.
# Uses the vectorized Series.sum() rather than the builtin sum(), which
# iterates element by element in Python.
total_rides = uber_rides['Sum_Rides'].sum()
assert total_rides == len(uber), "aggregated ride total does not match row count"
total_rides

4534327

In [23]:
# Count missing values in each column of the aggregated table
uber_rides.isna().sum()

Coordinates    0
Sum_Rides      0
Avg_Rides      0
dtype: int64

No nulls.

In [24]:
# Show any rows that are exact repeats of an earlier row
dupe_mask = uber_rides.duplicated()
uber_rides.loc[dupe_mask]

Unnamed: 0,Coordinates,Sum_Rides,Avg_Rides


No duplicates.

In [25]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
uber_rides[['Sum_Rides', 'Avg_Rides']].describe()

Unnamed: 0,Sum_Rides,Avg_Rides
count,5434.0,5434.0
mean,834.436327,139.07316
std,8826.919263,1471.153154
min,1.0,0.17
25%,2.0,0.33
50%,5.0,0.83
75%,20.0,3.33
max,226692.0,37782.0


In [26]:
# Print the table's dimensions, then display its first five rows
print(uber_rides.shape)
uber_rides.head(n=5)

(5434, 3)


Unnamed: 0,Coordinates,Sum_Rides,Avg_Rides
0,"40.76, -73.98",226692,37782.0
1,"40.76, -73.97",215585,35930.83
2,"40.74, -73.99",192517,32086.17
3,"40.75, -73.99",172188,28698.0
4,"40.72, -74.0",167633,27938.83


In [27]:
# Print column names, non-null counts, dtypes and memory usage of the aggregated table
uber_rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5434 entries, 0 to 5433
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Coordinates  5434 non-null   object 
 1   Sum_Rides    5434 non-null   int64  
 2   Avg_Rides    5434 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 127.5+ KB


In [28]:
# Write the aggregated table to disk without the positional index column
output_path = 'uber_rides.csv'
uber_rides.to_csv(output_path, index=False)