In [13]:
import pandas as pd
import numpy as np

In [14]:
# import 30 days Paid_Parking_Occupancy data
# parking rate only contains null, drop the entire column while reading csv file
df = pd.read_csv('data/Paid_Parking_Occupancy__Last_30_Days_.csv')

In [15]:
df.info(null_counts=True)

  df.info(null_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30339022 entries, 0 to 30339021
Data columns (total 12 columns):
 #   Column                    Non-Null Count     Dtype  
---  ------                    --------------     -----  
 0   OccupancyDateTime         30339022 non-null  object 
 1   PaidOccupancy             30339022 non-null  int64  
 2   BlockfaceName             30339022 non-null  object 
 3   SideOfStreet              30339022 non-null  object 
 4   SourceElementKey          30339022 non-null  int64  
 5   ParkingTimeLimitCategory  30298559 non-null  float64
 6   ParkingSpaceCount         30339022 non-null  int64  
 7   PaidParkingArea           30339022 non-null  object 
 8   PaidParkingSubArea        23050744 non-null  object 
 9   PaidParkingRate           0 non-null         float64
 10  ParkingCategory           30339022 non-null  object 
 11  Location                  30339022 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 2.7+ GB


In [16]:
# select data from interested location only
df = df.loc[df['PaidParkingArea'].isin(['South Lake Union', 'Denny Triangle', 'First Hill', 'Capitol Hill'])]

In [21]:
df.shape

(8929308, 12)

In [25]:
df_clean = df.drop(columns='PaidParkingRate', axis = 1)
# drop rows which contain missing values.
df_clean = df_clean.dropna(axis=0)
df_clean.shape

(6107742, 11)

Add availability information

1. Calculate occupancy percentage: The date and time (minute) of the transaction as recorded
2. Aavailability (bool) : True if has available parking spaces. 
3. Available parking spaces (int): Number of availabel parking spaces. 0 if paid spaces are equal or more than the parking space count. A car might leave before the parking expire, thus paird occupancy could be higher than the parking space count. 

In [26]:
df_clean['OccupancyPercentage'] = df_clean['PaidOccupancy']/df_clean['ParkingSpaceCount']
df_clean['HasAvailability'] = df_clean.OccupancyPercentage < 1
df_clean['AvailableSpace'] = np.where(df_clean['ParkingSpaceCount'] - df_clean['PaidOccupancy'] > 0, df_clean['ParkingSpaceCount'] - df_clean['PaidOccupancy'], 0)
df_clean.columns

Index(['OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName', 'SideOfStreet',
       'SourceElementKey', 'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'PaidParkingSubArea', 'ParkingCategory', 'Location',
       'OccupancyPercentage', 'HasAvailability', 'AvailableSpace'],
      dtype='object')

compute latitude and longitude

In [27]:
location = df_clean.Location.str[7:-1]
df_clean.Location.str.split(' ',expand=True)
df_clean[['Latitude','Longitude']]=location.str.split(' ',expand=True)
df_clean['Latitude'] = pd.to_numeric(df_clean['Latitude'])
df_clean['Longitude'] = pd.to_numeric(df_clean['Longitude'])
df_clean.columns

Index(['OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName', 'SideOfStreet',
       'SourceElementKey', 'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'PaidParkingSubArea', 'ParkingCategory', 'Location',
       'OccupancyPercentage', 'HasAvailability', 'AvailableSpace', 'Latitude',
       'Longitude'],
      dtype='object')

Compute datetime information. 

1. Convert a string of time into a datetime object
2. Check wheter a given date is a US holiday
3. Compute day of the week, hour of the day and minute of the day
5. sort by datetime

In [28]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import time
df_clean['OccupancyDateTime'] = pd.to_datetime(df_clean['OccupancyDateTime'],infer_datetime_format=True)

df_clean['DayOfTheWeek'] = df_clean['OccupancyDateTime'].dt.day_of_week

df_clean['Hour'] = df_clean['OccupancyDateTime'].dt.hour
minute = df_clean['OccupancyDateTime'].dt.minute

df_clean['MinuteOfTheDay'] = df_clean['Hour'] * 60 * 60 + minute  * 60

In [29]:
df_clean = df_clean.sort_values(by=['OccupancyDateTime'])
df_clean.columns

Index(['OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName', 'SideOfStreet',
       'SourceElementKey', 'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'PaidParkingSubArea', 'ParkingCategory', 'Location',
       'OccupancyPercentage', 'HasAvailability', 'AvailableSpace', 'Latitude',
       'Longitude', 'DayOfTheWeek', 'MinuteOfTheDay'],
      dtype='object')

In [30]:
df_clean.shape

(6107742, 18)

In [31]:
df_clean.to_csv('data/clean_data.csv', sep=',')

In [34]:
loc_df = df_clean[['Latitude','Longitude', 'BlockfaceName', 'SourceElementKey','SideOfStreet','ParkingSpaceCount','PaidParkingArea']]
loc_df = loc_df.drop_duplicates(subset=['SourceElementKey'], keep='first')
loc_df = loc_df.set_index('SourceElementKey')

In [35]:
loc_df

Unnamed: 0_level_0,Latitude,Longitude,BlockfaceName,SideOfStreet,ParkingSpaceCount,PaidParkingArea
SourceElementKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32910,-122.342210,47.621481,DEXTER AVE N BETWEEN THOMAS ST AND HARRISON ST,E,5,South Lake Union
12682,-122.333446,47.625072,ROY ST BETWEEN FAIRVIEW AVE N AND MINOR AVE N,S,13,South Lake Union
34881,-122.331829,47.621392,PONTIUS AVE N BETWEEN THOMAS ST AND HARRISON ST,W,12,South Lake Union
36141,-122.338324,47.617113,WESTLAKE AVE BETWEEN LENORA ST AND BLANCHARD ST,W,8,Denny Triangle
78073,-122.329282,47.623698,EASTLAKE AVE E BETWEEN REPUBLICAN ST AND MERCE...,W,13,South Lake Union
...,...,...,...,...,...,...
77262,-122.334853,47.618296,BOREN AVE BETWEEN FAIRVIEW AVE AND DENNY WAY,NE,4,Denny Triangle
34489,-122.332569,47.617953,MINOR AVE BETWEEN STEWART ST AND VIRGINIA ST,SW,11,Denny Triangle
13138,-122.335108,47.620784,THOMAS ST BETWEEN BOREN AVE N AND FAIRVIEW AVE N,S,10,South Lake Union
14601,-122.327247,47.630477,E NELSON PL BETWEEN EASTLAKE AVE E AND DEAD END,NE,3,South Lake Union


In [36]:
loc_df.to_csv('data/paystub_location.csv') 

In [37]:
coord_df = loc_df[['Latitude','Longitude','ParkingSpaceCount']]

In [38]:
coord_df.to_csv('data/paystub_coordinates.csv', index=False) 