In [1]:
import pandas as pd
import numpy as np

In [2]:
# import 30 days Paid_Parking_Occupancy data
# parking rate only contains null, drop the entire column while reading csv file
df = pd.read_csv('data/Paid_Parking_Occupancy__Last_30_Days_.csv')

In [3]:
df.head()

Unnamed: 0,OccupancyDateTime,PaidOccupancy,BlockfaceName,SideOfStreet,SourceElementKey,ParkingTimeLimitCategory,ParkingSpaceCount,PaidParkingArea,PaidParkingSubArea,PaidParkingRate,ParkingCategory,Location
0,04/09/2022 06:57:00 PM,1,JEFFERSON ST BETWEEN 4TH AVE AND 5TH AVE,SE,11118,120.0,16,Commercial Core,Financial,,Paid Parking,POINT (-122.32904642 47.6027045)
1,04/09/2022 02:05:00 PM,1,W THOMAS ST BETWEEN 1ST AVE W AND 2ND AVE W,N,23937,240.0,9,Uptown,Edge,,Paid Parking,POINT (-122.35869454 47.62098407)
2,04/09/2022 06:31:00 PM,4,S WELLER ST BETWEEN 5TH AVE S AND 6TH AVE S,S,43942,120.0,5,Chinatown/ID,Edge,,Paid Parking,POINT (-122.32699509 47.59744736)
3,04/09/2022 08:13:00 AM,1,JEFFERSON ST BETWEEN 4TH AVE AND 5TH AVE,SE,11118,120.0,16,Commercial Core,Financial,,Paid Parking,POINT (-122.32904642 47.6027045)
4,04/09/2022 11:37:00 AM,1,7TH AVE BETWEEN JAMES ST AND CHERRY ST,NE,76034,120.0,6,First Hill,,,Restricted Parking Zone,POINT (-122.32722382 47.6050118)


In [4]:
df.info(null_counts=True)

  df.info(null_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30339022 entries, 0 to 30339021
Data columns (total 12 columns):
 #   Column                    Non-Null Count     Dtype  
---  ------                    --------------     -----  
 0   OccupancyDateTime         30339022 non-null  object 
 1   PaidOccupancy             30339022 non-null  int64  
 2   BlockfaceName             30339022 non-null  object 
 3   SideOfStreet              30339022 non-null  object 
 4   SourceElementKey          30339022 non-null  int64  
 5   ParkingTimeLimitCategory  30298559 non-null  float64
 6   ParkingSpaceCount         30339022 non-null  int64  
 7   PaidParkingArea           30339022 non-null  object 
 8   PaidParkingSubArea        23050744 non-null  object 
 9   PaidParkingRate           0 non-null         float64
 10  ParkingCategory           30339022 non-null  object 
 11  Location                  30339022 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 2.7+ GB


In [5]:
# select data from interested location only
df = df.loc[df['PaidParkingArea'].isin(['South Lake Union', 'Denny Triangle', 'First Hill', 'Capitol Hill'])]

In [6]:
df['PaidParkingArea'].value_counts()

South Lake Union    3621061
First Hill          2803196
Denny Triangle      1499665
Capitol Hill        1005386
Name: PaidParkingArea, dtype: int64

In [7]:
df_clean = df.drop(['PaidParkingRate', 'PaidParkingSubArea'], axis = 1)
# drop rows which contain missing values.
df_clean = df_clean.dropna(axis=0)
df_clean.shape

(8910938, 10)

Add availability information

1. Calculate occupancy percentage: out of all provide parking space, what's the percentage of taken spaces
2. Available parking spaces (int): Number of availabel parking spaces. 0 if paid spaces are equal or more than the parking space count. A car might leave before the parking expire, thus paird occupancy could be higher than the parking space count. 
3. AavailablePercentage : out of all provide parking space, what's the percentage of non-taken spaces


In [8]:
df_clean['OccupancyPercentage'] = df_clean['PaidOccupancy']/df_clean['ParkingSpaceCount']
df_clean['AvailableSpace'] = np.where(df_clean['ParkingSpaceCount'] - df_clean['PaidOccupancy'] > 0, df_clean['ParkingSpaceCount'] - df_clean['PaidOccupancy'], 0)
df_clean['AvailablePercentage'] = df_clean['AvailableSpace']/df_clean['ParkingSpaceCount']
df_clean.columns

Index(['OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName', 'SideOfStreet',
       'SourceElementKey', 'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'ParkingCategory', 'Location', 'OccupancyPercentage',
       'AvailableSpace', 'AvailablePercentage'],
      dtype='object')

compute latitude and longitude

In [29]:
location = df_clean.Location.str[7:-1]
df_clean.Location.str.split(' ',expand=True)
df_clean[['Latitude','Longitude']]=location.str.split(' ',expand=True)
df_clean['Longitude'] = pd.to_numeric(df_clean['Longitude'])
df_clean['Latitude'] = pd.to_numeric(df_clean['Latitude'])
df_clean.columns

Index(['OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName', 'SideOfStreet',
       'SourceElementKey', 'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'ParkingCategory', 'Location', 'OccupancyPercentage',
       'AvailableSpace', 'AvailablePercentage', 'Longitude', 'Latitude',
       'DayOfTheWeek', 'Hour', 'MinuteOfTheDay'],
      dtype='object')

Compute datetime information. 

1. Convert a string of time into a datetime object
2. Compute day of the week, hour of the day and minute of the day
3. sort by datetime

In [11]:
df_clean['OccupancyDateTime'] = pd.to_datetime(df_clean['OccupancyDateTime'],infer_datetime_format=True)
df_clean['DayOfTheWeek'] = df_clean['OccupancyDateTime'].dt.day_of_week
df_clean['Hour'] = df_clean['OccupancyDateTime'].dt.hour
minute = df_clean['OccupancyDateTime'].dt.minute
df_clean['MinuteOfTheDay'] = df_clean['Hour'] * 60 + minute 

In [12]:
# sort the whole dataframe by chronological order
df_clean = df_clean.sort_values(by=['OccupancyDateTime'])

encode categorical features inclduing paid parking area and parking category

In [13]:
cleanup_nums = {"PaidParkingArea":     
                    {"South Lake Union": 1, 
                    "Denny Triangle": 2,
                    "First Hill": 3,
                    "Capitol Hill": 4},
                "ParkingCategory": 
                    {"Paid Parking": 1, 
                    "Restricted Parking Zone": 2,
                    "Carpool Parking": 3}}
df_clean = df_clean.replace(cleanup_nums)

In [30]:
df_for_model = df_clean[['DayOfTheWeek','MinuteOfTheDay',
'Latitude','Longitude','AvailableSpace']]
df_for_model

Unnamed: 0,DayOfTheWeek,MinuteOfTheDay,Latitude,Longitude,AvailableSpace
28459351,1,720,-122.330015,47.616454,0
22643110,1,720,-122.321955,47.604256,8
26832891,1,720,-122.330488,47.619229,4
27987115,1,720,-122.323533,47.612504,8
28147181,1,720,-122.323931,47.608905,0
...,...,...,...,...,...
19648437,2,1319,-122.325646,47.615801,6
26993283,2,1319,-122.320969,47.619324,3
1662253,2,1319,-122.324585,47.615804,7
5239421,2,1319,-122.320363,47.621941,4


In [31]:
# df_for_model.to_csv('data/four_feature_clean.csv.gz', compression='gzip')

In [32]:
df_for_visual = df_clean[['DayOfTheWeek','MinuteOfTheDay','Hour',
'Latitude','Longitude', 'Location', 'ParkingTimeLimitCategory', 
'AvailableSpace','AvailablePercentage','PaidParkingArea']]
# df_for_visual.to_csv('data/clean_data_visual.csv.gz', compression='gzip')

# Coordinates data

In [33]:
reverse_ppa = {"PaidParkingArea":     
                    {1: "South Lake Union", 
                    2: "Denny Triangle",
                    3: "First Hill",
                    4: "Capitol Hill"}}

loc_df = df_clean[['Latitude','Longitude', 'BlockfaceName', 'SourceElementKey','SideOfStreet','ParkingSpaceCount','PaidParkingArea']]
loc_df = loc_df.drop_duplicates(subset=['SourceElementKey'], keep='first')
loc_df = loc_df.set_index('SourceElementKey')
loc_df = loc_df.replace(reverse_ppa)

In [35]:
# loc_df.to_csv('data/paystub_location.csv') 