In [31]:
import pandas as pd
import numpy as np

In [32]:
slu = pd.read_csv('data/slu-Paid_Parking_Occupancy__Last_30_Days_.csv')
# parking rate only contains null
slu = slu.drop(['PaidParkingRate'], axis = 1)

Add availability information

1. Calculate occupancy percentage: The date and time (minute) of the transaction as recorded
2. Aavailability (bool) : True if has available parking spaces. 
3. Available parking spaces (int): Number of availabel parking spaces. 0 if paid spaces are equal or more than the parking space count. A car might leave before the parking expire, thus paird occupancy could be higher than the parking space count. 

In [33]:
slu['OccupancyPercentage'] = slu['PaidOccupancy']/slu['ParkingSpaceCount']
slu['HasAvailability'] = slu.OccupancyPercentage < 1
slu['AvailableSpace'] = np.where(slu['ParkingSpaceCount'] - slu['PaidOccupancy'] > 0, slu['ParkingSpaceCount'] - slu['PaidOccupancy'], 0)

compute latitude and longitude

In [34]:
location = slu.Location.str[7:-1]
slu.Location.str.split(' ',expand=True)
slu[['Latitude','Longitude']]=location.str.split(' ',expand=True)
slu['Latitude'] = pd.to_numeric(slu['Latitude'])
slu['Longitude'] = pd.to_numeric(slu['Longitude'])

Compute datetime information. 

1. Convert a string of time into a datetime object
2. Check wheter a given date is a US holiday
3. Compute day of the week, hour of the day and minute of the day
5. sort by datetime

In [35]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import time
slu['OccupancyDateTime'] = pd.to_datetime(slu['OccupancyDateTime'],infer_datetime_format=True)
cal = calendar()

dr = pd.date_range(start=slu['OccupancyDateTime'].min(), end=slu['OccupancyDateTime'].max())
holidays = cal.holidays(start=dr.min(), end=dr.max())
slu['Holiday'] = slu['OccupancyDateTime'].isin(holidays)

slu['DayOfTheWeek'] = slu['OccupancyDateTime'].dt.day_of_week
slu['Year'] = slu['OccupancyDateTime'].dt.year
slu['Month'] = slu['OccupancyDateTime'].dt.month
slu['Day'] = slu['OccupancyDateTime'].dt.day
slu['Hour'] = slu['OccupancyDateTime'].dt.hour
slu['Minute'] = slu['OccupancyDateTime'].dt.minute
slu['Second'] = slu['OccupancyDateTime'].dt.second
slu['MinuteOfTheDay'] = slu['Hour'] * 60 * 60 + slu['Minute']  * 60

In [36]:
slu = slu.sort_values(by=['OccupancyDateTime'])

In [37]:
slu.to_csv('data/clean_data.csv', sep=',')

In [38]:
slu.columns

Index(['OccupancyDateTime', 'PaidOccupancy', 'BlockfaceName', 'SideOfStreet',
       'SourceElementKey', 'ParkingTimeLimitCategory', 'ParkingSpaceCount',
       'PaidParkingArea', 'PaidParkingSubArea', 'ParkingCategory', 'Location',
       'OccupancyPercentage', 'HasAvailability', 'AvailableSpace', 'Latitude',
       'Longitude', 'Holiday', 'DayOfTheWeek', 'Year', 'Month', 'Day', 'Hour',
       'Minute', 'Second', 'MinuteOfTheDay'],
      dtype='object')

In [39]:
loc_df = slu[['Latitude','Longitude', 'BlockfaceName', 'SourceElementKey','SideOfStreet']]
loc_df = loc_df.drop_duplicates(subset=['SourceElementKey'], keep='first')
loc_df = loc_df.set_index('SourceElementKey')

In [40]:
loc_df

Unnamed: 0_level_0,Latitude,Longitude,BlockfaceName,SideOfStreet
SourceElementKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35197,-122.336516,47.623268,REPUBLICAN ST BETWEEN TERRY AVE N AND BOREN AVE N,N
79790,-122.329810,47.624233,MERCER ST BETWEEN YALE AVE N AND EASTLAKE AVE E,S
8406,-122.340903,47.626663,8TH AVE N BETWEEN VALLEY ST AND ALOHA ST,E
57550,-122.331620,47.623680,PONTIUS AVE N BETWEEN REPUBLICAN ST AND MERCER ST,E
13110,-122.337055,47.621446,TERRY AVE N BETWEEN THOMAS ST AND HARRISON ST,E
...,...,...,...,...
58574,-122.335171,47.625641,VALLEY ST BETWEEN BOREN AVE N AND FAIRVIEW AVE N,S
81497,-122.339110,47.626219,WESTLAKE AVE N BETWEEN VALLEY ST AND 9TH AVE N,SW
14601,-122.327247,47.630477,E NELSON PL BETWEEN EASTLAKE AVE E AND DEAD END,NE
13138,-122.335108,47.620784,THOMAS ST BETWEEN BOREN AVE N AND FAIRVIEW AVE N,S


In [41]:
#loc_df.to_csv('data/paystub_location.csv') 

In [42]:
# loc_df[['Latitude',	'Longitude']].to_csv('data/paystub_coordinates.csv') 