In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Data source: https://www.propublica.org/datastore/dataset/chicago-parking-ticket-data
# The original CSV is 7.67 GB, which is too big for my desktop to handle,
# so a randomly chosen 20% sample of the original dataset is analyzed here.

df_CCPT = pd.read_csv('./data/parking_tickets_frac.csv')

In [3]:
# Drop identifier/location columns that this analysis does not use.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError for columns
# that have already been removed.
col2drop = ['Unnamed: 0', 'ticket_number', 'license_plate_number', 'zipcode',
            'unit', 'unit_description', 'notice_number', 'violation_location', 'violation_code']
df_CCPT = df_CCPT.drop(columns=col2drop, errors='ignore')

In [4]:
# Drop rows (not columns) in which the license plate state is missing
row2drop = ['license_plate_state']
df_CCPT = df_CCPT.dropna(subset=row2drop)

In [6]:
# Parse the issue timestamps so they can be compared against date bounds
df_CCPT['issue_date'] = pd.to_datetime(df_CCPT['issue_date'])
# Keep only tickets issued in 2016 and 2017 (from 2016-01-01 inclusive up to
# 2018-01-01 exclusive) as a demonstration for the plots
mask1617 = ((df_CCPT['issue_date'] >= pd.Timestamp(2016, 1, 1, 0)) &
            (df_CCPT['issue_date'] < pd.Timestamp(2018, 1, 1, 0)))
# .copy() makes df_CCPT1617 an independent frame: its 'address' column is
# overwritten later on, and writing to a bare slice of df_CCPT triggers
# SettingWithCopyWarning (currently hidden by warnings.filterwarnings("ignore")).
df_CCPT1617 = df_CCPT.loc[mask1617].copy()

In [7]:
# Preview the raw address strings; the goal is to convert them to lat/long for mapping the tickets
df_CCPT1617.loc[:, 'address'].head()

10              500 e 46, chicago, il
13     6100 s archer ave, chicago, il
20    4200 w irving park, chicago, il
25          1700 w grand, chicago, il
42       2200 n st louis, chicago, il
Name: address, dtype: object

It seems that all the addresses follow the same format, so here they are split into address, city and state.

In [8]:
# Break every address string on ', '; the fourth column ('None') receives the
# extra piece for any row that splits into four parts
address_parts = df_CCPT1617['address'].str.split(', ').tolist()
address_df = pd.DataFrame(address_parts, columns=['address', 'city', 'state', 'None'])
address_df['city'].value_counts()

chicago    889767
louis           1
Name: city, dtype: int64

In [10]:
# Only one anomaly turns up here: its address contains an additional comma
mask = address_df['city'].eq('louis')
address_df.loc[mask]

Unnamed: 0,address,city,state,None
137135,600 n st,louis,chicago,il


In [11]:
# Locate the malformed address (an extra comma inside the street name)
mask_add = (df_CCPT1617['address'] == '600 n st, louis, chicago, il')
# Change it to the correct form. Only the rows selected by mask_add are rewritten;
# the mask was previously computed but never used, and the fix was applied as a
# substring replacement over every row of the column.
df_CCPT1617.loc[mask_add, 'address'] = '600 n st louis, chicago, il'

In [12]:
# Split the address string again. Splitting from the right on ', ' (the same
# separator as the first split) at most twice yields at most three fields
# (street / city / state): the city and state values no longer carry a leading
# space, and a stray comma inside the street part cannot create a fourth field.
address_df = pd.DataFrame(df_CCPT1617['address'].str.rsplit(', ', n=2).tolist(),
                          columns=['address', 'city', 'state'])

In [13]:
# Report the number of distinct street addresses.
# nunique(dropna=False) counts the same values as len(unique()) without building
# the intermediate array; the bare `address_df['state'].unique()` expression that
# preceded the print was dropped because its value was never displayed or used.
print('Number of unique addresses: {}'.format(address_df['address'].nunique(dropna=False)))

Number of unique addresses: 71689


In [14]:
# Build the lookup table that will hold the coordinates: one row per distinct
# street address, with latitude and longitude initialised to NaN
address_df = pd.DataFrame({
    'address': address_df['address'].unique(),
    'lat': np.nan,
    'lng': np.nan,
})
address_df.head()

Unnamed: 0,address,lat,lng
0,500 e 46,,
1,6100 s archer ave,,
2,4200 w irving park,,
3,1700 w grand,,
4,2200 n st louis,,


Use geocoder here to fetch the latitude and longitude for the ticket address

In [15]:
import geocoder
import requests
import time
import datetime

In [16]:
class GeoSessions:
    """Hold one ``requests.Session`` per geocoding provider.

    Attributes:
        Komoot, Arcgis, USCensus, OSM: separate ``requests.Session`` objects,
            one for each provider.

    The object can be used as a context manager; leaving the ``with`` block
    (or calling :meth:`close` directly) closes all four sessions.
    """

    def __init__(self):
        self.Komoot = requests.Session()
        self.Arcgis = requests.Session()
        self.USCensus = requests.Session()
        self.OSM = requests.Session()

    def close(self):
        """Close every provider session held by this object."""
        for session in (self.Komoot, self.Arcgis, self.USCensus, self.OSM):
            session.close()

    def __enter__(self):
        """Return this object so it can be bound with ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the sessions; exceptions from the ``with`` body are not suppressed."""
        self.close()
        return False

def create_sessions():
    """Build and return a new ``GeoSessions`` object."""
    sessions = GeoSessions()
    return sessions

def geocode_address(address, s):
    """Geocode ``address``, falling back through several providers.

    The providers are tried in this order: komoot, osm, arcgis, uscensus.
    Each call is given the matching session attribute of ``s``.

    Args:
        address: the address string handed to each ``geocoder`` provider.
        s: object exposing ``Komoot``, ``OSM``, ``Arcgis`` and ``USCensus``
            session attributes (e.g. a ``GeoSessions`` instance).

    Returns:
        The first provider result whose ``ok`` attribute is truthy; if no
        provider succeeds, the result of the last provider (uscensus).
    """
    # (geocoder function name, session attribute on ``s``), in fallback order
    providers = (
        ('komoot', 'Komoot'),
        ('osm', 'OSM'),
        ('arcgis', 'Arcgis'),
        ('uscensus', 'USCensus'),
    )
    g = None
    for provider_name, session_attr in providers:
        lookup = getattr(geocoder, provider_name)
        g = lookup(address, session=getattr(s, session_attr))
        if g.ok:
            # Stop at the first provider that resolves the address
            break
    return g

In [17]:
# Create the per-provider sessions once; `s` is passed to geocode_address for every lookup
s = create_sessions()

In [18]:
# Try the geocoding chain on the first address in the table
first_street = address_df.loc[0, 'address']
addr = first_street + ', Chicago, IL'
tmp = geocode_address(addr, s)
print(addr)
print('Latitude:  {}'.format(tmp.lat))
print('Longitude: {}'.format(tmp.lng))

500 e 46, Chicago, IL
Latitude:  41.8349087
Longitude: -87.61417004242811


In [19]:
# Bookkeeping for the loop that fetches lat/long for the addresses. The full run
# takes a while, so the loop is only a demonstration; a pre-saved file is loaded later.
failed_inds = list()
iter_between_saves = 100

In [20]:
# Demonstration only: geocode just the first 201 addresses that still lack a latitude
n_demo_addresses = 201
missing_latlon_ind = address_df[address_df['lat'].isnull()][:n_demo_addresses].index.tolist()

for i in missing_latlon_ind:
    try:
        # Checkpoint the table every `iter_between_saves` indices
        if i % iter_between_saves == 0:
            print('got to index {}, saving df'.format(i))
            address_df.to_csv('Geocoded_CCPT_1617.csv', index=False)
        addr = address_df.loc[i, 'address'] + ', Chicago, IL'
        tmp = geocode_address(addr, s)
        address_df.loc[i, 'lat'] = tmp.lat
        address_df.loc[i, 'lng'] = tmp.lng
    except Exception as e:
        # Best effort: remember the index and keep going, but report the error
        # instead of discarding it silently
        failed_inds.append(i)
        print('geocoding failed for index {}: {!r}'.format(i, e))
# Final save so rows processed after the last checkpoint are written as well
address_df.to_csv('Geocoded_CCPT_1617.csv', index=False)

got to index 0, saving df
got to index 100, saving df
got to index 200, saving df


In [21]:
# Addresses that still have no latitude
address_df.loc[address_df['lat'].isna()]

Unnamed: 0,address,lat,lng
201,1 s wells,,
202,6900 n glenwood,,
203,2400 n mango,,
204,4500 n springfield,,
205,1 w 19th st,,
206,2100 e 69th st,,
207,2000 s ashland ave,,
208,1400 n dearborn parkway,,
209,6300 s racine,,
210,900 w huron,,


In [22]:
# Load the saved geocoding results and append ', chicago, il' to each address
# so the strings have the same form as the ticket table's 'address' column
df_geoloc = (
    pd.read_csv('./data/Geocoded_CCPT_1617.csv')
    .assign(address=lambda geo: geo['address'] + ', chicago, il')
)

In [23]:
# Attach coordinates to the 2016-2017 tickets; the inner join keeps only
# tickets whose address appears in the geocoded table
geocoded_df = df_CCPT1617.merge(df_geoloc, how='inner', on='address')
geocoded_df.head(100)

Unnamed: 0,issue_date,license_plate_state,license_plate_type,violation_description,vehicle_make,fine_level1_amount,fine_level2_amount,current_amount_due,total_payments,ticket_queue,ticket_queue_date,notice_level,hearing_disposition,officer,address,lat,lng
0,2016-05-19 17:40:00,IL,PAS,REAR AND FRONT PLATE REQUIRED,LNDR,60,120,60.0,0.00,Define,2016-05-25 00:00:00,,,05238,"500 e 46, chicago, il",41.834909,-87.614170
1,2016-01-30 00:34:00,IL,TMP,EXPIRED PLATES OR TEMPORARY REGISTRATION,CHRY,60,120,146.4,0.00,Notice,2016-02-11 00:00:00,FINL,,10109,"500 e 46, chicago, il",41.834909,-87.614170
2,2016-11-05 20:35:00,IL,PAS,BLOCK ACCESS/ALLEY/DRIVEWAY/FIRELANE,HOND,150,300,0.0,150.00,Paid,2016-11-15 00:00:00,,,5121,"6100 s archer ave, chicago, il",41.794587,-87.770287
3,2016-05-27 17:30:00,IL,PAS,DISABLED PARKING ZONE,FORD,250,500,0.0,250.00,Paid,2016-06-07 00:00:00,,,5652,"6100 s archer ave, chicago, il",41.794587,-87.770287
4,2017-11-24 16:23:00,IL,PAS,BLOCK ACCESS/ALLEY/DRIVEWAY/FIRELANE,DODG,150,300,300.0,0.00,Notice,2017-12-21 00:00:00,FINL,,5652,"6100 s archer ave, chicago, il",41.794587,-87.770287
5,2017-04-24 18:31:00,IL,PAS,BLOCK ACCESS/ALLEY/DRIVEWAY/FIRELANE,FORD,150,300,0.0,150.00,Paid,2017-05-02 00:00:00,,,18502,"6100 s archer ave, chicago, il",41.794587,-87.770287
6,2017-08-20 04:35:00,IL,PAS,PARK OR STAND IN BUS/TAXI/CARRIAGE STAND,JEEP,100,200,100.0,0.00,Notice,2018-03-14 00:00:00,DETR,,23932,"4200 w irving park, chicago, il",41.953806,-87.732667
7,2017-04-15 02:02:00,IL,PAS,PARK OR STAND IN BUS/TAXI/CARRIAGE STAND,CHEV,100,200,244.0,0.00,Notice,2017-04-27 00:00:00,SEIZ,,6628,"4200 w irving park, chicago, il",41.953806,-87.732667
8,2016-04-12 23:44:00,IL,PAS,PARKING/STANDING PROHIBITED ANYTIME,HOND,75,150,0.0,150.00,Paid,2016-07-01 00:00:00,FINL,,6628,"4200 w irving park, chicago, il",41.953806,-87.732667
9,2016-07-09 15:58:00,IL,PAS,PARK OR STAND IN BUS/TAXI/CARRIAGE STAND,TOYT,100,200,0.0,100.00,Paid,2017-03-04 00:00:00,DETR,,23932,"4200 w irving park, chicago, il",41.953806,-87.732667


In [24]:
import folium
import folium.plugins as plugins

In [25]:
# Center point (latitude, longitude) used for the maps
chicago_lat, chicago_lng = 41.8, -87.7

In [26]:
# Select the tickets with a "RESIDENTIAL PERMIT PARKING" violation to
# check if there's any pattern in where those tickets were issued.
# .copy() yields an independent frame, so columns can be added to df_permit later
# without pandas raising SettingWithCopyWarning for writing to a slice of geocoded_df.
df_permit = geocoded_df.loc[geocoded_df['violation_description'] == 'RESIDENTIAL PERMIT PARKING'].copy()
# Preview the first rows only instead of rendering the whole frame
df_permit.head(10)

Unnamed: 0,issue_date,license_plate_state,license_plate_type,violation_description,vehicle_make,fine_level1_amount,fine_level2_amount,current_amount_due,total_payments,ticket_queue,ticket_queue_date,notice_level,hearing_disposition,officer,address,lat,lng
64,2017-11-17 18:07:00,IL,PAS,RESIDENTIAL PERMIT PARKING,TOYT,75,150,0.0,0.0,Dismissed,2018-01-11 00:00:00,VIOL,Not Liable,11371,"2200 n st louis, chicago, il",41.928285,-87.714662
720,2017-12-28 00:47:00,IL,PAS,RESIDENTIAL PERMIT PARKING,VOLK,75,150,0.0,75.0,Paid,2018-01-04 00:00:00,,,1640,"800 s oakley boulevard, chicago, il",41.871201,-87.684071
721,2016-02-12 00:45:00,IL,PAS,RESIDENTIAL PERMIT PARKING,NISS,75,150,0.0,150.0,Paid,2016-05-05 00:00:00,FINL,,1462,"800 s oakley boulevard, chicago, il",41.871201,-87.684071
722,2016-08-16 20:26:00,IL,PAS,RESIDENTIAL PERMIT PARKING,DODG,75,150,0.0,75.0,Paid,2016-08-25 00:00:00,,,1125,"1700 w cornelia, chicago, il",41.943056,-87.813918
725,2016-10-07 17:37:00,IL,PAS,RESIDENTIAL PERMIT PARKING,PONT,75,150,0.0,75.0,Paid,2016-11-07 00:00:00,VIOL,,767,"1700 w cornelia, chicago, il",41.943056,-87.813918
727,2017-09-08 18:38:00,IL,DOM,RESIDENTIAL PERMIT PARKING,SCIO,75,150,0.0,75.0,Paid,2017-09-18 00:00:00,VIOL,,1546,"1700 w cornelia, chicago, il",41.943056,-87.813918
728,2017-10-19 17:15:00,OH,PAS,RESIDENTIAL PERMIT PARKING,INFI,75,150,0.0,75.0,Paid,2017-12-14 00:00:00,VIOL,,767,"1700 w cornelia, chicago, il",41.943056,-87.813918
730,2016-08-09 17:15:00,IL,PAS,RESIDENTIAL PERMIT PARKING,JEEP,75,150,0.0,75.0,Paid,2016-09-07 00:00:00,VIOL,,803,"1700 w cornelia, chicago, il",41.943056,-87.813918
731,2017-07-08 19:51:00,IL,PAS,RESIDENTIAL PERMIT PARKING,HOND,75,150,0.0,75.0,Paid,2017-07-13 00:00:00,,,1546,"1700 w cornelia, chicago, il",41.943056,-87.813918
733,2016-08-09 17:13:00,IL,PAS,RESIDENTIAL PERMIT PARKING,VOLK,75,150,0.0,75.0,Paid,2016-08-26 00:00:00,VIOL,,803,"1700 w cornelia, chicago, il",41.943056,-87.813918


In [27]:
# Keep just the coordinate columns
df_permit_hm = df_permit.loc[:, ['lat', 'lng']]

In [28]:
# Quite a few geocoded points are outliers; for now just discard the ones that fall
# outside a bounding box around Chicago (more careful cleaning comes later)
lat_in_chicago = df_permit['lat'].between(41.63, 42.05)
lng_in_chicago = df_permit['lng'].between(-88.0, -87.5)
mask_chi = lat_in_chicago & lng_in_chicago
df_permit_hm = df_permit_hm.loc[mask_chi]

In [29]:
# Build the [[lat, lng], ...] list used to plot the heat map of the locations.
# Converting both columns in one vectorized step avoids the slow row-by-row
# iterrows() loop.
permit_data = df_permit_hm[['lat', 'lng']].values.tolist()

In [30]:
# Draw the residential permit parking violations as a heat map on a map centered on Chicago
m = folium.Map(location=[chicago_lat, chicago_lng], zoom_start=10)
hm = plugins.HeatMap(permit_data, radius=8).add_to(m)
m

Figure 2. Heatmap of residential permit parking violation
The "heatmap" for residential permit parking violations shows an interesting contrast between the northern and southern parts of the City of Chicago: the parking violations in the northern part of the city are more densely concentrated than those in the south. This could reflect the population density of Chicago. I found example data for Chicago population density (e.g. https://www.arcgis.com/home/webmap/viewer.html?webmap=dfa1866898254e8c8e4eb7b70af99ed4) that supports this hypothesis. 

In addition to this plot, I also plotted how the residential permit parking tickets change with the time of day at which they were issued. Below is a plot that shows the "heatmap" for residential permit parking tickets at different hours of the day. The "north denser than south" observation still stands across the different hours of the day.

While it's not surprising that few tickets are issued during the night, there appear to be two peaks of ticket issuance during the day, one around 8 am and the other around 8 pm. This could be related to how the residential parking restrictions are set, or to other reasons such as residents returning from work around 8 pm. 

In [31]:
# Tag each ticket with the hour of day it was issued, stored as a float.
# assign() builds a new frame rather than writing into df_permit in place, which
# avoids SettingWithCopyWarning when df_permit is a slice of geocoded_df.
df_permit = df_permit.assign(Weight=df_permit['issue_date'].dt.hour.astype(float))
# Apply the same Chicago bounding-box filter (mask_chi) that the static heat map
# uses, so the hourly map is not fed out-of-area or missing coordinates.
df_permit_hmt = df_permit.loc[mask_chi, ['lat', 'lng', 'Weight']]

In [32]:
# One list of [lat, lng] points per hour of the day (0-23). Selecting each hour's
# rows and converting them in one step replaces 24 row-by-row iterrows() passes.
permit_timeseries = [
    df_permit_hmt.loc[df_permit_hmt['Weight'] == hour, ['lat', 'lng']].values.tolist()
    for hour in range(24)
]

In [33]:
# Labels for the 24 animation frames, formatted as 12-hour clock hour plus AM/PM
hours_indx = [datetime.time(hour=hour).strftime('%I %p') for hour in range(24)]
# "Stamen Toner" is no longer a built-in tile set in recent folium releases
# (asking for it without a custom attribution raises ValueError), so the
# built-in "CartoDB positron" basemap is used instead.
mt = folium.Map([chicago_lat, chicago_lng], tiles="CartoDB positron", zoom_start=10)
# One animation frame per hour of the day, labelled with hours_indx
hmt = plugins.HeatMapWithTime(permit_timeseries, auto_play=True, radius=5, min_opacity=1, index=hours_indx)
hmt.add_to(mt)
mt