In [44]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import fnmatch
%matplotlib inline
# Enlarge the default figure canvas so plots have more room.
plt.rcParams.update({'figure.figsize': [12, 12]})

#### Events Dataset Descriptive Analysis and Preprocessing

In [25]:
# Read the geocoded events table from disk and preview its first five rows.
events_csv_path = 'data/geocoded_events.csv'
events = pd.read_csv(events_csv_path)
events.head(5)

Unnamed: 0,Name of Event,Category,Date(s),Location,Estimated Size of Event,Number of People/ Capacity of Space,Source for # People,Source Org Title,Article Title,Mention of Police Presence (Y/N),...,Source 2 URL,Address,Venue Capacity,Venue Type,Completed,Main Source for Search,Notes_y,Source,Latitude,Longitude
0,LANY,Music,2018-08-01 00:00:00,Bottom Lounge,Small,330,https://bottomlounge.com/bottom-lounge-for-pri...,Concert Archives,Chicago's Concert History,,...,,,,,,,,,,
1,Grace Weber / Malcolm London - Lollapalooza Af...,Music,2018-08-01 00:00:00,Lincoln Hall,Medium,500,https://chicagomusic.fandom.com/wiki/Lincoln_Hall,Concert Archives,Chicago's Concert History,,...,,,,,,,,,,
2,Chase Atlantic,Music,2018-08-01 00:00:00,Reggies,Small,300,https://www.songkick.com/concerts/39931428-dri...,Concert Archives,Chicago's Concert History,,...,,,,,,,,,,
3,Alison Wonderland,Music,2018-08-24 00:00:00,Aragon Ballroom,,4800,https://www.bizbash.com/venues-destinations/un...,,,,...,https://www.jambase.com/show/alison-wonderland...,"1106 W Lawrence Ave, Chicago, IL 60640",4800.0,Event Space,Yes,"Concert Archives, do312","this list is likely not comprehensive, but inc...",https://www.bizbash.com/venues-destinations/un...,41.857681,-87.657392
4,"Virtual Self ""Utopia System""",Music,2018-09-07 00:00:00,Aragon Ballroom,,4800,https://www.bizbash.com/venues-destinations/un...,,,,...,https://www.chicago-theater.com/theaters/arago...,"1106 W Lawrence Ave, Chicago, IL 60640",4800.0,Event Space,Yes,"Concert Archives, do312","this list is likely not comprehensive, but inc...",https://www.bizbash.com/venues-destinations/un...,41.857681,-87.657392


In [26]:
# List the column labels of the raw events table.
events.columns

Index(['Name of Event', 'Category', 'Date(s)', 'Location',
       'Estimated Size of Event', 'Number of People/ Capacity of Space',
       'Source for # People', 'Source Org Title', 'Article Title',
       'Mention of Police Presence (Y/N)', 'Source for police presence',
       'Official Media Presence', 'Notes_x', 'Source 1 URL', 'Source 2 URL',
       'Address', 'Venue Capacity', 'Venue Type', 'Completed',
       'Main Source for Search', 'Notes_y', 'Source', 'Latitude', 'Longitude'],
      dtype='object')

In [27]:
# Tally events per category label; missing categories get their own row.
(
    events["Category"]
    .value_counts(dropna=False)
)

Music                      897
Sports                     275
Event                       36
Film Screening              21
Festival                    19
Dance                       15
Screening                   14
Convention                   8
Comedy                       8
Fundraiser                   7
Farmer's Market              6
Theater                      5
Event/Festival               5
Events                       3
Book Reading                 2
Festival, Music              2
Music, Festival              2
Play                         2
Music, Event                 2
Food Festival                1
Concert/Event                1
Music, Theater, Dance        1
Art                          1
Music, Fundraiser            1
Political Event              1
Music, Art                   1
Event, Fundraiser            1
Event, Art Installation      1
Music, Film Screening        1
Conference                   1
Event,  Fundraiser           1
Film                         1
Comedy, 

In [28]:
# Tally the size labels; missing sizes are counted rather than dropped.
(
    events["Estimated Size of Event"]
    .value_counts(dropna=False)
)

Medium    679
Large     492
NaN       149
Small      25
Name: Estimated Size of Event, dtype: int64

In [14]:
# Try to read the attendance/capacity column as numbers. The column holds
# free-text entries, so the conversion can fail; catch the error and report it
# instead of letting an uncaught exception halt a Restart & Run All.
try:
    number_people_numeric = pd.to_numeric(events["Number of People/ Capacity of Space"])
except ValueError as err:
    number_people_numeric = None  # conversion failed; nothing to display
    print(f"ValueError: {err}")
number_people_numeric

ValueError: Unable to parse string "105000 per day" at position 23

In [15]:
# Inspect the row at the position reported by the failed numeric conversion.
unparseable_position = 23
events.iloc[unparseable_position]

Name of Event                                                          Lollapalooza 2018
Category                                                                           Music
Date(s)                                                                8/2/2018-8/5/2018
Location                                                                      Grant Park
Estimated Size of Event                                                            Large
Number of People/ Capacity of Space                                       105000 per day
Source for # People                    https://chicago.suntimes.com/2018/8/6/18444782...
Source Org Title                                                        Concert Archives
Article Title                                                  Chicago's Concert History
Mention of Police Presence (Y/N)                                                       Y
Source for police presence             https://chicago.suntimes.com/2018/8/6/18444782...
Official Media Presen

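So we need to handle date ranges in `Date(s)` (multi-day events such as this one), as well as non-numeric attendance values like "105000 per day".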

In [29]:
# Count events per venue address; rows without an address are counted too.
(
    events["Address"]
    .value_counts(dropna=False)
)

1901 W Madison St, Chicago, IL 60612                                       141
2051 N Milwaukee Ave, Chicago, IL 60647                                    138
1807 S Allport St, Chicago, IL 60608                                        93
329 N Dearborn St, Chicago, IL 60654                                        80
220 S Michigan Ave, Chicago, IL 60604                                       79
3730 N Clark St, Chicago, IL 60613                                          76
1106 W Lawrence Ave, Chicago, IL 60640                                      71
201 E Randolph St, Chicago, IL 60601                                        69
3635 N Clark St, Chicago, IL 60613                                          66
4746 N Racine Ave, Chicago, IL 60640                                        61
NaN                                                                         54
1060 W Addison St, Chicago, IL 60613                                        53
525 S Racine Ave, Chicago, IL 60607                 

In [30]:
# Tally the values of the Completed column, including missing entries.
(
    events["Completed"]
    .value_counts(dropna=False)
)

Yes                                                                 1086
Completed Concert Archives, need check for other events on do312     168
NaN                                                                   56
Yes                                                                   22
Return                                                                 8
Yes*                                                                   3
Completed except for do312                                             2
Name: Completed, dtype: int64

In [32]:
# Show the raw label strings so near-duplicates (e.g. trailing whitespace) are visible.
completed_counts = events["Completed"].value_counts(dropna=False)
completed_counts.index

Index([                                                             'Yes',
       'Completed Concert Archives, need check for other events on do312',
                                                                      nan,
                                                                   'Yes ',
                                                                 'Return',
                                                                   'Yes*',
                                             'Completed except for do312'],
      dtype='object')

In [33]:
# Collapse the 'Yes ' (trailing space) and 'Yes*' variants into plain 'Yes',
# then re-tally to confirm the variants are gone.
yes_variants = {'Yes ': "Yes", 'Yes*': "Yes"}
events["Completed"] = events["Completed"].replace(yes_variants)
events["Completed"].value_counts(dropna=False)

Yes                                                                 1111
Completed Concert Archives, need check for other events on do312     168
NaN                                                                   56
Return                                                                 8
Completed except for do312                                             2
Name: Completed, dtype: int64

This is good news - most of the events were at completed venues (i.e., venues where all known events were logged in the time period of interest). This is important because we _cannot_ use uncompleted venues in our planned analysis: for those venues there might be unknown events occurring in the time periods that we use as control.

In [35]:
# Keep only the rows whose Completed value is exactly 'Yes'; every other
# status (including missing) is discarded.
is_completed = events["Completed"].eq('Yes')
full_data_events = events.loc[is_completed]

full_data_events.columns

Index(['Name of Event', 'Category', 'Date(s)', 'Location',
       'Estimated Size of Event', 'Number of People/ Capacity of Space',
       'Source for # People', 'Source Org Title', 'Article Title',
       'Mention of Police Presence (Y/N)', 'Source for police presence',
       'Official Media Presence', 'Notes_x', 'Source 1 URL', 'Source 2 URL',
       'Address', 'Venue Capacity', 'Venue Type', 'Completed',
       'Main Source for Search', 'Notes_y', 'Source', 'Latitude', 'Longitude'],
      dtype='object')

In [36]:
# Check how often police presence is recorded among the retained events.
(
    full_data_events['Mention of Police Presence (Y/N)']
    .value_counts(dropna=False)
)

NaN    1110
Y         1
Name: Mention of Police Presence (Y/N), dtype: int64

In [37]:
# Check how often official media presence is recorded among the retained events.
(
    full_data_events['Official Media Presence']
    .value_counts(dropna=False)
)

NaN    1110
Y         1
Name: Official Media Presence, dtype: int64

In [39]:
# Drop the columns that are not carried forward and shorten the label of the
# attendance/capacity column.
# Gotcha: full_data_events is reassigned from itself, so re-running this cell
# after it has executed raises KeyError (the listed columns are already gone).
columns_to_drop = [
    'Source for # People',
    'Source Org Title',
    'Article Title',
    'Mention of Police Presence (Y/N)',
    'Source for police presence',
    'Official Media Presence',
    'Notes_x',
    'Source 1 URL',
    'Source 2 URL',
    'Completed',
    'Main Source for Search',
    'Notes_y',
    'Source',
]
full_data_events = full_data_events.drop(columns=columns_to_drop)
full_data_events = full_data_events.rename(
    columns={'Number of People/ Capacity of Space': "Number People"}
)
full_data_events

Unnamed: 0,Name of Event,Category,Date(s),Location,Estimated Size of Event,Number People,Address,Venue Capacity,Venue Type,Latitude,Longitude
3,Alison Wonderland,Music,2018-08-24 00:00:00,Aragon Ballroom,,4800,"1106 W Lawrence Ave, Chicago, IL 60640",4800.0,Event Space,41.857681,-87.657392
4,"Virtual Self ""Utopia System""",Music,2018-09-07 00:00:00,Aragon Ballroom,,4800,"1106 W Lawrence Ave, Chicago, IL 60640",4800.0,Event Space,41.857681,-87.657392
6,5 Seconds of Summer / The Aces,Music,2018-09-08 00:00:00,Aragon Ballroom,,4800,"1106 W Lawrence Ave, Chicago, IL 60640",4800.0,Event Space,41.857681,-87.657392
9,SIGRID W/ HOUSES,Music,2019-08-01 00:00:00,Thalia Hall,Medium,800,"1807 S Allport St, Chicago, IL 60608",800.0,Music,41.857681,-87.657392
10,JUDAH & THE LION W/ THE BAND CAMINO,Music,2019-08-02 00:00:00,Thalia Hall,Medium,800,"1807 S Allport St, Chicago, IL 60608",800.0,Music,41.857681,-87.657392
...,...,...,...,...,...,...,...,...,...,...,...
1340,Chicago Cubs vs Pittsburgh Pirates,Sports,7/12/2019 - 7/24/2019,Wrigley Field,Large,41160,"1060 W Addison St, Chicago, IL 60613",41160.0,Sports,41.857681,-87.657392
1341,Chicago Cubs vs Cincinnati,Sports,7/15/2019 - 7/17/2019,Wrigley Field,Large,41160,"1060 W Addison St, Chicago, IL 60613",41160.0,Sports,41.857681,-87.657392
1342,Chicago Cubs vs San Diego Padres,Sports,7/19/2019 - 7/21/2019,Wrigley Field,Large,41160,"1060 W Addison St, Chicago, IL 60613",41160.0,Sports,41.857681,-87.657392
1343,14TH ANNUAL RACE TO WRIGLEY 5K CHARITY RUN PRE...,Sports,2019-04-27 00:00:00,Wrigley Field,Large,41160,"1060 W Addison St, Chicago, IL 60613",41160.0,Sports,41.857681,-87.657392


In [56]:
# now drop multi-day events (for now)
# Single-day rows carry a full timestamp such as '2018-08-24 00:00:00';
# date ranges like '7/12/2019 - 7/24/2019' do not match this shape.
# The pattern is a raw string so that \d is passed to the regex engine as
# written rather than being parsed as a (invalid) string escape.
single_day_pattern = r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d'
# na=False makes rows with a missing date evaluate to False, so the boolean
# mask never contains NaN and can always be used for indexing.
is_single_day = full_data_events["Date(s)"].str.contains(single_day_pattern, na=False)
single_day_events = full_data_events[is_single_day]
single_day_events

Unnamed: 0,Name of Event,Category,Date(s),Location,Estimated Size of Event,Number People,Address,Venue Capacity,Venue Type,Latitude,Longitude
3,Alison Wonderland,Music,2018-08-24 00:00:00,Aragon Ballroom,,4800,"1106 W Lawrence Ave, Chicago, IL 60640",4800.0,Event Space,41.857681,-87.657392
4,"Virtual Self ""Utopia System""",Music,2018-09-07 00:00:00,Aragon Ballroom,,4800,"1106 W Lawrence Ave, Chicago, IL 60640",4800.0,Event Space,41.857681,-87.657392
6,5 Seconds of Summer / The Aces,Music,2018-09-08 00:00:00,Aragon Ballroom,,4800,"1106 W Lawrence Ave, Chicago, IL 60640",4800.0,Event Space,41.857681,-87.657392
9,SIGRID W/ HOUSES,Music,2019-08-01 00:00:00,Thalia Hall,Medium,800,"1807 S Allport St, Chicago, IL 60608",800.0,Music,41.857681,-87.657392
10,JUDAH & THE LION W/ THE BAND CAMINO,Music,2019-08-02 00:00:00,Thalia Hall,Medium,800,"1807 S Allport St, Chicago, IL 60608",800.0,Music,41.857681,-87.657392
...,...,...,...,...,...,...,...,...,...,...,...
1321,Chicago Cubs vs Los Angeles Dodgers,Sports,2019-03-20 00:00:00,Wrigley Field,Large,41160,"1060 W Addison St, Chicago, IL 60613",41160.0,Sports,41.857681,-87.657392
1322,Chicago Cubs vs San Francisco Giants,Sports,2019-03-21 00:00:00,Wrigley Field,Large,41160,"1060 W Addison St, Chicago, IL 60613",41160.0,Sports,41.857681,-87.657392
1334,Chicago Cubs vs Los Angeles Angels,Sports,2019-06-03 00:00:00,Wrigley Field,Large,41160,"1060 W Addison St, Chicago, IL 60613",41160.0,Sports,41.857681,-87.657392
1343,14TH ANNUAL RACE TO WRIGLEY 5K CHARITY RUN PRE...,Sports,2019-04-27 00:00:00,Wrigley Field,Large,41160,"1060 W Addison St, Chicago, IL 60613",41160.0,Sports,41.857681,-87.657392


In [None]:
# which venues had multi-day events?