In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/processed/cleaned_complaints_deduplicated.csv')

In [12]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,SR Number,Date/Time Opened,Service Request Address,Service Request Status,Answer,street_address,zipcode,apt_num,datetime_opened,year,month_year,date,hour,city,state
0,0,0,SR18-00198455,12/19/18 02:53 PM,,Completed,No Cause,,,,2018-12-19 14:53:00,2018,12/2018,2018-12-19,14,Chicago,Illinois
1,1,1,SR19-01043676,02/20/19 01:40 PM,1135 N Harlem AVE<br> 60302,Completed,No Cause,1135 N Harlem AVE,60302.0,,2019-02-20 13:40:00,2019,02/2019,2019-02-20,13,Chicago,Illinois
2,2,2,SR19-01047333,02/21/19 08:56 AM,18231 S Sayre AVE<br> 60477,Completed,No Cause,18231 S Sayre AVE,60477.0,,2019-02-21 08:56:00,2019,02/2019,2019-02-21,8,Chicago,Illinois
3,3,3,SR19-01050631,02/21/19 03:29 PM,6726 N SHERIDAN RD<br> 60626,Completed,Processed for Hearing - Standard,6726 N SHERIDAN RD,60626.0,,2019-02-21 15:29:00,2019,02/2019,2019-02-21,15,Chicago,Illinois
4,4,4,SR19-01058995,02/23/19 11:35 AM,2746 N 74th AVE<br> 60707,Completed,No Cause,2746 N 74th AVE,60707.0,,2019-02-23 11:35:00,2019,02/2019,2019-02-23,11,Chicago,Illinois


## Identify repeat addresses

Addresses grouped by the total number of complaints

In [3]:
# group by address
address_grouped = df.groupby('street_address').size().reset_index(name='complaints').sort_values('complaints', ascending=False)

# there are 2938 unique addresses with more than one complaint
address_grouped[address_grouped['complaints'] > 1]

Unnamed: 0,street_address,complaints
8722,860 N DEWITT PL,52
823,1343 N CLEVELAND AVE,46
1801,2 E 8TH ST,46
3934,4045 N LAPORTE AVE,34
3867,4000 W DIVERSEY AVE,31
...,...,...
7446,7270 S SOUTH SHORE DR,2
7447,728 N SPAULDING AVE,2
8473,8200 S ESCANABA AVE,2
8489,821 E 82ND ST,2


In [4]:
repeat_addresses_by_complaint_count = address_grouped[address_grouped['complaints'] > 1].copy()

In [5]:
# export repeat address count
repeat_addresses_by_complaint_count.to_csv('../output/repeat_addresses_by_complaint_count.csv')

In [29]:
# export address count
address_grouped.to_csv('../output/addresses_by_complaint_count.csv')

In [6]:
# pivot complaint county by year and address
addresses_by_complaints_and_year = pd.pivot_table(
    df,
    values='SR Number',
    index='street_address',
    columns='year',
    aggfunc='count').reset_index()

addresses_by_complaints_and_year

year,street_address,2019,2020,2021,2022,2023,2024
0,1 E 114TH ST,,1.0,,,,
1,1 E 8TH ST,,,,1.0,,
2,1 N WACKER DR,,,1.0,,,
3,1 W SUPERIOR ST,,,,1.0,1.0,
4,10 E ONTARIO ST,,,,1.0,,
...,...,...,...,...,...,...,...
9079,9933 S VAN VLISSINGEN RD,,,1.0,,,
9080,9937 S AVENUE J,,1.0,,,,
9081,9947 S YALE AVE,,,1.0,,,
9082,9951 S OGLESBY AVE,,,,1.0,,


In [7]:
# replace NaN with zero so I can total
addresses_by_complaints_and_year.fillna(0,inplace=True)

# add a total column to pivot table
addresses_by_complaints_and_year['total'] = addresses_by_complaints_and_year[2019] + addresses_by_complaints_and_year[2020] + addresses_by_complaints_and_year[2021] + addresses_by_complaints_and_year[2022] + addresses_by_complaints_and_year[2023] + addresses_by_complaints_and_year[2024]

In [8]:
# export into output
addresses_by_complaints_and_year.to_csv('../output/addresses_by_complaints_and_year.csv')

## Count the number of unique days of complaints by address

In [16]:
# group by unique days
address_unique_days = df.groupby('street_address')['date'].nunique().reset_index(name='count of unique days').sort_values('count of unique days', ascending=False)
address_unique_days.head()

Unnamed: 0,street_address,count of unique days
823,1343 N CLEVELAND AVE,36
3934,4045 N LAPORTE AVE,31
8722,860 N DEWITT PL,28
8885,907 W ARGYLE ST,26
2608,2703 N CLARK ST,26


In [19]:
# export into output
address_unique_days.to_csv('../output/addresses_by_unique_days.csv')

## Count complaints during cold snap

Defining January 2024's cold snap as Jan. 14-16 based on weather records when the average temp was near or below freezing in Fahrenheit

In [28]:
# filter df for just those three days
mask = (df['date'] == '2024-01-14') | (df['date'] == '2024-01-15') | (df['date'] == '2024-01-16')
len(df[mask])

681

In [40]:
coldsnap_address_counts = df[mask].groupby('street_address').size().reset_index(name='complaints_during_coldsnap').sort_values('complaints_during_coldsnap', ascending=False)
coldsnap_address_counts.head()

Unnamed: 0,street_address,complaints_during_coldsnap
475,860 N DEWITT PL,13
120,235 W VAN BUREN ST,10
348,6040 S HARPER AVE,9
98,2 E 8TH ST,9
145,2801 S DR MARTIN LUTHER KING JR DR,9


In [41]:
# export to output
coldsnap_address_counts.to_csv('../output/coldsnap_address_counts.csv')

## Prep a list of addresses and relevant info for reporting

In [46]:
# merge address grouping analysis into one file 
merge1 = addresses_by_complaints_and_year.merge(coldsnap_address_counts, on='street_address', how='outer')
addresses_for_reporting = merge1.merge(address_unique_days, on='street_address', how='outer')

In [49]:
# export 
addresses_for_reporting.to_csv('../output/addresses_for_reporting.csv')

## Number of complaints over time

In [9]:
complaints_by_month = df.groupby('month_year').size().reset_index()
complaints_by_month

Unnamed: 0,month_year,0
0,01/2020,424
1,01/2021,456
2,01/2022,1029
3,01/2023,498
4,01/2024,1319
5,02/2019,51
6,02/2020,315
7,02/2021,984
8,02/2022,405
9,02/2023,305


In [10]:
# export as complaints by month
complaints_by_month.to_csv('../output/complaints_by_month.csv')

In [None]:
# TODO - missing 17% of complaint zipcodes though

In [11]:
complaints_by_zip = df.groupby('zipcode').size().reset_index(name='complaints').sort_values('complaints', ascending=False)
complaints_by_zip

Unnamed: 0,zipcode,complaints
49,60649.0,1025
22,60619.0,842
38,60637.0,673
23,60620.0,576
45,60644.0,522
...,...,...
7,60604.0,3
1,60153.0,1
3,60477.0,1
2,60302.0,1


In [None]:
## Number of complaints 