# Data Acquisition, Cleaning, and Integration
### Project: Mapping and Modeling Drug-Related Deaths in Cook County

This notebook demonstrates interim progress on data acquisition, cleaning, and integration. There is secondary notebook that will conduct EDA.  
It documents the steps taken so far to prepare datasets for spatial and demographic analysis, as outlined in our Project Plan.


In [11]:
# imports

import pandas as pd
import requests
import hashlib
import os

from pathlib import Path


In [None]:
# Move working directory to project root
project_root = Path(".").resolve().parent 
os.chdir(project_root)

print("Current working directory:", os.getcwd())

Current working directory: /Users/aaliyahali/Desktop/is477/IS477-FinalProject


In [15]:
# scripts/acquire_data.py

df_me = pd.read_csv("data/raw/Medical_Examiner_Case_Archive_20251104.csv", low_memory=False)

In [5]:
os.makedirs("data/raw", exist_ok=True)

# ACS 5-Year 2023 (data collected 2019–2023)
url = "https://api.census.gov/data/2023/acs/acs5"

variables = [
    # Population + race
    "B01003_001E",  # total population
    "B03002_003E",  # white (non-Hispanic)
    "B03002_004E",  # black
    "B03002_006E",  # asian
    "B03002_012E",  # hispanic

    # Income & poverty
    "B19013_001E",  # median income
    "B17001_002E",  # below poverty

    # Education
    "B15003_022E",  # bachelor's degree or higher

    # Age structure (male + female counts)
    "B01001_003E", "B01001_027E",  # under 18
    "B01001_007E", "B01001_031E",  # 18–34
    "B01001_010E", "B01001_034E",  # 35–64
    "B01001_020E", "B01001_044E",  # 65+

    # New: Unemployment, Health Insurance, Rent
    "B23025_002E",  # labor force
    "B23025_005E",  # unemployed
    "B27010_017E",  # no health insurance (all people)
    "B25064_001E"   # median gross rent
]

params = {
    "get": ",".join(["NAME"] + variables),
    "for": "tract:*",
    "in": "county:031 state:17"  # Cook County, IL
}

r = requests.get(url, params=params)
r.raise_for_status()

data = r.json()
cols = data[0]
rows = data[1:]
df_census = pd.DataFrame(rows, columns=cols)

# Rename columns for clarity
df_census = df_census.rename(columns={
    "B01003_001E": "TotalPop",
    "B03002_003E": "White_NonHisp",
    "B03002_004E": "Black",
    "B03002_006E": "Asian",
    "B03002_012E": "Hispanic",
    "B19013_001E": "MedianIncome",
    "B17001_002E": "BelowPoverty",
    "B15003_022E": "BachelorsOrHigher",
    "B01001_003E": "Male_Under18",
    "B01001_027E": "Female_Under18",
    "B01001_007E": "Male_18_34",
    "B01001_031E": "Female_18_34",
    "B01001_010E": "Male_35_64",
    "B01001_034E": "Female_35_64",
    "B01001_020E": "Male_65plus",
    "B01001_044E": "Female_65plus",
    "B23025_002E": "LaborForce",
    "B23025_005E": "Unemployed",
    "B27010_017E": "NoHealthInsurance",
    "B25064_001E": "MedianRent",
    "state": "STATE",
    "county": "COUNTY",
    "tract": "TRACT"
})

# Unique tract ID for merging
df_census["TRACT_FIPS"] = df_census["STATE"] + df_census["COUNTY"] + df_census["TRACT"].str.zfill(6)

df_census.to_csv("data/raw/census_tract_data.csv", index=False)
print(f"Saved enriched Census data for {len(df_census)} tracts.")


Saved enriched Census data for 1332 tracts.


In [6]:
df_census.head()

Unnamed: 0,NAME,TotalPop,White_NonHisp,Black,Asian,Hispanic,MedianIncome,BelowPoverty,BachelorsOrHigher,Male_Under18,...,Male_65plus,Female_65plus,LaborForce,Unemployed,NoHealthInsurance,MedianRent,STATE,COUNTY,TRACT,TRACT_FIPS
0,Census Tract 101; Cook County; Illinois,3726,1297,1376,137,809,69460,508,775,88,...,108,56,2579,260,130,1252,17,31,10100,17031010100
1,Census Tract 102.01; Cook County; Illinois,7588,1406,2301,376,2622,49639,1892,684,455,...,143,119,4007,317,40,1333,17,31,10201,17031010201
2,Census Tract 102.02; Cook County; Illinois,2609,967,949,212,423,55119,524,656,9,...,24,31,1449,135,36,1292,17,31,10202,17031010202
3,Census Tract 103; Cook County; Illinois,6311,3094,1298,342,1426,65871,431,1585,148,...,26,84,3923,166,0,1257,17,31,10300,17031010300
4,Census Tract 104; Cook County; Illinois,4282,3173,296,324,340,49017,463,999,25,...,83,18,2282,78,16,1248,17,31,10400,17031010400


In [7]:
df_me.head()

NameError: name 'df_me' is not defined

### Exploratory Data Analysis 

In [None]:
len(df_me)

92642

In [None]:
df_me.dtypes

Case Number                object
Date of Incident           object
Date of Death              object
Age                       float64
Gender                     object
Race                       object
Latino                       bool
Manner of Death            object
Primary Cause              object
Primary Cause Line A       object
Primary Cause Line B       object
Primary Cause Line C       object
Secondary Cause            object
Gun Related                object
Opioid Related             object
Cold Related                 bool
Heat Related                 bool
Commissioner District     float64
Incident Address           object
Incident City              object
Incident Zip Code          object
longitude                 float64
latitude                  float64
location                   object
Residence City             object
Residence Zip              object
OBJECTID                    int64
Chicago Ward              float64
Chicago Community Area     object
COVID Related 

In [None]:
df_me.columns

Index(['Case Number', 'Date of Incident', 'Date of Death', 'Age', 'Gender',
       'Race', 'Latino', 'Manner of Death', 'Primary Cause',
       'Primary Cause Line A', 'Primary Cause Line B', 'Primary Cause Line C',
       'Secondary Cause', 'Gun Related', 'Opioid Related', 'Cold Related',
       'Heat Related', 'Commissioner District', 'Incident Address',
       'Incident City', 'Incident Zip Code', 'longitude', 'latitude',
       'location', 'Residence City', 'Residence Zip', 'OBJECTID',
       'Chicago Ward', 'Chicago Community Area', 'COVID Related'],
      dtype='object')

## Cleaning + Pre-Processing

### Census Data

In [None]:
df_census.isna().sum()
# no missing values

NAME                 0
TotalPop             0
White_NonHisp        0
Black                0
Asian                0
Hispanic             0
MedianIncome         0
BelowPoverty         0
BachelorsOrHigher    0
Male_Under18         0
Female_Under18       0
Male_18_34           0
Female_18_34         0
Male_35_64           0
Female_35_64         0
Male_65plus          0
Female_65plus        0
LaborForce           0
Unemployed           0
NoHealthInsurance    0
MedianRent           0
STATE                0
COUNTY               0
TRACT                0
TRACT_FIPS           0
dtype: int64

In [None]:
df_census.head()

Unnamed: 0,NAME,TotalPop,White_NonHisp,Black,Asian,Hispanic,MedianIncome,BelowPoverty,BachelorsOrHigher,Male_Under18,...,Male_65plus,Female_65plus,LaborForce,Unemployed,NoHealthInsurance,MedianRent,STATE,COUNTY,TRACT,TRACT_FIPS
0,Census Tract 101; Cook County; Illinois,3726,1297,1376,137,809,69460,508,775,88,...,108,56,2579,260,130,1252,17,31,10100,17031010100
1,Census Tract 102.01; Cook County; Illinois,7588,1406,2301,376,2622,49639,1892,684,455,...,143,119,4007,317,40,1333,17,31,10201,17031010201
2,Census Tract 102.02; Cook County; Illinois,2609,967,949,212,423,55119,524,656,9,...,24,31,1449,135,36,1292,17,31,10202,17031010202
3,Census Tract 103; Cook County; Illinois,6311,3094,1298,342,1426,65871,431,1585,148,...,26,84,3923,166,0,1257,17,31,10300,17031010300
4,Census Tract 104; Cook County; Illinois,4282,3173,296,324,340,49017,463,999,25,...,83,18,2282,78,16,1248,17,31,10400,17031010400


In [None]:
df_census.dtypes

NAME                 object
TotalPop             object
White_NonHisp        object
Black                object
Asian                object
Hispanic             object
MedianIncome         object
BelowPoverty         object
BachelorsOrHigher    object
Male_Under18         object
Female_Under18       object
Male_18_34           object
Female_18_34         object
Male_35_64           object
Female_35_64         object
Male_65plus          object
Female_65plus        object
LaborForce           object
Unemployed           object
NoHealthInsurance    object
MedianRent           object
STATE                object
COUNTY               object
TRACT                object
TRACT_FIPS           object
dtype: object

In [None]:
# convert string columns to numeric values for computational purposes
num_cols = [c for c in df_census.columns if c not in ["NAME", "STATE", "COUNTY", "TRACT", "TRACT_FIPS"]] 
df_census[num_cols] = df_census[num_cols].apply(pd.to_numeric, errors="coerce")

In [8]:
df_census.dtypes

NAME                 object
TotalPop             object
White_NonHisp        object
Black                object
Asian                object
Hispanic             object
MedianIncome         object
BelowPoverty         object
BachelorsOrHigher    object
Male_Under18         object
Female_Under18       object
Male_18_34           object
Female_18_34         object
Male_35_64           object
Female_35_64         object
Male_65plus          object
Female_65plus        object
LaborForce           object
Unemployed           object
NoHealthInsurance    object
MedianRent           object
STATE                object
COUNTY               object
TRACT                object
TRACT_FIPS           object
dtype: object

In [9]:
# age group totals for analysis 
df_census["Age_Under18"] = df_census["Male_Under18"] + df_census["Female_Under18"]
df_census["Age_18_34"] = df_census["Male_18_34"] + df_census["Female_18_34"]
df_census["Age_35_64"] = df_census["Male_35_64"] + df_census["Female_35_64"]
df_census["Age_65plus"] = df_census["Male_65plus"] + df_census["Female_65plus"]

In [10]:
# demographic porportions 
df_census["Pct_Black"] = df_census["Black"] / df_census["TotalPop"] * 100
df_census["Pct_Hispanic"] = df_census["Hispanic"] / df_census["TotalPop"] * 100
df_census["Pct_Asian"] = df_census["Asian"] / df_census["TotalPop"] * 100
df_census["Pct_WhiteNonHisp"] = df_census["White_NonHisp"] / df_census["TotalPop"] * 100

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [None]:
# education proportions
df_census["Pct_BelowPoverty"] = df_census["BelowPoverty"] / df_census["TotalPop"] * 100
df_census["Pct_BachelorsPlus"] = df_census["BachelorsOrHigher"] / df_census["TotalPop"] * 100

#unemployment + insurance proportions
df_census["Pct_Unemployed"] = (df_census["Unemployed"] / df_census["LaborForce"]) * 100
df_census["Pct_Uninsured"] = (df_census["NoHealthInsurance"] / df_census["TotalPop"]) * 100

In [None]:
df_census.to_csv("data/processed/census_data_cleaned.csv", index=False)

### ME Data

In [13]:
df_me.head()

NameError: name 'df_me' is not defined

In [12]:
# AFTER CLEANING, save ME data to this!!!


# df_me = pd.read_csv('data/raw/cook_county_medical_examiner.csv')
# df_census = pd.read_csv('data/raw/census_tract_data.csv')

# # Save processed copy and compute checksum
# df_me.to_csv('data/processed/me_clean.csv', index=False)
# with open('data/processed/me_clean.csv', 'rb') as f:
#     checksum = hashlib.sha256(f.read()).hexdigest()
# print("Checksum:", checksum)