In [2]:
import pandas as pd
from pathlib import Path

### Description of the data:

This dataset contains the daily air quality measurements for the United States for 2019 and 2020 in EPA’s Air Quality System (AQS, https://www.epa.gov/aqs) database for which both PM2.5 and ozone were measured concurrently. These PM2.5 and ozone concentration data are joined with locational, meteorological, and demographic information, and with concentrations of other major air pollutants when available. All of the data were downloaded from AQS, with the exception of four demographic parameters (people of color, low income, linguistically isolated, and less than high school education), which come from EPA’s EJSCREEN tool (https://www.epa.gov/ejscreen). These demographic parameters are at the Census "block group" level (an area defined by the Census Bureau that usually has between 600 and 3,000 people) and are listed in fractional units for the block group containing the monitor location.

**Data Source**
https://www.kaggle.com/competitions/phase-ii-widsdatathon2022/data

In [3]:
# Directory that is searched for the EPA Excel workbooks (*.xlsx). The path is
# relative, so it resolves against the kernel's current working directory.
data_dir = Path(r'../DATA')

In [4]:
# Map each data year to an opened pd.ExcelFile for that year's workbook.
# The key is the last '_'-separated token of the file stem, kept as a string
# (e.g. '..._2019.xlsx' -> '2019').
epa_files = {}

# Path.glob does not guarantee any ordering; sorting keeps the printed numbering
# and the dict's insertion order identical on every run and every filesystem.
xlsx_paths = sorted(data_dir.glob('*.xlsx'))
if not xlsx_paths:
    # Fail here with a clear message instead of later on an empty collection.
    raise FileNotFoundError(f'no .xlsx files found in {data_dir.resolve()}')

for n, epa_file in enumerate(xlsx_paths, start=1):
    print(f'{n}) {epa_file.name}')
    yr = epa_file.stem.split('_')[-1]
    epa_files[yr] = pd.ExcelFile(epa_file)

1) Datathon_EPA_Air_Quality_Demographics_Meteorology_2019.xlsx
2) Datathon_EPA_Air_Quality_Demographics_Meteorology_2020.xlsx


In [5]:
%%time
# Read each year's workbook into a DataFrame, tag every row with its year key,
# report the per-year shape, then stack all years into one combined frame.
epa_yr_dfs = []
for year, xl_file in epa_files.items():
    df = pd.read_excel(xl_file, sheet_name="Sheet1")
    # `year` is the epa_files dict key, so the column holds that key unchanged.
    df['year'] = year
    n_rows, n_cols = df.shape
    print(f'{year} data loaded: {n_rows} rows, {n_cols} cols')
    epa_yr_dfs.append(df)

# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it
# every per-year frame keeps its own index, so the same labels repeat across
# years and label-based lookups on epa_df would match several rows.
epa_df = pd.concat(epa_yr_dfs, ignore_index=True)
n_rows, n_cols = epa_df.shape
print(f'combined data loaded: {n_rows} rows, {n_cols} cols')

2019 data loaded: 129470 rows, 23 cols
2020 data loaded: 133418 rows, 23 cols
combined data loaded: 262888 rows, 23 cols
CPU times: total: 2min 22s
Wall time: 2min 23s


In [6]:
# List every column label of the combined frame, one per line.
for column_name in epa_df.columns.tolist():
    print(column_name)

AQS_ID
LATITUDE
LONGITUDE
COUNTY
STATE
CBSA
PEOPLE_OF_COLOR_FRACTION
LOW_INCOME_FRACTION
LINGUISTICALLY_ISOLATED_FRACTION
LESS_THAN_HS_ED_FRACTION
DATE
TEMPERATURE_CELSIUS
RELATIVE_HUMIDITY
WIND_SPEED_METERS_PER_SECOND
WIND_DIRECTION
PM25_UG_PER_CUBIC_METER
OZONE_PPM
NO2_PPB
CO_PPM
SO2_PPB
LEAD_UG_PER_CUBIC_METER
BENZENE_PPBC
year


In [7]:
# Number of rows per state, most frequent first (value_counts default order).
state_counts = epa_df['STATE'].value_counts()
print(state_counts)

California              49076
Pennsylvania            19940
Utah                    11388
Texas                    8968
Florida                  8531
Nevada                   7783
Arizona                  7160
Wisconsin                6861
Massachusetts            6765
Indiana                  6545
North Dakota             6451
Kentucky                 6444
Oklahoma                 6008
New Jersey               5753
Minnesota                5539
North Carolina           5421
Illinois                 5377
Maryland                 4915
Michigan                 4738
Wyoming                  4610
New York                 4534
New Mexico               4406
Montana                  4371
Ohio                     4329
Kansas                   4220
Colorado                 4138
New Hampshire            3788
South Dakota             3771
Mississippi              3538
Virginia                 3179
Georgia                  3034
Missouri                 3013
Connecticut              2908
Delaware  

In [8]:
# Keep only the rows whose STATE value is exactly 'California'.
california_mask = epa_df['STATE'].eq('California')
ca_epa_df = epa_df[california_mask]

In [9]:
# Number of California rows per county, most frequent first.
county_counts = ca_epa_df['COUNTY'].value_counts()
print(county_counts)

Riverside          4192
Ventura            3528
Los Angeles        2841
Alameda            2803
Santa Barbara      2754
Fresno             2642
Monterey           2120
San Diego          1900
Sacramento         1831
Santa Clara        1426
Contra Costa       1402
San Luis Obispo    1363
Stanislaus         1362
Kern               1331
San Bernardino     1118
Orange              948
Imperial            897
Inyo                798
San Francisco       716
Solano              711
San Benito          705
Santa Cruz          703
Siskiyou            701
Marin               699
Merced              695
Kings               695
Sutter              690
San Mateo           687
Placer              687
Napa                681
San Joaquin         680
Tehama              677
Calaveras           672
Butte               663
Nevada              644
Madera              620
Sonoma              600
Tulare              232
Humboldt            201
Colusa              132
Shasta              120
Lake            