Zach Tretter

June 2020

--------

In [1]:
import pandas as pd
import os

# Step 03C - Air Quality

* Air Quality Index is only reported for Flathead County

* Glacier County does not have its own AQI

* 20 CSVs were downloaded from [epa.gov](https://www.epa.gov/outdoor-air-quality-data/air-quality-index-daily-values-report) and appended to a single dataframe
    * This was more efficient than automating a webscrape given the small number of files

![epa_air_quality_source.PNG](attachment:epa_air_quality_source.PNG)

#### Read in Air Quality Dataframe

In [2]:
# Directory holding the AQI CSV exports downloaded from epa.gov.
aqi_csv_file_path = '../data/raw_source_data_CSVs/AQI_CSVs/'

# Read only the CSV files, in sorted name order, so the load is deterministic
# and stray directory entries (e.g. .ipynb_checkpoints) are never handed to
# read_csv.
aqi_frames = [
    pd.read_csv(os.path.join(aqi_csv_file_path, file))
    for file in sorted(os.listdir(aqi_csv_file_path))
    if file.lower().endswith('.csv')
]

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; concatenate once instead. Each file keeps its own
# row index, exactly as append did. pd.concat raises on an empty list, so fall
# back to an empty frame when the directory contains no CSVs.
df_aqi = pd.concat(aqi_frames) if aqi_frames else pd.DataFrame()

#### Rename Columns

In [3]:
# Map the report's column headers onto the short lowercase names that the
# later cells refer to.
column_renames = {
    'Date': 'date',
    'Overall AQI Value': 'aqi',
    'Ozone': 'ozone',
}
df_aqi = df_aqi.rename(columns=column_renames)

#### Convert date to datetime and set date as index

In [4]:
# Keep the raw date value in `date_aqi`, parse `date` into datetimes, then
# use it as a chronologically sorted index.
df_aqi = (
    df_aqi
    .assign(
        date_aqi=lambda frame: frame['date'],
        date=lambda frame: pd.to_datetime(frame['date']),
    )
    .set_index('date')
    .sort_index()
)

#### Drop Unnecessary Columns

In [5]:
# Site/source metadata and CO are dropped here.
# Main Pollutant, ozone, PM10 and PM25 are deliberately retained.
unused_columns = [
    'Site Name (of Overall AQI)',
    'Site ID (of Overall AQI)',
    'Source (of Overall AQI)',
    'CO',
]
df_aqi = df_aqi.drop(columns=unused_columns)

#### Identify Non-Numeric Values in Pollutants

In [6]:
# Flag rows where any of the three pollutant columns holds the "."
# placeholder, then report how many such rows there are.
has_placeholder = (
    df_aqi['ozone'].eq(".")
    | df_aqi['PM10'].eq(".")
    | df_aqi['PM25'].eq(".")
)
df_aqi.loc[has_placeholder].shape

(3242, 6)

#### Interpolate Missing Values

In [7]:
# "." entries in the pollutant columns are treated as missing readings:
# blank them out, interpolate along the DatetimeIndex, then restore integers.
for pollutant in ['ozone', 'PM10', 'PM25']:
    readings = df_aqi[pollutant].mask(df_aqi[pollutant] == ".").astype(float)
    # Round before the integer cast: astype(int) on its own truncates toward
    # zero (e.g. 10.67 -> 10), which biases every interpolated value low.
    df_aqi[pollutant] = (
        readings.interpolate(method='time').round().astype(int)
    )

#### Confirm No Non-Numeric Values Remain

In [8]:
# Print the per-column dtypes and non-null counts; after the interpolation
# step the pollutant columns should be integer-typed with no nulls.
df_aqi.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7305 entries, 2000-01-01 to 2019-12-31
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   aqi             7305 non-null   int64 
 1   Main Pollutant  7305 non-null   object
 2   ozone           7305 non-null   int32 
 3   PM10            7305 non-null   int32 
 4   PM25            7305 non-null   int32 
 5   date_aqi        7305 non-null   object
dtypes: int32(3), int64(1), object(2)
memory usage: 313.9+ KB


## Export Air Quality Data

In [9]:
# Write the cleaned frame to disk; the index (the parsed `date`) is written
# as the first column of the CSV.
df_aqi.to_csv(
    '../data/03c_air_quality_clean.csv',
    index=True,
)