## In this notebook, we will use the Selenium webdriver to automatically download air pollution datasets for a selected pollutant and US state, for all available years.

In [1]:
import selenium as selenium
print(selenium.__version__)
from selenium import webdriver
# from selenium.webdriver.chrome.service import Service as ChromeService
# from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.common.keys import Keys
import subprocess
import json
import time as time
import os
import shutil

4.8.0


#### Automate web browser operations using the Selenium webdriver

In [2]:
# Start a Microsoft Edge browser session controlled by Selenium.
# The driver location comes from webdriver-manager's install() call.
# For selenium 4, refer to the link to launch the webdriver: https://pypi.org/project/webdriver-manager/
# Chrome alternative (not used): webdriver.Chrome('../chromedriver_linux64/chromedriver')
edge_service = EdgeService(EdgeChromiumDriverManager().install())
driver = webdriver.Edge(service=edge_service)

In [3]:
# Open the EPA daily-data download page that the following cells interact with.
epa_daily_data_url = 'https://www.epa.gov/outdoor-air-quality-data/download-daily-data'
driver.get(epa_daily_data_url)

In [4]:
# Print the pollutant names stored in the JSON file. The `with` block closes
# the file handle as soon as the contents have been read.
with open('polltant_names.json') as f:
    print(json.load(f))

['CO', 'Pb', 'NO2', 'Ozone', 'PM10', 'PM2.5', 'SO2']


In [5]:
# Print the state names stored in the JSON file. The `with` block closes
# the file handle as soon as the contents have been read.
with open('state_names.json') as f:
    print(json.load(f))

['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Puerto Rico']


In [6]:
# Pollutant and state whose data will be downloaded by the cells below.
selected_pollutant = 'NO2'
selected_state = 'California'

# Working directory of the kernel; the DataAQ folder is looked up beneath it.
current = os.getcwd()

In [11]:
# Find out which years are already saved for the selected pollutant and state.
# Result: `yr`, a sorted list of year strings (empty when nothing is saved yet).
data_dir = os.path.join(current, 'DataAQ', selected_pollutant + '-' + selected_state)
file_prefix = selected_pollutant + '-' + selected_state + '-'

if os.path.isdir(data_dir):
    print('directory already exists for selected pollutant and state')
    print('checking available data (.csv files)\n')
    # Saved files are named '<pollutant>-<state>-<year>'. Only entries of that
    # form are kept, so stray items in the folder (hidden files, checkpoint
    # directories, ...) are not mistaken for downloaded years.
    yr = sorted(
        entry[len(file_prefix):]
        for entry in os.listdir(data_dir)
        if entry.startswith(file_prefix) and entry[len(file_prefix):].isdigit()
    )
    print('data exists for ',yr,'\n')
    print('Run the following cell to download remaining data (if any) for the selected pollutant and state')
else:
    yr = []


directory already exists for selected pollutant and state
checking available data (.csv files)

data exists for  ['1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022'] 

Run the following cell to download remaining data (if any) for the selected pollutant and state


#### Automatically download and organize the CSV formatted data for the selected pollutant and state, for all available years
The data is saved to the `DataAQ/<pollutant>-<state>/` directory, one file per year.

In [12]:
# This cell will automatically download the CSV formatted data for the selected pollutant and state, for all available years

# Where the browser drops the exported file, and where the organised copies go.
# NOTE(review): assumes the browser's download folder is ../../Downloads relative
# to the notebook's working directory - adjust if your setup differs.
download_path = '../../Downloads/ad_viz_plotval_data.csv'
target_dir = f'DataAQ/{selected_pollutant}-{selected_state}'

query_wait = 20         # seconds to wait after submitting the form before looking for the CSV link
download_timeout = 120  # maximum seconds to wait for the exported file to appear

driver.refresh()

os.makedirs(target_dir, exist_ok=True)

time.sleep(2)

# Select the requested pollutant by matching its visible text in the drop-down.
# Raises ValueError if the page does not offer `selected_pollutant`.
select = Select(driver.find_element(By.NAME,'poll'))
pollutant_names = [option.text for option in select.options]
p_index = pollutant_names.index(selected_pollutant)
select.select_by_index(p_index)
pollutant = select.first_selected_option.text
time.sleep(2)  # give the page time to react to the selection

# Read the year labels immediately, so later page updates cannot invalidate them.
select = Select(driver.find_element(By.NAME,'year'))
year_names = [option.text for option in select.options]

print(f'Data available from {year_names[2]} to {year_names[-1]}')

# Years found on disk by the previous cell. Skipping by label (instead of by
# count) stays correct when the saved years have gaps or the folder holds
# unrelated entries.
already_downloaded = set(yr)

# Iteration starts at index 2; the first two drop-down entries are presumably
# not downloadable years (placeholder / incomplete year) - TODO confirm.
for y in range(2, len(year_names)):

    if year_names[y] in already_downloaded:
        continue

    select = Select(driver.find_element(By.NAME,'year'))
    select.select_by_index(y)
    year = select.first_selected_option.text
    time.sleep(2)

    # The state list is re-read for every year.
    select = Select(driver.find_element(By.NAME,'state'))
    state_names = [option.text for option in select.options]

    if selected_state not in state_names:
        # Report and move on, so one missing year does not abort the rest.
        print(f'{selected_state} is not listed for {pollutant} in {year}; skipping')
        continue

    s_index = state_names.index(selected_state)
    select.select_by_index(s_index)
    state = select.first_selected_option.text
    time.sleep(2)

    # Remove an export left over from an earlier run; otherwise that old file
    # would be the one moved and labelled as this year's data.
    if os.path.exists(download_path):
        os.remove(download_path)

    driver.find_element(By.XPATH,'//*[@id="launch"]/input').click()
    # A fixed pause is kept here on purpose: a CSV link from the previous query
    # may still be on the page, so waiting for "any CSV link" could click it.
    time.sleep(query_wait)
    driver.find_element(By.PARTIAL_LINK_TEXT,'CSV').click()

    # Wait until the exported file shows up instead of sleeping a fixed time.
    # NOTE(review): relies on the browser writing to a temporary name until the
    # download is complete (Chromium behaviour) - confirm if another browser is used.
    deadline = time.time() + download_timeout
    while not os.path.exists(download_path):
        if time.time() > deadline:
            raise TimeoutError(f'{download_path} did not appear within {download_timeout} seconds')
        time.sleep(1)
    time.sleep(1)  # short settle time before moving the file

    name = f'{pollutant}-{state}-{year}'
    shutil.move(download_path, f'{target_dir}/{name}')
    print(f'{pollutant}-{year}-{state}')

Data available from 2022 to 1980
NO2-1993-California
NO2-1992-California
NO2-1991-California
NO2-1990-California
NO2-1989-California
NO2-1988-California
NO2-1987-California
NO2-1986-California
NO2-1985-California
NO2-1984-California
NO2-1983-California
NO2-1982-California
NO2-1981-California
NO2-1980-California


#### In the next notebook, we will:
1. Perform data cleaning operations on selected datasets
2. Visualize the overall time series data
3. Use seasonal decomposition methods to extract the trend and seasonality from the data