# Part 1: Gathering Data


In [67]:
from pathlib import Path
import time

import requests
import pandas as pd

def fetch_and_cache(data_url, file, data_dir="data", force=False):
    """
    Download and cache a url and return the file object.

    data_url: the web address to download
    file: the file in which to save the results.
    data_dir: (default="data") the location to save the data
    force: if true the file is always re-downloaded

    return: The pathlib.Path object representing the file.
    """

    data_dir = Path(data_dir)
    data_dir.mkdir(exist_ok = True)
    file_path = data_dir / Path(file)
    # If the file already exists and we want to force a download then
    # delete the file first so that the creation date is correct.
    if force and file_path.exists():
        file_path.unlink()
    if force or not file_path.exists():
        print('Downloading...', end=' ')
        resp = requests.get(data_url)
        with file_path.open('wb') as f:
            f.write(resp.content)
        print('Done!')
        last_modified_time = time.ctime(file_path.stat().st_mtime)
    else:
        last_modified_time = time.ctime(file_path.stat().st_mtime)
        print("Using cached version that was downloaded (UTC):", last_modified_time)
    return file_path

def fetch_and_cache_gdrive(gdrive_id, file, data_dir="data", force=False):
    """
    Download and cache a url and return the file object.

    data_url: the web address to download
    file: the file in which to save the results.
    data_dir: (default="data") the location to save the data
    force: if true the file is always re-downloaded

    return: The pathlib.Path object representing the file.
    """

    data_dir = Path(data_dir)
    data_dir.mkdir(exist_ok = True)
    file_path = data_dir / Path(file)
    # If the file already exists and we want to force a download then
    # delete the file first so that the creation date is correct.
    if force and file_path.exists():
        file_path.unlink()
    if force or not file_path.exists():
        print('Downloading...', end=' ')
        download_file_from_google_drive(gdrive_id, file_path)
        print('Done!')
        last_modified_time = time.ctime(file_path.stat().st_mtime)
    else:
        last_modified_time = time.ctime(file_path.stat().st_mtime)
        print("Using cached version that was downloaded (UTC):", last_modified_time)
    return file_path



# https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)

    print(response)
    needs_confirm = needs_confirmation(response)

    if needs_confirm:
        params = { 'id' : id, 'confirm' : 't' ,}
        response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)

def needs_confirmation(response):

    return response.headers["Content-Type"] == 'text/html; charset=utf-8'

def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)


In [68]:
# download required datasets

file_dict = {
    "aqs_sites.csv": "1fMfkw-NJ03VrQxYpDjM_4T6VDaWhvegi",
    "greenhouse_gas_emmitter_facilities.csv": "1yjTRv1OrsdWk-xNW4ZbFYB7_8Tt_x_fV",
    "greenhouse_gas_emmiter_gas_types.csv":"1akEokx_wqsgYqGNnNJsgebA6DDGHJLR2",
    "2020_daily_global_weather.csv":"15pjzsCiIE0uL69a4tZYgXoog8TKjIJNJ",
    "daily_wind_2020.csv": "18MqsjkN0EYPuLb0iR0U9sZYmNVCdd94h",
    "daily_temperature_2020.csv": "1Q62JlGtc65L2eU9FfQZcP9UyMCdcem1T",
    "traffic_volumes.csv": "1sZyjZSWz1xEoB26u_OrhKokZYhTIJRMD",
    "county_aqi_2020_daily.csv": "1uNH90XRceOfb16ctuUsYXVIEDeR2yaeC"
    }

for k, v in file_dict.items():
    path = f"./datasets/{k}"
    if not os.path.exists(path):
        download_file_from_google_drive(v,path)



<Response [200]>


## Creating Dataframes

In [71]:
aqs_site_df = pd.read_csv(f"./datasets/aqs_sites.csv")
gh_gas_facil_df = pd.read_csv(f"./datasets/greenhouse_gas_emmitter_facilities.csv")
gh_gas_type_df = pd.read_csv(f"./datasets/greenhouse_gas_emmiter_gas_types.csv")
weather_2020_df = pd.read_csv(f"./datasets/2020_daily_global_weather.csv")
wind_2020_df = pd.read_csv(f"./datasets/daily_wind_2020.csv")
temp_2020_df = pd.read_csv(f"./datasets/daily_temperature_2020.csv")
traffic_volumes_df = pd.read_csv(f"./datasets/traffic_volumes.csv")
aqi_2020 = pd.read_csv(f"./datasets/county_aqi_2020_daily.csv")

  gh_gas_facil_df = pd.read_csv(f"./datasets/greenhouse_gas_emmitter_facilities.csv")


### aqs site data

In [72]:
print(aqs_site_df.columns)
display(aqs_site_df.sample(10))

Index(['State Code', 'County Code', 'Site Number', 'Latitude', 'Longitude',
       'Datum', 'Elevation', 'Land Use', 'Location Setting',
       'Site Established Date', 'Site Closed Date', 'Met Site State Code',
       'Met Site County Code', 'Met Site Site Number', 'Met Site Type',
       'Met Site Distance', 'Met Site Direction', 'GMT Offset',
       'Owning Agency', 'Local Site Name', 'Address', 'Zip Code', 'State Name',
       'County Name', 'City Name', 'CBSA Name', 'Tribe Name',
       'Extraction Date'],
      dtype='object')


Unnamed: 0,State Code,County Code,Site Number,Latitude,Longitude,Datum,Elevation,Land Use,Location Setting,Site Established Date,...,Owning Agency,Local Site Name,Address,Zip Code,State Name,County Name,City Name,CBSA Name,Tribe Name,Extraction Date
2837,11,1,17,38.903723,-77.051366,WGS84,20.0,COMMERCIAL,URBAN AND CENTER CITY,1974-12-06,...,"Department of Energy & Environment, District o...",,WEST END LIBRARY 24 & L STS. NW,,District Of Columbia,District of Columbia,Washington,"Washington-Arlington-Alexandria, DC-VA-MD-WV",,2021-05-18
7419,23,13,7,44.103008,-69.132993,WGS84,28.0,RESIDENTIAL,SUBURBAN,1978-01-01,...,"Maine D.E.P. Bureau Of Air Quality Control, Au...",,SMALL'S MEAT MKT.-PARK ST.,4841.0,Maine,Knox,Rockland,"Rockland, ME",,2021-05-18
1181,6,19,1003,37.112447,-119.313459,WGS84,1707.0,UNKNOWN,RURAL,1979-01-01,...,Fresno County APCD,,"3/4 MI E. HWY 168, SHAVER LAKE",,California,Fresno,Shaver Lake,"Fresno, CA",,2021-05-18
10087,29,189,2001,38.724215,-90.343992,WGS84,187.0,RESIDENTIAL,SUBURBAN,1972-01-01,...,St Louis County Health Department Air Pollutio...,,8811 HAROLD DRIVE,63134.0,Missouri,Saint Louis,Berkeley,"St. Louis, MO-IL",,2021-05-18
19240,54,9,6000,40.240877,-80.65033,NAD83,201.0,RESIDENTIAL,SUBURBAN,2011-01-01,...,"Shell Engineering & Assoc., MO",TRAILER SALES SITE,"STATE TOUTE 2, BOX27A, BEECH BOTTOM, WVA",26070.0,West Virginia,Brooke,Beech Bottom,"Weirton-Steubenville, WV-OH",,2021-05-18
9805,29,77,2,37.2334,-93.383529,WGS84,411.0,,,1967-01-01,...,Missouri Dept Of Natural Resources,,DOWNTOWN AIRPORT,65802.0,Missouri,Greene,Springfield,"Springfield, MO",,2021-05-18
2255,8,41,6004,38.921382,-104.813032,WGS84,1931.0,RESIDENTIAL,URBAN AND CENTER CITY,1988-01-01,...,City of Colorado Springs,OPEN FIELD NEAR RESIDENTIAL AREA AT 6000 PULPI...,6000 PULPIT ROCK DRIVE.,,Colorado,El Paso,Colorado Springs,"Colorado Springs, CO",,2021-05-18
16904,47,145,1006,35.873985,-84.574643,WGS84,244.0,AGRICULTURAL,RURAL,1973-01-01,...,Tennessee Division Of Air Pollution Control,,3.6 MI WSW OF KINGSTON STEAM PLANT,37748.0,Tennessee,Roane,Midtown,"Knoxville, TN",,2021-05-18
10156,29,510,42,38.59505,-90.286212,WGS84,163.0,RESIDENTIAL,URBAN AND CENTER CITY,1969-01-01,...,Missouri Dept Of Natural Resources,,ALLEY S OF THOLOZAN 231 FT W OF REGAL,63109.0,Missouri,St. Louis City,St. Louis,"St. Louis, MO-IL",,2021-05-18
8149,25,17,4002,0.0,0.0,NAD27,0.0,RESIDENTIAL,URBAN AND CENTER CITY,1965-01-01,...,Mass Dept Environmental Protection-Div Air Qua...,,"EAST SOMERVILLE LIBRARY, 115 BROADWAY",,Massachusetts,Middlesex,Somerville,"Boston-Cambridge-Newton, MA-NH",,2021-05-18


### gh gas facility data

In [23]:
print(gh_gas_facil_df.columns)
display(gh_gas_facil_df.sample(10))

Index(['V_GHG_EMITTER_FACILITIES.ADDRESS1',
       'V_GHG_EMITTER_FACILITIES.ADDRESS2',
       'V_GHG_EMITTER_FACILITIES.CEMS_USED', 'V_GHG_EMITTER_FACILITIES.CITY',
       'V_GHG_EMITTER_FACILITIES.COUNTY',
       'V_GHG_EMITTER_FACILITIES.COUNTY_FIPS',
       'V_GHG_EMITTER_FACILITIES.FACILITY_ID',
       'V_GHG_EMITTER_FACILITIES.LATITUDE',
       'V_GHG_EMITTER_FACILITIES.LONGITUDE',
       'V_GHG_EMITTER_FACILITIES.PRIMARY_NAICS_CODE',
       'V_GHG_EMITTER_FACILITIES.STATE', 'V_GHG_EMITTER_FACILITIES.STATE_NAME',
       'V_GHG_EMITTER_FACILITIES.YEAR', 'V_GHG_EMITTER_FACILITIES.ZIP',
       'V_GHG_EMITTER_FACILITIES.FACILITY_NAME',
       'V_GHG_EMITTER_FACILITIES.SECONDARY_NAICS_CODE',
       'V_GHG_EMITTER_FACILITIES.ADDITIONAL_NAICS_CODES',
       'V_GHG_EMITTER_FACILITIES.COGENERATION_UNIT_EMISS_IND',
       'V_GHG_EMITTER_FACILITIES.EPA_VERIFIED',
       'V_GHG_EMITTER_FACILITIES.PARENT_COMPANY',
       'V_GHG_EMITTER_FACILITIES.PLANT_CODE_INDICATOR'],
      dtype='object')


Unnamed: 0,V_GHG_EMITTER_FACILITIES.ADDRESS1,V_GHG_EMITTER_FACILITIES.ADDRESS2,V_GHG_EMITTER_FACILITIES.CEMS_USED,V_GHG_EMITTER_FACILITIES.CITY,V_GHG_EMITTER_FACILITIES.COUNTY,V_GHG_EMITTER_FACILITIES.COUNTY_FIPS,V_GHG_EMITTER_FACILITIES.FACILITY_ID,V_GHG_EMITTER_FACILITIES.LATITUDE,V_GHG_EMITTER_FACILITIES.LONGITUDE,V_GHG_EMITTER_FACILITIES.PRIMARY_NAICS_CODE,...,V_GHG_EMITTER_FACILITIES.STATE_NAME,V_GHG_EMITTER_FACILITIES.YEAR,V_GHG_EMITTER_FACILITIES.ZIP,V_GHG_EMITTER_FACILITIES.FACILITY_NAME,V_GHG_EMITTER_FACILITIES.SECONDARY_NAICS_CODE,V_GHG_EMITTER_FACILITIES.ADDITIONAL_NAICS_CODES,V_GHG_EMITTER_FACILITIES.COGENERATION_UNIT_EMISS_IND,V_GHG_EMITTER_FACILITIES.EPA_VERIFIED,V_GHG_EMITTER_FACILITIES.PARENT_COMPANY,V_GHG_EMITTER_FACILITIES.PLANT_CODE_INDICATOR
23622,216 Oakley Pebble Road,,,Owingsville,BATH COUNTY,21011.0,1002586.0,38.233848,-83.716902,486210.0,...,KENTUCKY,2014.0,40360.0,Owingsville,,,N,,TEXAS EASTERN TRANSMISSION L.P. (100%),N
33979,851 ROBISON ROAD EAST,,,ERIE,ERIE,42049.0,1007649.0,42.059457,-80.014484,562212.0,...,PENNSYLVANIA,2011.0,16509.0,LAKE VIEW LDFL,,,N,,WASTE MANAGEMENT INC. (100%),
13916,1990 TOMOKA FARMS RD,,,PORT ORANGE,VOLUSIA COUNTY,12127.0,1005328.0,29.1314,-81.0984,562212.0,...,FLORIDA,2015.0,32128.0,VOLUSIA SOLID WASTE MANAGEMENT DIVISION,,,N,,VOLUSIA COUNTY BOARD OF COUNTY COMMISSIONERS (...,N
45349,13000 Bay Park Road,,,Pasadena,HARRIS COUNTY,48201.0,1003570.0,29.64174,-95.0654,325188.0,...,TEXAS,2010.0,77507.0,Albemarle Corporation Bayport Plant,,,N,,ALBEMARLE CORPORATION (100%),
1152,11203 South River Road,,,Taylor,TAYLOR COUNTY,55119.0,1012924.0,45.14727,-90.46916,212322.0,...,WISCONSIN,2018.0,54659.0,Hi-Crush Blair LLC,,,N,,HI-CRUSH PARTNERS LP (100%),N
46901,,,,Roaring Springs,DICKENS COUNTY,48125.0,1013208.0,33.7783,-100.8766,221121.0,...,TEXAS,2019.0,79220.0,Cottonwood Substation,,,N,,WETT HOLDINGS LLC (100%),N
48536,,,,Barstow,WARD COUNTY,48475.0,1005215.0,31.5225,-103.465278,211112.0,...,TEXAS,2017.0,79719.0,Mivida Treater Plant,,,N,,ENERGY TRANSFER PARTNERS LP (100%),N
69671,600 A ST,,,DIBOLL,ANGELINA COUNTY,48005.0,1002387.0,31.191713,-94.788946,321219.0,...,TEXAS,2014.0,75941.0,DIBOLL COMPLEX,321113.0,,N,,KOCH INDUSTRIES INC (100%),N
70106,10300 South Kent Drive SW,,,BYRON CENTER,KENT COUNTY,26081.0,1003464.0,42.775556,-85.678889,562212.0,...,MICHIGAN,2015.0,49315.0,KENT COUNTY DPW SOUTH KENT LANDFILL,,,N,,KENT COUNTY MICHIGAN DEPARTMENT OF PUBLIC WORK...,N
54384,54741 TESORO ROAD,,,KENAI,KENAI PENINSULA BOROUGH,2122.0,1007741.0,60.683603,-151.367204,324110.0,...,ALASKA,2013.0,99611.0,TESORO ALASKA PETROLEUM CO,,,Y,Y,TESORO CORP (100%),N


### gh gas type data

In [24]:
print(gh_gas_type_df.columns)
display(gh_gas_type_df.sample(10))

Index(['V_GHG_EMITTER_GAS.ADDRESS1', 'V_GHG_EMITTER_GAS.ADDRESS2',
       'V_GHG_EMITTER_GAS.CITY', 'V_GHG_EMITTER_GAS.CO2E_EMISSION',
       'V_GHG_EMITTER_GAS.COUNTY', 'V_GHG_EMITTER_GAS.FACILITY_ID',
       'V_GHG_EMITTER_GAS.GAS_CODE', 'V_GHG_EMITTER_GAS.GAS_NAME',
       'V_GHG_EMITTER_GAS.LATITUDE', 'V_GHG_EMITTER_GAS.LONGITUDE',
       'V_GHG_EMITTER_GAS.STATE', 'V_GHG_EMITTER_GAS.STATE_NAME',
       'V_GHG_EMITTER_GAS.YEAR', 'V_GHG_EMITTER_GAS.ZIP',
       'V_GHG_EMITTER_GAS.FACILITY_NAME', 'V_GHG_EMITTER_GAS.COUNTY_FIPS'],
      dtype='object')


Unnamed: 0,V_GHG_EMITTER_GAS.ADDRESS1,V_GHG_EMITTER_GAS.ADDRESS2,V_GHG_EMITTER_GAS.CITY,V_GHG_EMITTER_GAS.CO2E_EMISSION,V_GHG_EMITTER_GAS.COUNTY,V_GHG_EMITTER_GAS.FACILITY_ID,V_GHG_EMITTER_GAS.GAS_CODE,V_GHG_EMITTER_GAS.GAS_NAME,V_GHG_EMITTER_GAS.LATITUDE,V_GHG_EMITTER_GAS.LONGITUDE,V_GHG_EMITTER_GAS.STATE,V_GHG_EMITTER_GAS.STATE_NAME,V_GHG_EMITTER_GAS.YEAR,V_GHG_EMITTER_GAS.ZIP,V_GHG_EMITTER_GAS.FACILITY_NAME,V_GHG_EMITTER_GAS.COUNTY_FIPS
89987,10340 68TH STREET NORTHWEST,,TIOGA,3240.5,WILLIAMS COUNTY,1001894,CH4,Methane,48.40152,-102.91418,ND,NORTH DAKOTA,2016,58852,TIOGA GAS PROCESSING PLANT,38105.0
113873,3149 LOUISIANA HWY 10,,WASHINGTON,49919.6,ST. LANDRY PARISH,1007048,CO2,Carbon Dioxide,30.671703,-92.125385,LA,LOUISIANA,2019,70589,TRANSCO STATION 54,22097.0
17935,1795 BURT ST,,BEAUMONT,1845.0,Jefferson,1007959,CH4,Methane,30.0639,-94.0703,TX,TEXAS,2011,77701,Exxonmobil Beaumont Refinery,48245.0
206172,35863 FAIRVIEW RD,,HINKLEY,58875.3,SAN BERNARDINO COUNTY,1004272,CO2,Carbon Dioxide,34.902694,-117.160594,CA,CALIFORNIA,2015,92347,PG&E HINKLEY COMPRESSOR STATION,6071.0
94487,300 INTERNATIONAL BLVD.,,CLARKSVILLE,25.33,MONTGOMERY COUNTY,1005263,N2O,Nitrous Oxide,36.60373,-87.25869,TN,TENNESSEE,2018,37040,FLORIM USA INC,47125.0
48930,,,ROCK SPRINGS,27034.5,SWEETWATER COUNTY,1005989,CH4,Methane,41.5222,-109.3128,WY,WYOMING,2016,82901,Dominion Energy Questar - Rock Springs Station...,56037.0
22376,,,RIDLEY PARK,22933.1,DELAWARE COUNTY,1000638,CO2,Carbon Dioxide,39.862773,-75.321425,PA,PENNSYLVANIA,2015,19078,BOEING HELICOPTER DIV,42045.0
151785,1250 West Maricopa Highway,,Casa Grande,14.0,PINAL COUNTY,1000378,CH4,Methane,32.894,-111.78312,AZ,ARIZONA,2018,85193,Abbott Laboratories,4021.0
203061,1 WAREHOUSE ROAD,,COLSTRIP,37283.0,Rosebud,1001020,CH4,Methane,45.8831,-106.614,MT,MONTANA,2013,59323,Colstrip,30087.0
83598,,,Offshore,12.218,,1011597,N2O,Nitrous Oxide,27.835135,-96.013057,TX,TEXAS,2019,0,BA A 133 B C-AUX E (Complex ID # 10249),


### weather 2020 data 

In [74]:
print(weather_2020_df.columns)
display(weather_2020_df.sample(10))

Index(['Unnamed: 0', 'Station', 'Date', 'TAVG', 'Latitude', 'Longitude',
       'Elevation', 'PRCP'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,Station,Date,TAVG,Latitude,Longitude,Elevation,PRCP
92938,92938,MOM00060230,2020-01-26,113.0,31.607,-8.036,467.9,0.0
7509,7509,CA003011815,2020-01-03,-96.0,51.9333,-110.7167,772.0,0.0
975267,975267,USS0021C35S,2020-09-26,42.0,46.78,-121.75,1563.6,25.0
324614,324614,RSM00031026,2020-03-29,-42.0,58.73,130.62,194.0,13.0
52388,52388,JA000047897,2020-01-15,67.0,32.917,132.7,11.0,0.0
1031051,1031051,USS0007M05S,2020-10-12,78.0,37.65,-107.81,2706.6,0.0
522549,522549,USS0010D39S,2020-05-23,16.0,45.5,-110.08,1930.9,381.0
1051534,1051534,RSM00030028,2020-10-18,21.0,59.28,106.17,350.0,30.0
195036,195036,ASN00021133,2020-02-23,251.0,-33.7676,138.2182,109.1,0.0
944665,944665,CA008103050,2020-09-18,96.0,48.0167,-64.5,4.0,0.0


### wind 2020 data

In [75]:
print(wind_2020_df.columns)
display(wind_2020_df.sample(10))

Index(['State Code', 'County Code', 'Site Num', 'Parameter Code', 'POC',
       'Latitude', 'Longitude', 'Datum', 'Parameter Name', 'Sample Duration',
       'Pollutant Standard', 'Date Local', 'Units of Measure', 'Event Type',
       'Observation Count', 'Observation Percent', 'Arithmetic Mean',
       '1st Max Value', '1st Max Hour', 'AQI', 'Method Code', 'Method Name',
       'Local Site Name', 'Address', 'State Name', 'County Name', 'City Name',
       'CBSA Name', 'Date of Last Change'],
      dtype='object')


Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration,...,AQI,Method Code,Method Name,Local Site Name,Address,State Name,County Name,City Name,CBSA Name,Date of Last Change
444822,48,493,1038,61104,1,29.1307,-98.1481,NAD83,Wind Direction - Resultant,1 HOUR,...,,20,INSTRUMENTAL - VECTOR SUMMATION,Floresville Hospital Boulevard,1404 Hospital Blvd,Texas,Wilson,Floresville,"San Antonio-New Braunfels, TX",2021-03-29
104288,6,73,1026,61103,1,32.710177,-117.142665,NAD83,Wind Speed - Resultant,1 HOUR,...,,20,INSTRUMENTAL - VECTOR SUMMATION,San Diego - Sherman Elementary School,450B 24th Street,California,San Diego,San Diego,"San Diego-Carlsbad, CA",2021-04-18
48349,6,27,22,61104,1,36.326172,-117.95512,WGS84,Wind Direction - Resultant,1 HOUR,...,,20,INSTRUMENTAL - VECTOR SUMMATION,Dirty Socks,"DIRTY SOCKS, HWY 190",California,Inyo,Not in a city,"Bishop, CA",2021-03-24
329307,40,37,144,61103,2,36.105481,-96.361196,WGS84,Wind Speed - Resultant,1 HOUR,...,,65,Instrumental - RM Young Model 05305,MANNFORD,MANNFORD WATER PLANT,Oklahoma,Creek,Mannford,"Tulsa, OK",2021-02-10
85129,6,65,1004,61103,1,33.9397,-115.4108,WGS84,Wind Speed - Resultant,1 HOUR,...,,20,INSTRUMENTAL - VECTOR SUMMATION,Joshua Tree NP - Pinto Wells,Joshua Tree National Park - Pinto Wells,California,Riverside,Joshua Tree National Monument,"Riverside-San Bernardino-Ontario, CA",2021-02-05
386593,48,141,37,61104,1,31.768286,-106.501243,NAD83,Wind Direction - Resultant,1 HOUR,...,,20,INSTRUMENTAL - VECTOR SUMMATION,El Paso UTEP,250 Rim Rd,Texas,El Paso,El Paso,"El Paso, TX",2021-03-19
424611,48,349,1081,61103,1,31.9041,-96.352,NAD83,Wind Speed - Resultant,1 HOUR,...,,20,INSTRUMENTAL - VECTOR SUMMATION,Richland Southeast 1220 Road,Southeast 1220 Road,Texas,Navarro,Richland,"Corsicana, TX",2021-03-19
243353,26,5,3,61103,1,42.767786,-86.148577,WGS84,Wind Speed - Resultant,1 HOUR,...,,20,INSTRUMENTAL - VECTOR SUMMATION,Holland,966 W. 32ND (HOLLAND),Michigan,Allegan,Holland,"Holland, MI",2021-03-01
384361,48,139,16,61104,1,32.482083,-97.026899,WGS84,Wind Direction - Resultant,1 HOUR,...,,20,INSTRUMENTAL - VECTOR SUMMATION,Midlothian OFW,2725 Old Fort Worth Road,Texas,Ellis,Midlothian,"Dallas-Fort Worth-Arlington, TX",2021-03-19
145450,8,69,7,61103,1,40.27813,-105.54564,WGS84,Wind Speed - Resultant,1 HOUR,...,,20,INSTRUMENTAL - VECTOR SUMMATION,Rocky Mountain NP - Long's Peak,ROCKY MOUNTAIN NP,Colorado,Larimer,Not in a city,"Fort Collins, CO",2021-02-16


### temp 2020 data

In [76]:
print(temp_2020_df.columns)
display(temp_2020_df.sample(10))

Index(['State Code', 'County Code', 'Site Num', 'Parameter Code', 'POC',
       'Latitude', 'Longitude', 'Datum', 'Parameter Name', 'Sample Duration',
       'Pollutant Standard', 'Date Local', 'Units of Measure', 'Event Type',
       'Observation Count', 'Observation Percent', 'Arithmetic Mean',
       '1st Max Value', '1st Max Hour', 'AQI', 'Method Code', 'Method Name',
       'Local Site Name', 'Address', 'State Name', 'County Name', 'City Name',
       'CBSA Name', 'Date of Last Change'],
      dtype='object')


Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration,...,AQI,Method Code,Method Name,Local Site Name,Address,State Name,County Name,City Name,CBSA Name,Date of Last Change
282136,56,37,300,62101,1,41.750556,-109.788333,WGS84,Outdoor Temperature,1 HOUR,...,,41,INSTRUMENTAL - ELEC. OR MACH. AVG. LEVEL 1,Moxa Arch,Moxa,Wyoming,Sweetwater,Not in a city,"Rock Springs, WY",2021-03-01
180712,40,143,9029,62101,1,35.901324,-96.012749,NAD83,Outdoor Temperature,1 HOUR,...,,40,INSTRUMENTAL - ELECTRONIC OR MACHINE AVG.,Onestop Fuel,"1800 W 181st St S, Mounds, OK 74047",Oklahoma,Tulsa,Glenpool,"Tulsa, OK",2021-01-08
98619,19,139,19,62101,1,41.401459,-91.068449,WGS84,Outdoor Temperature,1 HOUR,...,,40,INSTRUMENTAL - ELECTRONIC OR MACHINE AVG.,Muscatine HS - East Campus School Trailer,"1409 Wisconsin St, Muscatine IA",Iowa,Muscatine,Muscatine,"Muscatine, IA",2021-01-12
77280,9,9,9002,62101,1,41.256788,-72.55327,NAD83,Outdoor Temperature,1 HOUR,...,,41,INSTRUMENTAL - ELEC. OR MACH. AVG. LEVEL 1,Hammonasset State Park,"Meigs Point, Hammonasset State Park",Connecticut,New Haven,Madison (Town of),"New Haven-Milford, CT",2021-03-30
233710,48,479,17,62101,1,27.501826,-99.502984,WGS84,Outdoor Temperature,1 HOUR,...,,40,INSTRUMENTAL - ELECTRONIC OR MACHINE AVG.,Laredo Bridge,700 Zaragosa St,Texas,Webb,Laredo,"Laredo, TX",2021-03-29
239509,49,37,101,62101,1,38.45832,-109.82126,WGS84,Outdoor Temperature,1 HOUR,...,,41,INSTRUMENTAL - ELEC. OR MACH. AVG. LEVEL 1,Canyonlands NP - Island in the Sky,"CANYONLANDS NATIONAL PARK, UTAH",Utah,San Juan,Not in a city,,2021-02-16
275172,56,25,100,62101,2,42.82231,-106.36501,WGS84,Outdoor Temperature,1 HOUR,...,,41,INSTRUMENTAL - ELEC. OR MACH. AVG. LEVEL 1,Casper Gaseous,"2800 Pheasant Drive, Casper",Wyoming,Natrona,Casper,"Casper, WY",2021-02-25
59132,6,103,4,62101,2,40.262072,-122.092766,WGS84,Outdoor Temperature,1 HOUR,...,,59,Instrumental - Vaisala HMP 155,Tuscan Butte (seasonal),OLD FIRE LOOKOUT ATOP TUSCAN BUTTE,California,Tehama,Not in a city,"Red Bluff, CA",2021-03-08
263797,56,5,1115,62101,1,43.701242,-105.292881,WGS84,Outdoor Temperature,1 HOUR,...,,40,INSTRUMENTAL - ELECTRONIC OR MACHINE AVG.,Black Thunder Admin Met Station,Black Thunder Met Station,Wyoming,Campbell,Not in a city,"Gillette, WY",2021-03-04
233320,48,479,16,62101,1,27.517456,-99.515222,NAD83,Outdoor Temperature,1 HOUR,...,,40,INSTRUMENTAL - ELECTRONIC OR MACHINE AVG.,Laredo Vidaurri,2020 Vidaurri Ave,Texas,Webb,Laredo,"Laredo, TX",2021-03-29


### traffic volumes data

In [29]:
print(traffic_volumes_df.columns)
display(traffic_volumes_df.sample(10))

Index(['OBJECTID_1', 'OBJECTID', 'District', 'Route', 'Rte_SFX', 'County',
       'PM_PFX', 'Postmile', 'PM_SFX', 'Descriptn', 'Back_pk_h', 'Back_pk_m',
       'Back_AADT', 'Ahead_pk_h', 'Ahead_pk_m', 'Ahead_AADT', 'Lon_S_or_W',
       'Lat_S_or_W', 'Lon_N_or_E', 'Lat_N_or_E'],
      dtype='object')


Unnamed: 0,OBJECTID_1,OBJECTID,District,Route,Rte_SFX,County,PM_PFX,Postmile,PM_SFX,Descriptn,Back_pk_h,Back_pk_m,Back_AADT,Ahead_pk_h,Ahead_pk_m,Ahead_AADT,Lon_S_or_W,Lat_S_or_W,Lon_N_or_E,Lat_N_or_E
2627,2628,2628,4,680,,CC,R,11.28,,LIVORNA ROAD,12900.0,171000.0,165000.0,12700,166000,161000,-122.0352282,37.86303388,-122.0350278,37.863146
6828,6829,6829,11,905,,SD,,5.164,,JCT. RTE. 805,6500.0,69000.0,67000.0,8700,105000,100000,-117.040511,32.56840146,-117.0406139,32.56806908
5859,5860,5860,10,59,,MER,,19.0,,BELLEVUE RD,480.0,5500.0,3800.0,420,3750,3700,-120.5033718,37.36095488,-120.5033718,37.36095488
1361,1362,1362,3,99,,SUT,,26.12,,BARRY ROAD,1900.0,22300.0,21200.0,2100,23600,22600,-121.6349389,39.0763065,-121.6348172,39.07628175
5950,5951,5951,10,99,,SJ,,0.0,,STANISLAUS/SAN JOAQUIN COUNTY LINE,,,,9200,119000,114200,-121.1100833,37.73032136,-121.1098951,37.73038018
3634,3635,3635,6,145,,FRE,,13.212,,JCT. RTE. 269,600.0,5500.0,4800.0,680,6700,5300,-120.103086,36.429652,-120.103086,36.429652
867,868,868,3,20,,COL,,30.639,,"COLUSA, FREMONT STREET",680.0,6800.0,6700.0,680,7000,6900,-122.0171672,39.20931187,-122.0171672,39.20931187
1149,1150,1150,3,65,,PLA,R,17.446,,NICOLAUS ROAD,2100.0,25000.0,23600.0,2100,25000,23600,-121.3686296,38.89724937,-121.3682534,38.89724227
5955,5956,5956,10,99,,SJ,,6.654,,"MANTECA, NORTH JCT. RTE. 120",7600.0,91000.0,83000.0,8400,93000,92500,-121.1913311,37.7973453,-121.1911279,37.79741574
1079,1080,1080,3,50,,SAC,,17.008,,FOLSOM BOULEVARD/NATOMA,10300.0,134000.0,125400.0,8800,100000,95000,-121.1972109,38.63996841,-121.1973718,38.63978433


### aqi data 2020

In [31]:
print(aqi_2020.columns)
display(aqi_2020.sample(10))

Index(['State Name', 'county Name', 'State Code', 'County Code', 'Date', 'AQI',
       'Category', 'Defining Parameter', 'Defining Site',
       'Number of Sites Reporting'],
      dtype='object')


Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
317414,Washington,Yakima,53,77,2020-12-05,80,Moderate,PM2.5,53-077-0005,4
310087,Washington,Garfield,53,23,2020-03-12,23,Good,PM2.5,53-023-0001,1
308109,Washington,Chelan,53,7,2020-09-15,295,Very Unhealthy,PM2.5,53-007-0007,4
224391,Ohio,Wood,39,173,2020-06-01,41,Good,Ozone,39-173-0003,1
59583,Georgia,Bibb,13,21,2020-06-24,29,Good,Ozone,13-021-0012,1
319160,West Virginia,Hancock,54,29,2020-07-13,36,Good,Ozone,54-029-0009,3
320984,West Virginia,Monongalia,54,61,2020-09-04,37,Good,Ozone,54-061-0003,1
205786,North Carolina,Rowan,37,159,2020-02-03,42,Good,Ozone,37-159-0021,1
191447,New York,Putnam,36,79,2020-07-22,61,Moderate,Ozone,36-079-0005,1
262486,South Dakota,Brown,46,13,2020-02-13,43,Good,PM2.5,46-013-0004,1


## Data wrangling

### decisions log
- we are going to only use data for the state of california because we are interested in using vehicle emissions data which we have for the state of california in addition to the factory emissions data 

In [77]:
california_aqi_df = aqi_2020[aqi_2020["State Name"] == "California"]

print(california_aqi_df.shape)
display(california_aqi_df)


(19225, 10)


Unnamed: 0,State Name,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting
14003,California,Alameda,6,1,2020-01-01,53,Moderate,PM2.5,06-001-0009,7
14004,California,Alameda,6,1,2020-01-02,43,Good,PM2.5,06-001-0013,7
14005,California,Alameda,6,1,2020-01-03,74,Moderate,PM2.5,06-001-0013,7
14006,California,Alameda,6,1,2020-01-04,45,Good,PM2.5,06-001-0007,7
14007,California,Alameda,6,1,2020-01-05,33,Good,PM2.5,06-001-0007,7
...,...,...,...,...,...,...,...,...,...,...
33223,California,Yolo,6,113,2020-12-27,20,Good,Ozone,06-113-0004,2
33224,California,Yolo,6,113,2020-12-28,33,Good,Ozone,06-113-0004,2
33225,California,Yolo,6,113,2020-12-29,28,Good,Ozone,06-113-0004,3
33226,California,Yolo,6,113,2020-12-30,39,Good,PM2.5,06-113-0004,2
