In [17]:
import pandas as pd
import numpy as np
import os
import datetime
from scipy.spatial.distance import cdist
import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import BallTree


In [18]:
def closest_point(point, points):
    """ Find closest point from a list of points. """
    return points[cdist([point], points).argmin()]

In [19]:
def match_value(df, col1, x, col2):
    """ Match value x from col1 row to value in col2. """
    return df[df[col1] == x][col2].values[0]

In [20]:
#Fixed Months, no need to run this
Air_Qual_1 = pd.read_csv('../../../../data/us/air_quality/1.tgz', compression='gzip')                 
Air_Qual_2 = pd.read_csv('../../../../data/us/air_quality/2.tgz', compression='gzip')                 
Air_Qual_3 = pd.read_csv('../../../../data/us/air_quality/3.tgz', compression='gzip')                 

In [21]:
#Air Data that Changes Weekly
Air_Qual_4 = pd.read_csv('../../../../data/us/air_quality/4.tgz', compression='gzip')                 

In [22]:
#Getting only US Air Data
Air_Qual_1 = Air_Qual_1[Air_Qual_1['CountryCode'] == 'US']
Air_Qual_2 = Air_Qual_2[Air_Qual_2['CountryCode'] == 'US']
Air_Qual_3 = Air_Qual_3[Air_Qual_3['CountryCode'] == 'US']
Air_Qual_4 = Air_Qual_4[Air_Qual_4['CountryCode'] == 'US']
#Dropping unneeded columns
Air_Qual_1 = Air_Qual_1.drop(columns=['out/','SiteName','GMTOffset','CountryCode','StateName', 'Elevation', \
                                      'DataSource','ReportingArea_PipeDelimited','Status','EPARegion'])
Air_Qual_2 = Air_Qual_2.drop(columns=['out/','SiteName','GMTOffset','CountryCode','StateName', 'Elevation', \
                                      'DataSource','ReportingArea_PipeDelimited','Status','EPARegion'])
Air_Qual_3 = Air_Qual_3.drop(columns=['out/','SiteName','GMTOffset','CountryCode','StateName', 'Elevation', \
                                      'DataSource','ReportingArea_PipeDelimited','Status','EPARegion'])
Air_Qual_4 = Air_Qual_4.drop(columns=['out/','SiteName','GMTOffset','CountryCode','StateName', 'Elevation', \
                                      'DataSource','ReportingArea_PipeDelimited','Status','EPARegion'])

Air_Qual_1 = Air_Qual_1.astype({'Latitude': 'float64','Longitude': 'float64'})
Air_Qual_2 = Air_Qual_2.astype({'Latitude': 'float64','Longitude': 'float64'})
Air_Qual_3 = Air_Qual_3.astype({'Latitude': 'float64','Longitude': 'float64'})
Air_Qual_4 = Air_Qual_4.astype({'Latitude': 'float64','Longitude': 'float64'})

#joining data together
Air_Qual = Air_Qual_1.append(Air_Qual_2, ignore_index = True) 
Air_Qual = Air_Qual.append(Air_Qual_3, ignore_index = True) 
Air_Qual = Air_Qual.append(Air_Qual_4, ignore_index = True) 



In [23]:
#Loading in County_Centers Data to map Air Quality to FIPs code
County_Centers = pd.read_csv('../../../../data/us/geolocation/county_centers.csv')
County_Centers = County_Centers.drop(columns=['clon00','clat00','pclon00','pclat00','pclon10','pclat10'])

County_Centers.to_csv('County_Centers.csv')

In [24]:
#Creating GeoPandas DataFrames to do fast distance comparison
C_C = gpd.GeoDataFrame({ #County_center gpd
        'geometry': Point(a, b),
        'x': float(a),
        'y': float(b),
    } for a, b in zip(County_Centers['clat10'], County_Centers['clon10']))
A_Q = gpd.GeoDataFrame({  #Air_Quality gpd
        'geometry': Point(a, b),
        'x': float(a),
        'y': float(b),
    } for a, b in zip(Air_Qual['Latitude'], Air_Qual['Longitude']))

tree = BallTree(C_C[['x', 'y']].values, leaf_size=2) #distance tree

A_Q['distance_nearest'], A_Q['id_nearest'] = tree.query(
    A_Q[['x', 'y']].values, # The input array for the query
    k=1, # The number of nearest neighbors 
)
    
#Defining the fips code based on the 'id_nearest' column, the ID of the closest County_Center to each Air_Quality Report 
Air_Qual['FIPS'] = list(County_Centers.iloc[A_Q['id_nearest']]['fips'])

In [25]:
#Cleaning Air_Qual to split it up
Air_Qual = Air_Qual.drop(columns=['ValidTime','OZONE_Measured','PM10_Measured','PM25_Measured','NO2_Measured','PM25',\
                                  'PM25_Unit', 'OZONE','OZONE_Unit','NO2','NO2_Unit','PM10','PM10_Unit'])
Air_Qual.to_csv('Air_Qual.csv')

In [26]:
#Making individual Dataframes for each type of Gas particle collected
Ozone = Air_Qual.loc[Air_Qual['OZONE_AQI'].astype(float) >= -1]
Ozone = Ozone.drop(columns=['PM10_AQI','PM25_AQI','NO2_AQI','CO','CO_Unit','SO2','SO2_Unit'])
#Setting the datapoint to be a float
Ozone = Ozone.astype({'OZONE_AQI': 'float64'})
#Taking the average value of 'OZONE_AQI' for each FIPS code and Date
Ozone = Ozone.groupby(['FIPS','ValidDate']).mean()
#Isolating only the 'OZONE_AQI' values for each date
Ozone = Ozone.unstack()['OZONE_AQI']
Ozone.to_csv('Ozone_AQI.csv')

PM10 = Air_Qual.loc[Air_Qual['PM10_AQI'].astype(float) >= -1]
PM10 = PM10.drop(columns=['OZONE_AQI','PM25_AQI','NO2_AQI','CO','CO_Unit','SO2','SO2_Unit'])
PM10 = PM10.astype({'PM10_AQI': 'float64'})
#Taking the average value of 'PM10_AQI' for each FIPS code and Date
PM10 = PM10.groupby(['FIPS','ValidDate']).mean()
#Isolating only the 'PM10_AQI' values for each date
PM10 = PM10.unstack()['PM10_AQI']
PM10.to_csv('PM10_AQI.csv')

PM25 = Air_Qual.loc[Air_Qual['PM25_AQI'].astype(float) >= -1]
PM25 = PM25.drop(columns=['OZONE_AQI','PM10_AQI','NO2_AQI','CO','CO_Unit','SO2','SO2_Unit'])
PM25 = PM25.astype({'PM25_AQI': 'float64'})
#Taking the average value of 'PM25_AQI' for each FIPS code and Date
PM25 = PM25.groupby(['FIPS','ValidDate']).mean()
#Isolating only the 'PM25_AQI' values for each date
PM25 = PM25.unstack()['PM25_AQI']
PM25.to_csv('PM25_AQI.csv')

NO2 = Air_Qual.loc[Air_Qual['NO2_AQI'].astype(float) >= -1]
NO2 = NO2.drop(columns=['PM10_AQI','PM25_AQI','OZONE_AQI','CO','CO_Unit','SO2','SO2_Unit'])
NO2 = NO2.astype({'NO2_AQI': 'float64'})
#Taking the average value of 'NO2_AQI' for each FIPS code and Date
NO2 = NO2.groupby(['FIPS','ValidDate']).mean()
#Isolating only the 'NO2_AQI' values for each date
NO2 = NO2.unstack()['NO2_AQI']
NO2.to_csv('NO2_AQI.csv')

CO = Air_Qual.loc[Air_Qual['CO_Unit'].isin(['PPM','PPB'])]
CO = CO.drop(columns=['PM10_AQI','PM25_AQI','OZONE_AQI', 'NO2_AQI','SO2','SO2_Unit'])
CO = CO.astype({'CO': 'float64'})
#Scaling every measurement to be in terms of PPB, so multiply PPM*1000
CO.loc[CO['CO_Unit'] == 'PPM',['CO']] = CO.loc[CO['CO_Unit'] == 'PPM']['CO'].mul(1000).to_numpy()
#Drop unit column, everythin is PPB
CO = CO.drop(columns=['CO_Unit'])
#Taking the average value of 'CO_Unit' for each FIPS code and Date
CO = CO.groupby(['FIPS','ValidDate']).mean()
#Isolating only the 'CO_Unit' values for each date
CO = CO.unstack()['CO']
CO.to_csv('CO_PPB.csv')

SO2 = Air_Qual.loc[Air_Qual['SO2_Unit'].isin(['PPM','PPB'])]
SO2 = SO2.drop(columns=['PM10_AQI','PM25_AQI','OZONE_AQI', 'NO2_AQI', 'CO', 'CO_Unit'])
SO2 = SO2.astype({'SO2': 'float64'})
#Scaling every measurement to be in terms of PPB, so multiply PPM*1000
SO2.loc[SO2['SO2_Unit'] == 'PPM',['SO2']] = SO2.loc[SO2['SO2_Unit'] == 'PPM']['SO2'].mul(1000).to_numpy()
SO2 = SO2.drop(columns=['SO2_Unit'])
#Taking the average value of 'SO2_Unit' for each FIPS code and Date
SO2 = SO2.groupby(['FIPS','ValidDate']).mean()
#Isolating only the 'SO2_Unit' values for each date
SO2 = SO2.unstack()['SO2']
SO2.to_csv('SO2_PPB.csv')

In [27]:
print(len(Ozone))
print(len(PM10))
print(len(PM25))
print(len(CO))
print(len(SO2))

734
179
594
548
541


In [28]:
County_Centers.head()

Unnamed: 0,fips,clon10,clat10
0,1001,-86.64449,32.536382
1,1003,-87.746067,30.659218
2,1005,-85.405456,31.87067
3,1007,-87.127148,33.015893
4,1009,-86.567246,33.977448


In [29]:
list(Air_Qual.columns)

['Latitude',
 'Longitude',
 'ValidDate',
 'OZONE_AQI',
 'PM10_AQI',
 'PM25_AQI',
 'NO2_AQI',
 'CO',
 'CO_Unit',
 'SO2',
 'SO2_Unit',
 'FIPS']

In [30]:
Air_Qual.head()

Unnamed: 0,Latitude,Longitude,ValidDate,OZONE_AQI,PM10_AQI,PM25_AQI,NO2_AQI,CO,CO_Unit,SO2,SO2_Unit,FIPS
0,31.168889,-96.481944,01/01/20,,,,,,,,,48395
1,40.2778,-105.5453,01/01/20,,,,,,,,,8013
2,47.808226,-117.34327,01/01/20,,,10.0,,,,,,53063
3,47.663963,-117.257652,01/01/20,,,10.0,,,,,,53063
4,47.660568,-117.084503,01/01/20,,,,,,,,,53063


In [31]:
Ozone.head()

ValidDate,01/01/20,01/02/20,01/03/20,01/04/20,01/05/20,01/06/20,01/07/20,01/08/20,01/09/20,01/10/20,...,03/25/20,03/26/20,03/27/20,03/28/20,03/29/20,03/30/20,03/31/20,04/10/20,04/11/20,04/12/20
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1003,,,,,,,,,,,...,29.6875,29.375,30.9375,28.1875,29.625,35.1875,29.5625,29.875,34.9375,33.454545
1051,,,,,,,,,,,...,26.0625,20.5,20.375,19.4375,27.0625,20.125,19.375,35.9375,22.75,17.909091
1055,,,,,,,,,,,...,27.8125,19.8125,28.6875,23.25,32.1875,36.5,33.6875,41.8125,25.8125,21.909091
1073,19.625,25.0,16.4375,30.6875,18.75,9.2,,,34.5,32.375,...,26.3,27.8375,38.5,28.3,35.4125,32.8375,31.4625,38.5,30.0,30.890909
1089,,,,,,,,,,,...,25.15625,21.5625,38.5,29.6875,36.15625,30.5,33.65625,38.40625,27.21875,35.590909


In [32]:
SO2.head()

ValidDate,01/01/20,01/02/20,01/03/20,01/04/20,01/05/20,01/06/20,01/07/20,01/08/20,01/09/20,01/10/20,...,03/25/20,03/26/20,03/27/20,03/28/20,03/29/20,03/30/20,03/31/20,04/10/20,04/11/20,04/12/20
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1073,,,,,,,,,,,...,,,,,,,,,,
1097,1.066667,1.0,1.066667,1.0,0.466667,1.0,1.0,0.272727,0.933333,0.923077,...,,,,,,,,1.0,,
1103,,,,,,,,,,,...,,,,,,,,,,
1117,0.428571,0.583333,0.133333,0.0,0.066667,0.285714,0.0,7.153846,1.733333,0.0,...,,,,,,,,,,
2020,,,,,,,,,,,...,,,,,,,,,,100.0
