In [338]:
import pandas as pd
import numpy as np
import os
import datetime
from scipy.spatial.distance import cdist
import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import BallTree
from geopy import distance

In [339]:
def closest_point(point, points):
    """Return the entry of ``points`` that lies nearest to ``point``.

    Distances are computed with ``scipy.spatial.distance.cdist`` using its
    default metric. When several entries tie, the first one wins (``argmin``
    returns the first minimum).
    """
    # cdist wants two 2-D collections, hence the single-element list around point.
    distances_to_candidates = cdist([point], points)
    nearest_index = distances_to_candidates.argmin()
    return points[nearest_index]

In [340]:
def match_value(df, col1, x, col2):
    """Look up ``col2`` on the first row of ``df`` whose ``col1`` equals ``x``.

    Raises ``IndexError`` when no row matches, because the first element of
    an empty selection is requested.
    """
    matching_rows = df.loc[df[col1] == x]
    return matching_rows[col2].values[0]

In [341]:
def air_qual_county(fips, date, gas_data, colname, County_Centers, neighbors=5, max_dist_km=100):
    """Estimate one county's reading for one date by inverse-distance weighting.

    The stations reporting on ``date`` are searched for the ``neighbors``
    closest to the county center (``pclat10`` / ``pclon10`` of the row of
    ``County_Centers`` whose ``fips`` matches). Stations closer than
    ``max_dist_km`` are averaged with weights ``1 / distance``; when none is
    that close, the farther neighbours are averaged the same way instead.

    Parameters
    ----------
    fips : county identifier looked up in ``County_Centers['fips']``.
    date : value compared for equality with ``gas_data['ValidDate']``.
    gas_data : DataFrame with ``Latitude``, ``Longitude``, ``ValidDate`` and
        ``colname`` columns.
    colname : name of the column holding the value to interpolate.
    County_Centers : DataFrame with ``fips``, ``pclat10`` and ``pclon10``.
    neighbors : maximum number of nearest stations to consider.
    max_dist_km : distance below which a station counts as "nearby".

    Returns
    -------
    The weighted average; the station's own value when a station sits
    exactly on the county center; or ``0`` when no station with valid
    coordinates reported on ``date``.

    Raises
    ------
    KeyError
        If ``fips`` has no row in ``County_Centers``.
    """
    # Progress trace: prints the FIPS code whenever the 01/01/20 row is processed.
    if date == '01/01/20':
        print(fips)
    #Gets the coordinates of this fips code county
    Center = np.array(County_Centers[County_Centers['fips'] == fips][['pclat10','pclon10']].reset_index(drop=True).loc[0]).reshape(1,-1)
    center_point = (float(Center[0, 0]), float(Center[0, 1]))

    #Get only relevant date data
    same_day = gas_data[gas_data['ValidDate'] == date]
    lats = same_day['Latitude'].astype(float).to_numpy()
    lons = same_day['Longitude'].astype(float).to_numpy()
    vals = same_day[colname].to_numpy()

    # Discard impossible coordinates (NaN compares False) BEFORE the neighbour
    # search, so they can neither break the tree nor use up neighbour slots.
    usable = (np.abs(lats) <= 90) & (np.abs(lons) <= 180)
    lats, lons, vals = lats[usable], lons[usable], vals[usable]
    if len(lats) == 0:
        return 0

    # NOTE(review): the tree uses BallTree's default metric on raw degrees, so
    # neighbour *selection* is approximate; the weights below use geodesic km.
    tree = BallTree(np.column_stack((lats, lons)), leaf_size=2)
    # BallTree.query rejects k larger than the number of indexed points.
    k = min(neighbors, len(lats))
    _, ids = tree.query(Center, k=k)

    near_weight = 0
    near_total = 0
    far_weight = 0
    far_total = 0
    for i in ids[0]:
        dist_km = distance.distance(center_point, (lats[i], lons[i])).km
        if dist_km == 0:
            # A station exactly at the center: its reading is the limit of
            # the inverse-distance average, and 1/0 must be avoided.
            return vals[i]
        weight = 1 / dist_km
        #If possible we only want nearby influences, so we cap this at max_dist_km
        if dist_km < max_dist_km:
            near_weight += weight
            near_total += weight * vals[i]
        #If no close-by measurements are available, then further away data is used
        else:
            far_weight += weight
            far_total += weight * vals[i]

    if near_weight != 0:
        return near_total / near_weight
    if far_weight != 0:
        return far_total / far_weight
    return 0

In [342]:
#Months 1-3 ("Fixed Months" in the original note): read the three archives in order.
#The resulting frames are consumed by the US-filtering cell further down.
Air_Qual_1, Air_Qual_2, Air_Qual_3 = (
    pd.read_csv(archive, compression='gzip')
    for archive in (
        '../../../../data/us/air_quality/1.tgz',
        '../../../../data/us/air_quality/2.tgz',
        '../../../../data/us/air_quality/3.tgz',
    )
)

In [343]:
#Air data that changes weekly: this archive is loaded in its own cell.
# NOTE(review): a .tgz is normally a gzip-compressed tar archive, while
# compression='gzip' only strips the gzip layer - confirm the parsed columns.
Air_Qual_4 = pd.read_csv('../../../../data/us/air_quality/4.tgz', compression='gzip')                 

In [344]:
def _prepare_us_air_quality(raw):
    """Return the US rows of one raw monthly frame, without the unused columns.

    Keeps the rows whose ``CountryCode`` is ``'US'``, drops the descriptive
    columns listed below, and casts ``Latitude`` / ``Longitude`` to float64.
    The input frame is not modified.
    """
    #Getting only US Air Data
    us_rows = raw[raw['CountryCode'] == 'US']
    #Dropping unneeded columns
    # NOTE(review): 'out/' looks like tar-header residue from reading the .tgz
    # archives with compression='gzip' - confirm.
    trimmed = us_rows.drop(columns=['out/','SiteName','GMTOffset','CountryCode','StateName', 'Elevation',
                                    'DataSource','ReportingArea_PipeDelimited','Status','EPARegion'])
    return trimmed.astype({'Latitude': 'float64','Longitude': 'float64'})


# The monthly names are rebound to their cleaned versions, as before.
Air_Qual_1 = _prepare_us_air_quality(Air_Qual_1)
Air_Qual_2 = _prepare_us_air_quality(Air_Qual_2)
Air_Qual_3 = _prepare_us_air_quality(Air_Qual_3)
Air_Qual_4 = _prepare_us_air_quality(Air_Qual_4)

#Joining data together
# DataFrame.append was removed in pandas 2.0; one concat gives the same
# stacked frame with a fresh 0..n-1 index and avoids repeated copying.
Air_Qual = pd.concat([Air_Qual_1, Air_Qual_2, Air_Qual_3, Air_Qual_4], ignore_index=True)



In [345]:
#Remove the time stamp, the *_Measured flags and the raw concentration/unit
#columns for ozone, PM2.5, PM10 and NO2, then save the trimmed table.
Air_Qual = Air_Qual.drop(
    [
        'ValidTime',
        'OZONE_Measured', 'PM10_Measured', 'PM25_Measured', 'NO2_Measured',
        'PM25', 'PM25_Unit',
        'OZONE', 'OZONE_Unit',
        'NO2', 'NO2_Unit',
        'PM10', 'PM10_Unit',
    ],
    axis='columns',
)
Air_Qual.to_csv(path_or_buf='Air_Qual.csv')

In [346]:
#County center coordinates, used to map air quality readings to FIPS codes.
#The six alternative coordinate columns are discarded and any row that still
#contains a NaN is removed before the table is saved.
County_Centers = (
    pd.read_csv('../../../../data/us/geolocation/county_centers.csv')
    .drop(columns=['clon00','clat00','pclon00','pclat00','clon10','clat10'])
    .dropna()
)
County_Centers.to_csv('County_Centers.csv')

In [347]:
#This is a list of all the counties and dates
County_List = list(County_Centers.fips.unique())
Date_List = list(Air_Qual.ValidDate.unique())
#This creates a base dataframe that contains all pairs of FIPS codes with the valid dates given in Air_Qual.
#The cross product is built with public numpy calls rather than the private
#pd.core.reshape.util.cartesian_product helper: every county is repeated once
#per date, and the whole date list is tiled once per county.
CL = np.repeat(County_List, len(Date_List))
DL = np.tile(Date_List, len(County_List))
BaseFrame = pd.DataFrame(dict(FIPS=CL, Date=DL)).sort_values(['FIPS','Date']).reset_index(drop=True)

In [None]:
#One station-level table per pollutant; ozone comes first.
#Rows are kept when OZONE_AQI, read as a float, is >= -1 (NaN compares False,
#so rows without a reading are dropped as well).
Ozone = (
    Air_Qual.loc[Air_Qual['OZONE_AQI'].astype(float) >= -1]
    .drop(columns=['PM10_AQI','PM25_AQI','NO2_AQI','CO','CO_Unit','SO2','SO2_Unit'])
    #Store the reading as a float
    .astype({'OZONE_AQI': 'float64'})
    .sort_values(['Latitude','Longitude','ValidDate'])
    #Readings sharing identical coordinates and date are averaged
    .groupby(['Latitude','Longitude','ValidDate'])
    .mean()
    .reset_index()
)
#County-level table: one row per (FIPS, Date) pair from BaseFrame
County_Ozone = BaseFrame.copy()
#Fill in the interpolated value for every pair (default number of neighbors)
County_Ozone['OZONE_AQI'] = County_Ozone.apply(
    lambda row: air_qual_county(row['FIPS'], row['Date'], Ozone, 'OZONE_AQI', County_Centers),
    axis=1,
)

1001
1003
1005
1007
1009
1011
1013
1015
1017
1019
1021
1023
1025
1027
1029
1031
1033
1035
1037
1039
1041
1043
1045
1047
1049
1051
1053
1055
1057
1059
1061
1063
1065
1067
1069
1071
1073
1075
1077
1079
1081
1083
1085
1087
1089
1091
1093
1095
1097
1099
1101
1103
1105
1107
1109
1111
1113
1115
1117
1119
1121
1123
1125
1127
1129
1131
1133
2013
2016
2020
2050
2060
2068
2070
2090
2100
2105
2110
2122
2130
2150
2164
2170
2180
2185
2188
2195
2198
2220
2230
2240
2261
2270
2275
2282
2290
4001
4003
4005
4007
4009
4011
4012
4013
4015
4017
4019
4021
4023
4025
4027
5001
5003
5005
5007
5009
5011
5013
5015
5017
5019
5021
5023
5025
5027
5029
5031
5033
5035
5037
5039
5041
5043
5045
5047
5049
5051
5053
5055
5057
5059
5061
5063
5065
5067
5069
5071
5073
5075
5077
5079
5081
5083
5085
5087
5089
5091
5093
5095
5097
5099
5101
5103
5105
5107
5109
5111
5113
5115
5117
5119
5121
5123
5125
5127
5129
5131
5133
5135
5137
5139
5141
5143
5145
5147
5149
6001
6003
6005
6007
6009
6011
6013
6015
6017
6019
6021
6023
6025
6027


In [None]:
# Write the county-level ozone table to disk, then show it as the cell output.
County_Ozone.to_csv('County_Ozone_Filled.csv')
County_Ozone

In [None]:
#PM10: keep the rows whose PM10_AQI, read as a float, is >= -1
PM10 = (
    Air_Qual.loc[Air_Qual['PM10_AQI'].astype(float) >= -1]
    .drop(columns=['OZONE_AQI','PM25_AQI','NO2_AQI','CO','CO_Unit','SO2','SO2_Unit'])
    .astype({'PM10_AQI': 'float64'})
    .sort_values(['Latitude','Longitude','ValidDate'])
    #Readings sharing identical coordinates and date are averaged
    .groupby(['Latitude','Longitude','ValidDate'])
    .mean()
    .reset_index()
)
#County-level PM10 table: one row per (FIPS, Date) pair from BaseFrame
County_PM10 = BaseFrame.copy()
#Fill in the interpolated value for every pair, using 10 neighbors
County_PM10['PM10_AQI'] = County_PM10.apply(
    lambda row: air_qual_county(row['FIPS'], row['Date'], PM10, 'PM10_AQI', County_Centers, neighbors=10),
    axis=1,
)


In [None]:
# Write the county-level PM10 table to disk, then show it as the cell output.
County_PM10.to_csv('County_PM10_Filled.csv')
County_PM10

In [None]:
#PM2.5: keep the rows whose PM25_AQI, read as a float, is >= -1
PM25 = (
    Air_Qual.loc[Air_Qual['PM25_AQI'].astype(float) >= -1]
    .drop(columns=['OZONE_AQI','PM10_AQI','NO2_AQI','CO','CO_Unit','SO2','SO2_Unit'])
    .astype({'PM25_AQI': 'float64'})
    .sort_values(['Latitude','Longitude','ValidDate'])
    #Readings sharing identical coordinates and date are averaged
    .groupby(['Latitude','Longitude','ValidDate'])
    .mean()
    .reset_index()
)
#County-level PM2.5 table: one row per (FIPS, Date) pair from BaseFrame
County_PM25 = BaseFrame.copy()
County_PM25['PM25_AQI'] = County_PM25.apply(
    lambda row: air_qual_county(row['FIPS'], row['Date'], PM25, 'PM25_AQI', County_Centers, neighbors=10),
    axis=1,
)


In [None]:
# Write the county-level PM2.5 table to disk, then show it as the cell output.
County_PM25.to_csv('County_PM25_Filled.csv')
County_PM25

In [None]:
#NO2: keep the rows whose NO2_AQI, read as a float, is >= -1
NO2 = (
    Air_Qual.loc[Air_Qual['NO2_AQI'].astype(float) >= -1]
    .drop(columns=['OZONE_AQI','PM10_AQI','PM25_AQI','CO','CO_Unit','SO2','SO2_Unit'])
    .astype({'NO2_AQI': 'float64'})
    .sort_values(['Latitude','Longitude','ValidDate'])
    #Readings sharing identical coordinates and date are averaged
    .groupby(['Latitude','Longitude','ValidDate'])
    .mean()
    .reset_index()
)
#County-level NO2 table: one row per (FIPS, Date) pair from BaseFrame
County_NO2 = BaseFrame.copy()
County_NO2['NO2_AQI'] = County_NO2.apply(
    lambda row: air_qual_county(row['FIPS'], row['Date'], NO2, 'NO2_AQI', County_Centers, neighbors=10),
    axis=1,
)
# NOTE(review): NO2 is the only pollutant also written without the "_Filled"
# suffix - confirm whether this second file is still wanted.
County_NO2.to_csv('County_NO2.csv')

In [None]:
# Write the county-level NO2 table to disk, then show it as the cell output.
County_NO2.to_csv('County_NO2_Filled.csv')
County_NO2

In [None]:
#CO: keep only the rows whose unit is one of the two recognised ones
CO = (
    Air_Qual.loc[Air_Qual['CO_Unit'].isin(['PPM','PPB'])]
    .drop(columns=['PM10_AQI','PM25_AQI','OZONE_AQI', 'NO2_AQI','SO2','SO2_Unit'])
    .astype({'CO': 'float64'})
)
#Scaling every measurement to be in terms of PPB, so multiply PPM*1000
CO.loc[CO['CO_Unit'] == 'PPM', 'CO'] *= 1000
#Drop unit column, everything is PPB; then average the readings that share
#identical coordinates and date
CO = CO.drop(columns=['CO_Unit']).groupby(['Latitude','Longitude','ValidDate']).mean().reset_index()
#County-level CO table: one row per (FIPS, Date) pair from BaseFrame
County_CO = BaseFrame.copy()
# Iterate over County_CO itself. The previous version iterated County_PM25,
# which only worked if the PM2.5 cell had been run first.
County_CO['CO'] = County_CO.apply(
    lambda row: air_qual_county(row['FIPS'], row['Date'], CO, 'CO', County_Centers, neighbors=10),
    axis=1,
)


In [None]:
# Write the county-level CO table to disk, then show it as the cell output.
County_CO.to_csv('County_CO_Filled.csv')
County_CO

In [None]:
#SO2: keep only the rows whose unit is one of the two recognised ones
SO2 = (
    Air_Qual.loc[Air_Qual['SO2_Unit'].isin(['PPM','PPB'])]
    .drop(columns=['PM10_AQI','PM25_AQI','OZONE_AQI', 'NO2_AQI','CO','CO_Unit'])
    .astype({'SO2': 'float64'})
)
#Scaling every measurement to be in terms of PPB, so multiply PPM*1000
SO2.loc[SO2['SO2_Unit'] == 'PPM', 'SO2'] *= 1000
#Drop unit column, everything is PPB; then average the readings that share
#identical coordinates and date
SO2 = SO2.drop(columns=['SO2_Unit']).groupby(['Latitude','Longitude','ValidDate']).mean().reset_index()
#County-level SO2 table: one row per (FIPS, Date) pair from BaseFrame
County_SO2 = BaseFrame.copy()
# Iterate over County_SO2 itself. The previous version iterated County_PM25,
# which only worked if the PM2.5 cell had been run first.
County_SO2['SO2'] = County_SO2.apply(
    lambda row: air_qual_county(row['FIPS'], row['Date'], SO2, 'SO2', County_Centers, neighbors=10),
    axis=1,
)


In [None]:
# Write the county-level SO2 table to disk, then show it as the cell output.
County_SO2.to_csv('County_SO2_Filled.csv')
County_SO2

In [None]:
# Preview the first 100 rows of the county-level ozone table.
County_Ozone.head(100)

In [None]:
# Preview the first 100 rows of the county-level SO2 table.
County_SO2.head(100)