In [1]:
import os
import pandas as pd
import numpy as np
import zipfile

In [2]:
data_path = f"/workspace/Assignment/Datasets"
raw_data = os.path.join(data_path, f"RawData")
preprocess_path = os.path.join(data_path, f"PreprocessedData")
air_quality_path = os.path.join(data_path, f"RawData/Air Quality Data")

# FIPS

In [3]:
counties = pd.read_csv(os.path.join(preprocess_path, f"Selected_Counties.csv"))

In [4]:
continuum_codes = pd.read_excel(os.path.join(raw_data, f"Ruralurbancontinuumcodes2023.xlsx"), sheet_name="Rural-urban Continuum Code 2023", dtype={'FIPS': str})

In [5]:
continuum_codes.head()

Unnamed: 0,FIPS,State,County_Name,Population_2020,RUCC_2023,Description
0,1001,AL,Autauga County,58805,2.0,"Metro - Counties in metro areas of 250,000 to ..."
1,1003,AL,Baldwin County,231767,3.0,Metro - Counties in metro areas of fewer than ...
2,1005,AL,Barbour County,25223,6.0,"Nonmetro - Urban population of 5,000 to 20,000..."
3,1007,AL,Bibb County,22293,1.0,Metro - Counties in metro areas of 1 million p...
4,1009,AL,Blount County,59134,1.0,Metro - Counties in metro areas of 1 million p...


In [6]:
results = []
for index1, row1 in counties.iterrows():
    state = row1['State']
    county_name = row1['Counties 1']
    if not pd.isna(county_name):
        continuum_codes_state = continuum_codes[continuum_codes['State'] == state]
        for index2, row2 in continuum_codes_state.iterrows():
            if county_name in row2['County_Name']:
                results.append({
                    'State': state,
                    'County': str(row2['County_Name']),
                    'RUCC': row2['RUCC_2023'],
                    'FIPS': row2['FIPS'],
                    'State Code': row2['FIPS'][:2],
                    'County Code': row2['FIPS'][2:]
                })
                continue
        
fips = pd.DataFrame(results)

In [7]:
fips.to_csv(os.path.join(preprocess_path, "FIPS.csv"), index=False)

# State, County Code

In [8]:
state_county_code = pd.read_excel(os.path.join(raw_data, f'2020_UA_COUNTY.xlsx'), sheet_name='2020_UA_COUNTY', dtype={'STATE': str, 'COUNTY': str})

In [9]:
state_county_code.head()

Unnamed: 0,STATE,COUNTY,STATE_NAME,COUNTY_NAME,POP_COU,HOU_COU,ALAND_COU,ALAND_Mi²_COU,POPDEN_COU,HOUDEN_COU,...,POP_RUR,POPPCT_RUR,HOU_RUR,HOUPCT_RUR,ALAND_RUR,ALAND_Mi²_RUR,ALAND_PCT_RUR,POPDEN_RUR,HOUDEN_RUR,RURALBLOCKS
0,1,1,Alabama,Autauga,58805,24350,1539634184,594.452758,98.922916,40.962044,...,23920,0.406768,9991,0.410308,1483727020,572.867002,0.963688,41.754892,17.440348,991
1,1,3,Alabama,Baldwin,231767,124148,4117656199,1589.827058,145.781265,78.088997,...,87113,0.375865,40740,0.328157,3762600021,1452.739868,0.913772,59.964624,28.043562,3181
2,1,5,Alabama,Barbour,25223,11618,2292160149,885.003034,28.500467,13.127639,...,16627,0.6592,7538,0.648821,2276027730,878.774307,0.992962,18.920672,8.577857,1011
3,1,7,Alabama,Bibb,22293,9002,1612188717,622.466064,35.814001,14.461833,...,22293,1.0,9002,1.0,1612188717,622.466064,1.0,35.814001,14.461833,1090
4,1,9,Alabama,Blount,59134,24622,1670259090,644.887035,91.69668,38.18033,...,53510,0.904894,22337,0.907197,1658933117,640.514076,0.993219,83.54227,34.873551,2207


In [10]:
state_county_names = pd.merge(fips, state_county_code, left_on=['State Code', 'County Code'], right_on=['STATE', 'COUNTY'], how='inner')
state_county_names.rename(columns={'STATE_NAME': 'State Name', 'COUNTY_NAME': 'County Name', 'State': 'State Name Code', 'County': 'County Full Name'}, inplace=True)
state_county_names = state_county_names[['State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name']]

In [11]:
state_county_names.head()

Unnamed: 0,State Code,County Code,FIPS,State Name,County Name,RUCC,State Name Code,County Full Name
0,6,37,6037,California,Los Angeles,1.0,CA,Los Angeles County
1,6,29,6029,California,Kern,2.0,CA,Kern County
2,6,25,6025,California,Imperial,3.0,CA,Imperial County
3,6,33,6033,California,Lake,4.0,CA,Lake County
4,6,23,6023,California,Humboldt,5.0,CA,Humboldt County


# Population Estimates

In [12]:
population_2018_2019_df = pd.read_csv(os.path.join(raw_data, "co-est2020-alldata.csv"), encoding = "ISO-8859-1", dtype={'STATE': str, 'COUNTY':str})
population_2018_2019_df_sub = population_2018_2019_df[['STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'POPESTIMATE2018', 'POPESTIMATE2019']].copy()
population_2018_2019_df_sub.rename(columns={'STNAME': 'State Name', 'CTYNAME': 'County Name', 'STATE': 'State Code', 'COUNTY': 'County Code'}, inplace=True)

In [13]:
population_2020_2023_df = pd.read_csv(os.path.join(raw_data, "co-est2023-alldata.csv"), encoding = "ISO-8859-1", dtype={'STATE': str, 'COUNTY':str})
population_2020_2023_df_sub = population_2020_2023_df[['STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022', 'POPESTIMATE2023']].copy()
population_2020_2023_df_sub.rename(columns={'STNAME': 'State Name', 'CTYNAME': 'County Name', 'STATE': 'State Code', 'COUNTY': 'County Code'}, inplace=True)

In [14]:
population_2018_2023_df = pd.merge(population_2018_2019_df_sub, population_2020_2023_df_sub, left_on=['State Code', 'County Code'], right_on=['State Code', 'County Code'], how='inner')
population_2018_2023_df.rename(columns={'State Name_x': 'State Name', 'County Name_x': 'County Name'}, inplace=True)
population_2018_2023_df_sub = population_2018_2023_df[['State Code', 'County Code', 'State Name', 'County Name', 'POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022', 'POPESTIMATE2023']]

In [15]:
population_2018_2023_df_sub.head()

Unnamed: 0,State Code,County Code,State Name,County Name,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023
0,1,0,Alabama,Alabama,4891628,4907965,5031864,5050380,5073903,5108468
1,1,1,Alabama,Autauga County,55533,55769,58915,59203,59726,60342
2,1,3,Alabama,Baldwin County,218071,223565,233227,239439,246531,253507
3,1,5,Alabama,Barbour County,24887,24657,24969,24533,24700,24585
4,1,7,Alabama,Bibb County,22300,22313,22188,22359,21986,21868


In [16]:
population_sub = pd.merge(population_2018_2023_df_sub, state_county_names, left_on=['State Code', 'County Code'], right_on=['State Code', 'County Code'], how='inner')
population_sub.rename(columns={'State Name_x': 'State Name', 'County Name_y': 'County Name'}, inplace=True)

In [17]:
population_sub.head()

Unnamed: 0,State Code,County Code,State Name,County Name_x,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023,FIPS,State Name_y,County Name,RUCC,State Name Code,County Full Name
0,6,21,California,Glenn County,27899,28445,28904,28718,28360,28129,6021,California,Glenn,6.0,CA,Glenn County
1,6,23,California,Humboldt County,136502,135839,136262,134897,134944,133985,6023,California,Humboldt,5.0,CA,Humboldt County
2,6,25,California,Imperial County,181062,180439,179612,179138,178940,179057,6025,California,Imperial,3.0,CA,Imperial County
3,6,29,California,Kern County,893618,898898,905910,912709,916751,913820,6029,California,Kern,2.0,CA,Kern County
4,6,33,California,Lake County,64394,64463,68199,68645,68172,67878,6033,California,Lake,4.0,CA,Lake County


In [18]:
population_df = population_sub[['State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name', 'POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022', 'POPESTIMATE2023']].copy()

In [19]:
population_df.head()

Unnamed: 0,State Code,County Code,FIPS,State Name,County Name,RUCC,State Name Code,County Full Name,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023
0,6,21,6021,California,Glenn,6.0,CA,Glenn County,27899,28445,28904,28718,28360,28129
1,6,23,6023,California,Humboldt,5.0,CA,Humboldt County,136502,135839,136262,134897,134944,133985
2,6,25,6025,California,Imperial,3.0,CA,Imperial County,181062,180439,179612,179138,178940,179057
3,6,29,6029,California,Kern,2.0,CA,Kern County,893618,898898,905910,912709,916751,913820
4,6,33,6033,California,Lake,4.0,CA,Lake County,64394,64463,68199,68645,68172,67878


# Annual AQI Data

In [20]:
years = range(2019, 2024, 1)
zip_files = [f'annual_aqi_by_county_{year}' for year in years]

# Initialize an empty list to store dataframes
df_list = []

for year in years:
    zip_file_name = os.path.join(air_quality_path, f'annual_aqi_by_county_{year}.zip')

    with zipfile.ZipFile(zip_file_name, 'r') as z:
        csv_file = z.namelist()[0]
        with z.open(csv_file) as f:
            df = pd.read_csv(f)
            df_list.append(df)

merged_aqi = pd.concat(df_list, ignore_index=True)
merged_aqi.to_csv(os.path.join(air_quality_path, 'merged_aqi_by_county_2018_2023.csv'), index=False)

In [21]:
merged_aqi

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,Baldwin,2019,271,237,34,0,0,0,0,80,52,37,0,0,220,51,0
1,Alabama,Clay,2019,107,97,10,0,0,0,0,67,50,30,0,0,0,107,0
2,Alabama,Colbert,2019,263,252,11,0,0,0,0,61,47,37,0,0,228,35,0
3,Alabama,DeKalb,2019,361,324,37,0,0,0,0,90,51,39,0,0,331,30,0
4,Alabama,Elmore,2019,228,208,20,0,0,0,0,100,50,39,0,0,228,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4969,Wyoming,Sublette,2023,211,92,113,6,0,0,0,122,84,54,0,0,209,2,0
4970,Wyoming,Sweetwater,2023,212,125,78,8,0,1,0,211,87,49,0,0,170,33,9
4971,Wyoming,Teton,2023,244,203,41,0,0,0,0,74,54,46,1,0,240,3,0
4972,Wyoming,Uinta,2023,181,181,0,0,0,0,0,32,12,6,0,0,0,0,181


In [22]:
merged_aqi_fips = pd.merge(merged_aqi, population_df, left_on=['State', 'County'], right_on=['State Name', 'County Name'], how='inner')

In [23]:
merged_aqi_fips.to_csv(os.path.join(preprocess_path, "merged_aqi_county.csv"), index=False)

In [24]:
merged_aqi_fips[merged_aqi_fips.Year == 2019].shape

(22, 32)

In [25]:
merged_aqi[merged_aqi.Year == 2019].shape

(1020, 18)

In [26]:
for year in years:
    print(f'{year}: {merged_aqi_fips[merged_aqi_fips.Year == year].shape}')
    print(list(merged_aqi_fips[merged_aqi_fips.Year == year].County))

2019: (22, 32)
['Glenn', 'Humboldt', 'Imperial', 'Kern', 'Lake', 'Los Angeles', 'Plumas', 'Siskiyou', 'Trinity', 'Sarpy', 'Essex', 'Franklin', 'Hamilton', 'Jefferson', 'Armstrong', 'Blair', 'Indiana', 'Somerset', 'Tioga', 'Dallas', 'Hood', 'Maverick']
2020: (21, 32)
['Glenn', 'Humboldt', 'Imperial', 'Kern', 'Lake', 'Los Angeles', 'Plumas', 'Siskiyou', 'Trinity', 'Sarpy', 'Essex', 'Hamilton', 'Jefferson', 'Armstrong', 'Blair', 'Indiana', 'Somerset', 'Tioga', 'Dallas', 'Hood', 'Maverick']
2021: (21, 32)
['Glenn', 'Humboldt', 'Imperial', 'Kern', 'Lake', 'Los Angeles', 'Plumas', 'Siskiyou', 'Trinity', 'Sarpy', 'Essex', 'Hamilton', 'Jefferson', 'Armstrong', 'Blair', 'Indiana', 'Somerset', 'Tioga', 'Dallas', 'Hood', 'Maverick']
2022: (21, 32)
['Glenn', 'Humboldt', 'Imperial', 'Kern', 'Lake', 'Los Angeles', 'Plumas', 'Siskiyou', 'Trinity', 'Sarpy', 'Essex', 'Hamilton', 'Jefferson', 'Armstrong', 'Blair', 'Indiana', 'Somerset', 'Tioga', 'Dallas', 'Hood', 'Maverick']
2023: (21, 32)
['Glenn', 'Hu

In [27]:
merged_aqi_fips[merged_aqi_fips.State == 'California'].shape

(45, 32)

In [28]:
population_df[population_df['State Name'] == 'California']

Unnamed: 0,State Code,County Code,FIPS,State Name,County Name,RUCC,State Name Code,County Full Name,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023
0,6,21,6021,California,Glenn,6.0,CA,Glenn County,27899,28445,28904,28718,28360,28129
1,6,23,6023,California,Humboldt,5.0,CA,Humboldt County,136502,135839,136262,134897,134944,133985
2,6,25,6025,California,Imperial,3.0,CA,Imperial County,181062,180439,179612,179138,178940,179057
3,6,29,6029,California,Kern,2.0,CA,Kern County,893618,898898,905910,912709,916751,913820
4,6,33,6033,California,Lake,4.0,CA,Lake County,64394,64463,68199,68645,68172,67878
5,6,37,6037,California,Los Angeles,1.0,CA,Los Angeles County,10061533,10011602,9992813,9809462,9719765,9663345
6,6,63,6063,California,Plumas,9.0,CA,Plumas County,18830,19052,19746,19958,19443,19131
7,6,93,6093,California,Siskiyou,7.0,CA,Siskiyou County,43624,43663,43995,44139,43786,42905
8,6,105,6105,California,Trinity,8.0,CA,Trinity County,12598,12337,16091,16082,15778,15670


In [29]:
merged_aqi_fips[['State', 'County']][merged_aqi_fips.State == 'California'].County.unique()

array(['Glenn', 'Humboldt', 'Imperial', 'Kern', 'Lake', 'Los Angeles',
       'Plumas', 'Siskiyou', 'Trinity'], dtype=object)

In [73]:
merged_aqi_fips.head()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,...,County Name,RUCC,State Name Code,County Full Name,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023
0,California,Glenn,2019,362,302,60,0,0,0,0,...,Glenn,6.0,CA,Glenn County,27899,28445,28904,28718,28360,28129
1,California,Humboldt,2019,363,353,10,0,0,0,0,...,Humboldt,5.0,CA,Humboldt County,136502,135839,136262,134897,134944,133985
2,California,Imperial,2019,365,125,219,19,2,0,0,...,Imperial,3.0,CA,Imperial County,181062,180439,179612,179138,178940,179057
3,California,Kern,2019,365,119,151,87,6,1,1,...,Kern,2.0,CA,Kern County,893618,898898,905910,912709,916751,913820
4,California,Lake,2019,360,360,0,0,0,0,0,...,Lake,4.0,CA,Lake County,64394,64463,68199,68645,68172,67878


In [30]:
population_df.head()

Unnamed: 0,State Code,County Code,FIPS,State Name,County Name,RUCC,State Name Code,County Full Name,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023
0,6,21,6021,California,Glenn,6.0,CA,Glenn County,27899,28445,28904,28718,28360,28129
1,6,23,6023,California,Humboldt,5.0,CA,Humboldt County,136502,135839,136262,134897,134944,133985
2,6,25,6025,California,Imperial,3.0,CA,Imperial County,181062,180439,179612,179138,178940,179057
3,6,29,6029,California,Kern,2.0,CA,Kern County,893618,898898,905910,912709,916751,913820
4,6,33,6033,California,Lake,4.0,CA,Lake County,64394,64463,68199,68645,68172,67878


In [72]:
df_melted = merged_aqi_fips.melt(id_vars=['State', 'County', 'Year', 'RUCC', 'Max AQI'], 
                    value_vars=['POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE2020', 
                                'POPESTIMATE2021', 'POPESTIMATE2022', 'POPESTIMATE2023'],
                    var_name='Population_Year', value_name='Population')
 
df_melted['Population_Year'] = df_melted['Population_Year'].str.extract('(\d{4})').astype(int)
 
# check if it matches 
df_filtered = df_melted[df_melted['Year'] == df_melted['Population_Year']]
 
# Group the data by RUCC and Year, then calculate the average Max AQI and average Population for each group
df_grouped_avg = df_filtered.groupby(['RUCC', 'Year']).agg({
    'Max AQI': 'mean',
    'Population': 'mean'
}).reset_index()
 
# Calculate correlation between average Max AQI and population for each RUCC
correlation_results_filtered = {}
 
for rucc in df_grouped_avg['RUCC'].unique():
    df_rucc = df_grouped_avg[df_grouped_avg['RUCC'] == rucc]
    correlation = df_rucc['Max AQI'].corr(df_rucc['Population'])
    correlation_results_filtered[rucc] = correlation
 
correlation_filtered_df = pd.DataFrame.from_dict(correlation_results_filtered, orient='index', columns=['Correlation'])
 
print(correlation_filtered_df)

     Correlation
1.0    -0.696527
2.0    -0.892048
3.0    -0.139633
4.0     0.384990
5.0    -0.629918
6.0     0.036570
7.0    -0.215649
8.0     0.520500
9.0     0.914361


### CO

In [115]:
#CO
co_data_path = "/workspace/Assignment/Datasets/RawData/Air Quality Data/Pollutants/States/CO/"
co_data_files = ["CO_2018_CA.csv", "CO_2018_NY.csv", "CO_2018_PA.csv", "CO_2018_TX.csv", 
                 "CO_2019_CA.csv", "CO_2019_NY.csv", "CO_2019_PA.csv", "CO_2019_TX.csv", 
                 "CO_2020_CA.csv", "CO_2020_NY.csv", "CO_2020_PA.csv", "CO_2020_TX.csv", 
                 "CO_2021_CA.csv", "CO_2021_NY.csv", "CO_2021_PA.csv", "CO_2021_TX.csv",
                 "CO_2022_CA.csv", "CO_2022_NY.csv", "CO_2022_PA.csv", "CO_2022_TX.csv", 
                 "CO_2023_CA.csv", "CO_2023_NY.csv", "CO_2023_PA.csv", "CO_2023_TX.csv", ]


In [116]:
dataframes = []
#years = ['2018', '2019', '2020', '2021', '2022', '2023']

for file in co_data_files:
    year = file.split('_')[1]
    df = pd.read_csv(co_data_path+file)
    df['Year'] = year
    dataframes.append(df)

merged_df_co = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
merged_df_co.to_csv(co_data_path+"CO_2018_2023.csv", index=False)

In [118]:
merged_df_co.head()

Unnamed: 0,Date,Source,Site ID,POC,Daily Max 8-hour CO Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude,Year
0,01/01/2018,AQS,60010009,1,0.9,ppm,10,Oakland,18,75.0,...,54.0,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935,2018
1,01/02/2018,AQS,60010009,1,1.2,ppm,14,Oakland,24,100.0,...,54.0,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935,2018
2,01/03/2018,AQS,60010009,1,1.4,ppm,16,Oakland,24,100.0,...,54.0,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935,2018
3,01/04/2018,AQS,60010009,1,0.7,ppm,8,Oakland,24,100.0,...,54.0,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935,2018
4,01/05/2018,AQS,60010009,1,0.6,ppm,7,Oakland,24,100.0,...,54.0,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935,2018


In [121]:
filtered_df_co = pd.merge(merged_df_co, state_county_names, left_on=['State', 'County'], right_on=['State Name', 'County Name'], how='inner')
filtered_df_co = filtered_df_co[['Date','Year', 'Source', 'Site ID', 'Daily Max 8-hour CO Concentration', 'Units', 'Daily AQI Value', 'State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name']]
filtered_df_co.head()

Unnamed: 0,Date,Year,Source,Site ID,Daily Max 8-hour CO Concentration,Units,Daily AQI Value,State Code,County Code,FIPS,State Name,County Name,RUCC,State Name Code,County Full Name
0,01/01/2018,2018,AQS,60231004,0.4,ppm,5,6,23,6023,California,Humboldt,5.0,CA,Humboldt County
1,01/02/2018,2018,AQS,60231004,0.6,ppm,7,6,23,6023,California,Humboldt,5.0,CA,Humboldt County
2,01/03/2018,2018,AQS,60231004,0.7,ppm,8,6,23,6023,California,Humboldt,5.0,CA,Humboldt County
3,01/04/2018,2018,AQS,60231004,0.3,ppm,3,6,23,6023,California,Humboldt,5.0,CA,Humboldt County
4,01/05/2018,2018,AQS,60231004,0.2,ppm,2,6,23,6023,California,Humboldt,5.0,CA,Humboldt County


In [123]:
filtered_df_co['Date'] = pd.to_datetime(filtered_df_co['Date'], format='%m/%d/%Y')

# Filter for the specific year if needed (e.g., 2020)
#filtered_df_so2_2020 = filtered_df_so2[filtered_df_so2['Date'].dt.year == 2020]

# Group by State Name and County Name, then calculate the mean of Daily Max 1-hour SO2 Concentration
mean_concentration_co = filtered_df_co.groupby(['Year', 'State Name', 'County Name', 'County Code', 'State Code', 'RUCC', 'FIPS'])['Daily Max 8-hour CO Concentration'].mean().reset_index()

# Rename the column for clarity
mean_concentration_co.rename(columns={'Daily Max 8-hour CO Concentration': 'Mean Daily Max 8-hour CO Concentration'}, inplace=True)
#merged_df_so2 = pd.merge(filtered_df_so2, mean_concentration_so2, on=['State Name', 'County Name'], suffixes=('', '_Mean'))

#final_df_so2 = merged_df_so2.drop_duplicates(subset=['State Name', 'County Name'])
# Display the result
mean_concentration_co

Unnamed: 0,Year,State Name,County Name,County Code,State Code,RUCC,FIPS,Mean Daily Max 8-hour CO Concentration
0,2018,California,Humboldt,23,6,5.0,6023,0.232778
1,2018,California,Imperial,25,6,3.0,6025,0.604558
2,2018,California,Kern,29,6,2.0,6029,0.29122
3,2018,California,Los Angeles,37,6,1.0,6037,0.499778
4,2018,Texas,Dallas,113,48,1.0,48113,0.304559
5,2019,California,Humboldt,23,6,5.0,6023,0.285515
6,2019,California,Imperial,25,6,3.0,6025,0.443836
7,2019,California,Kern,29,6,2.0,6029,0.326648
8,2019,California,Los Angeles,37,6,1.0,6037,0.475232
9,2019,Texas,Dallas,113,48,1.0,48113,0.273464


In [129]:
final_df_co = pd.merge(mean_concentration_co, population_df, on=['State Code', 'County Code', 'State Name', 'County Name', 'RUCC', 'FIPS'])
final_df_co

Unnamed: 0,Year,State Name,County Name,County Code,State Code,RUCC,FIPS,Mean Daily Max 8-hour CO Concentration,State Name Code,County Full Name,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023
0,2018,California,Humboldt,23,6,5.0,6023,0.232778,CA,Humboldt County,136502,135839,136262,134897,134944,133985
1,2018,California,Imperial,25,6,3.0,6025,0.604558,CA,Imperial County,181062,180439,179612,179138,178940,179057
2,2018,California,Kern,29,6,2.0,6029,0.29122,CA,Kern County,893618,898898,905910,912709,916751,913820
3,2018,California,Los Angeles,37,6,1.0,6037,0.499778,CA,Los Angeles County,10061533,10011602,9992813,9809462,9719765,9663345
4,2018,Texas,Dallas,113,48,1.0,48113,0.304559,TX,Dallas County,2629764,2635603,2610112,2588235,2601993,2606358
5,2019,California,Humboldt,23,6,5.0,6023,0.285515,CA,Humboldt County,136502,135839,136262,134897,134944,133985
6,2019,California,Imperial,25,6,3.0,6025,0.443836,CA,Imperial County,181062,180439,179612,179138,178940,179057
7,2019,California,Kern,29,6,2.0,6029,0.326648,CA,Kern County,893618,898898,905910,912709,916751,913820
8,2019,California,Los Angeles,37,6,1.0,6037,0.475232,CA,Los Angeles County,10061533,10011602,9992813,9809462,9719765,9663345
9,2019,Texas,Dallas,113,48,1.0,48113,0.273464,TX,Dallas County,2629764,2635603,2610112,2588235,2601993,2606358


In [145]:
population_columns = {
    2018: 'POPESTIMATE2018',
    2019: 'POPESTIMATE2019',
    2020: 'POPESTIMATE2020',
    2021: 'POPESTIMATE2021',
    2022: 'POPESTIMATE2022',
    2023: 'POPESTIMATE2023'
}

# Create a new column 'Population' and assign the appropriate population estimate
final_df_co['Population'] = final_df_co.apply(lambda row: row[population_columns[int(row['Year'])]], axis=1)

# Drop the original population estimate columns
final_df_co.drop(columns=[col for col in final_df_co.columns if col.startswith('POPESTIMATE')], inplace=True)

# Display the modified DataFrame
print(final_df_co)

KeyError: 'POPESTIMATE2018'

In [137]:
correlation_results_filtered_co= {}
for state_code, county_code, year in final_df_co[['State Code', 'County Code', 'Year']].drop_duplicates().values:
    # Filter the DataFrame for each state-county pair
    print(state_code, county_code, year)
    df_filtered_co = final_df_co[(final_df_co['State Code'] == state_code) & (final_df_co['County Code'] == county_code) & (final_df_co['Year'] == year)]
    print(df_filtered_so2)
    
    # Calculate the correlation
    #print(df_filtered_co['POPESTIMATE2020'][0])
    correlation = df_filtered_co['Population'].corr(df_filtered_co['Mean Daily Max 8-hour CO Concentration'])
    print(correlation)
    # Store the correlation in the dictionary
    correlation_results_filtered_co[(state_code, county_code)] = correlation
    break

#correlation = final_df_so2['POPESTIMATE2020'].corr(final_df_so2['Mean Daily Max 1-hour SO2 Concentration'])

# Display the result
#print("Correlation between POPESTIMATE2020 and Mean Daily Max 1-hour SO2 Concentration:", correlation)

#correlation_filtered_df = pd.DataFrame.from_dict(correlation_results_filtered_co, orient='index', columns=['Correlation'])
 

06 023 2018
   State Name County Name County Code State Code  RUCC   FIPS  \
0  California    Humboldt         023         06   5.0  06023   

   Mean Daily Max 1-hour SO2 Concentration State Name Code County Full Name  \
0                                 0.397935              CA  Humboldt County   

   POPESTIMATE2018  POPESTIMATE2019  POPESTIMATE2020  POPESTIMATE2021  \
0           136502           135839           136262           134897   

   POPESTIMATE2022  POPESTIMATE2023  
0           134944           133985  
nan


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [None]:
pb_data_files = ["Pb_2020_CA.csv", "Pb_2020_PA.csv", "Pb_2020_TX.csv"]
no2_data_files = ["NO2_2020_CA.csv", "NO2_2020_NY.csv", "NO2_2020_PA.csv", "NO2_2020_TX.csv"]
o3_data_files = ["O3_2020_CA.csv", "O3_2020_NY.csv", "O3_2020_PA.csv", "O3_2020_TX.csv"]
pm10_data_files = ["PM10_2020_NY.csv", "PM10_2020_PA.csv", "PM10_2020_TX.csv"]
pm25_data_files = ["PM25_2020_CA.csv", "PM25_2020_NY.csv", "PM25_2020_PA.csv", "PM25_2020_TX.csv"]
so2_data_files = ["SO2_2020_CA.csv", "SO2_2020_NY.csv", "SO2_2020_PA.csv", "SO2_2020_TX.csv"]

In [37]:
dataframes = []

for file in co_data_files:
    df = pd.read_csv(co_data_path+file)
    dataframes.append(df)

merged_df_co = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
merged_df_co.to_csv(co_data_path+"CO_2020_combined.csv", index=False)

In [63]:
merged_df_co.head()

Unnamed: 0,Date,Source,Site ID,POC,Daily Max 8-hour CO Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,AQS Parameter Description,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,01/01/2020,AQS,60010009,1,0.7,ppm,8,Oakland,18,75.0,...,Carbon monoxide,54,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935
1,01/02/2020,AQS,60010009,1,0.7,ppm,8,Oakland,24,100.0,...,Carbon monoxide,54,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935
2,01/03/2020,AQS,60010009,1,0.9,ppm,10,Oakland,24,100.0,...,Carbon monoxide,54,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935
3,01/04/2020,AQS,60010009,1,0.8,ppm,9,Oakland,24,100.0,...,Carbon monoxide,54,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935
4,01/05/2020,AQS,60010009,1,0.7,ppm,8,Oakland,24,100.0,...,Carbon monoxide,54,41860,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.743065,-122.169935


In [41]:
dataframes = []

for file in pb_data_files:
    df = pd.read_csv(co_data_path+file)
    dataframes.append(df)

merged_df_pb = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
merged_df_pb.to_csv(co_data_path+"Pb_2020_combined.csv", index=False)

In [42]:
merged_df_pb

Unnamed: 0,Date,Source,Site ID,POC,Daily Mean Pb Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,AQS Parameter Description,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,01/04/2020,AQS,60070008,7,0.00135,ug/m3 SC,.,Chico-East Avenue,1,100.0,...,Lead (TSP) STP,305,17020,"Chico, CA",6,California,7,Butte,39.761680,-121.840470
1,01/16/2020,AQS,60070008,7,0.00065,ug/m3 SC,.,Chico-East Avenue,1,100.0,...,Lead (TSP) STP,305,17020,"Chico, CA",6,California,7,Butte,39.761680,-121.840470
2,01/28/2020,AQS,60070008,7,0.00065,ug/m3 SC,.,Chico-East Avenue,1,100.0,...,Lead (TSP) STP,305,17020,"Chico, CA",6,California,7,Butte,39.761680,-121.840470
3,02/09/2020,AQS,60070008,7,0.00065,ug/m3 SC,.,Chico-East Avenue,1,100.0,...,Lead (TSP) STP,305,17020,"Chico, CA",6,California,7,Butte,39.761680,-121.840470
4,02/23/2020,AQS,60070008,7,0.00174,ug/m3 SC,.,Chico-East Avenue,1,100.0,...,Lead (TSP) STP,305,17020,"Chico, CA",6,California,7,Butte,39.761680,-121.840470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2693,11/11/2020,AQS,482570020,2,0.00300,ug/m3 LC,.,Terrell Temtex,1,100.0,...,Lead (TSP) LC,192,19100,"Dallas-Fort Worth-Arlington, TX",48,Texas,257,Kaufman,32.731919,-96.317911
2694,11/23/2020,AQS,482570020,2,0.00900,ug/m3 LC,.,Terrell Temtex,1,100.0,...,Lead (TSP) LC,192,19100,"Dallas-Fort Worth-Arlington, TX",48,Texas,257,Kaufman,32.731919,-96.317911
2695,12/05/2020,AQS,482570020,2,0.00500,ug/m3 LC,.,Terrell Temtex,1,100.0,...,Lead (TSP) LC,192,19100,"Dallas-Fort Worth-Arlington, TX",48,Texas,257,Kaufman,32.731919,-96.317911
2696,12/17/2020,AQS,482570020,2,0.00600,ug/m3 LC,.,Terrell Temtex,1,100.0,...,Lead (TSP) LC,192,19100,"Dallas-Fort Worth-Arlington, TX",48,Texas,257,Kaufman,32.731919,-96.317911


In [43]:
dataframes = []

for file in no2_data_files:
    df = pd.read_csv(co_data_path+file)
    dataframes.append(df)

merged_df_no2 = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
merged_df_no2.to_csv(co_data_path+"NO2_2020_combined.csv", index=False)

In [44]:
merged_df_no2

Unnamed: 0,Date,Source,Site ID,POC,Daily Max 1-hour NO2 Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,AQS Parameter Description,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,01/01/2020,AQS,60010007,1,18.5,ppb,17,Livermore,23,96.0,...,Nitrogen dioxide (NO2),74,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
1,01/02/2020,AQS,60010007,1,22.7,ppb,21,Livermore,23,96.0,...,Nitrogen dioxide (NO2),74,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
2,01/03/2020,AQS,60010007,1,23.6,ppb,22,Livermore,23,96.0,...,Nitrogen dioxide (NO2),74,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
3,01/04/2020,AQS,60010007,1,21.2,ppb,20,Livermore,23,96.0,...,Nitrogen dioxide (NO2),74,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
4,01/05/2020,AQS,60010007,1,23.0,ppb,22,Livermore,23,96.0,...,Nitrogen dioxide (NO2),74,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61805,12/27/2020,AQS,484931038,1,10.3,ppb,9,Floresville Hospital Boulevard,22,92.0,...,Nitrogen dioxide (NO2),99,41700.0,"San Antonio-New Braunfels, TX",48,Texas,493,Wilson,29.130700,-98.148100
61806,12/28/2020,AQS,484931038,1,6.7,ppb,6,Floresville Hospital Boulevard,24,100.0,...,Nitrogen dioxide (NO2),99,41700.0,"San Antonio-New Braunfels, TX",48,Texas,493,Wilson,29.130700,-98.148100
61807,12/29/2020,AQS,484931038,1,2.9,ppb,2,Floresville Hospital Boulevard,24,100.0,...,Nitrogen dioxide (NO2),99,41700.0,"San Antonio-New Braunfels, TX",48,Texas,493,Wilson,29.130700,-98.148100
61808,12/30/2020,AQS,484931038,1,3.4,ppb,3,Floresville Hospital Boulevard,24,100.0,...,Nitrogen dioxide (NO2),99,41700.0,"San Antonio-New Braunfels, TX",48,Texas,493,Wilson,29.130700,-98.148100


In [47]:
dataframes = []

for file in pm10_data_files:
    df = pd.read_csv(co_data_path+file)
    dataframes.append(df)

merged_df_pm10 = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
merged_df_pm10.to_csv(co_data_path+"Pm10_2020_combined.csv", index=False)

In [48]:
merged_df_pm10

Unnamed: 0,Date,Source,Site ID,POC,Daily Mean PM10 Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,AQS Parameter Description,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,01/04/2020,AQS,360050110,3,13,ug/m3 SC,12,IS 52,1,100.0,...,PM10 Total 0-10um STP,127,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.816000,-73.902000
1,01/10/2020,AQS,360050110,3,17,ug/m3 SC,16,IS 52,1,100.0,...,PM10 Total 0-10um STP,127,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.816000,-73.902000
2,01/22/2020,AQS,360050110,3,23,ug/m3 SC,21,IS 52,1,100.0,...,PM10 Total 0-10um STP,127,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.816000,-73.902000
3,01/28/2020,AQS,360050110,3,3,ug/m3 SC,3,IS 52,1,100.0,...,PM10 Total 0-10um STP,127,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.816000,-73.902000
4,02/03/2020,AQS,360050110,3,16,ug/m3 SC,15,IS 52,1,100.0,...,PM10 Total 0-10um STP,127,35620,"New York-Newark-Jersey City, NY-NJ-PA",36,New York,5,Bronx,40.816000,-73.902000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6124,12/05/2020,AQS,484790017,1,17,ug/m3 SC,16,Laredo Bridge,1,100.0,...,PM10 Total 0-10um STP,62,29700,"Laredo, TX",48,Texas,479,Webb,27.501826,-99.502984
6125,12/11/2020,AQS,484790017,1,36,ug/m3 SC,33,Laredo Bridge,1,100.0,...,PM10 Total 0-10um STP,62,29700,"Laredo, TX",48,Texas,479,Webb,27.501826,-99.502984
6126,12/17/2020,AQS,484790017,1,22,ug/m3 SC,20,Laredo Bridge,1,100.0,...,PM10 Total 0-10um STP,62,29700,"Laredo, TX",48,Texas,479,Webb,27.501826,-99.502984
6127,12/23/2020,AQS,484790017,1,27,ug/m3 SC,25,Laredo Bridge,1,100.0,...,PM10 Total 0-10um STP,62,29700,"Laredo, TX",48,Texas,479,Webb,27.501826,-99.502984


In [49]:
dataframes = []

for file in o3_data_files:
    df = pd.read_csv(co_data_path+file)
    dataframes.append(df)

merged_df_o3 = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
merged_df_o3.to_csv(co_data_path+"O3_2020_combined.csv", index=False)

In [50]:
merged_df_o3

Unnamed: 0,Date,Source,Site ID,POC,Daily Max 8-hour Ozone Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,AQS Parameter Description,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,01/01/2020,AQS,60010007,1,0.025,ppm,23,Livermore,17,100.0,...,Ozone,47,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
1,01/02/2020,AQS,60010007,1,0.017,ppm,16,Livermore,17,100.0,...,Ozone,47,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
2,01/03/2020,AQS,60010007,1,0.013,ppm,12,Livermore,17,100.0,...,Ozone,47,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
3,01/04/2020,AQS,60010007,1,0.028,ppm,26,Livermore,17,100.0,...,Ozone,47,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
4,01/05/2020,AQS,60010007,1,0.031,ppm,29,Livermore,17,100.0,...,Ozone,47,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111654,12/26/2020,AQS,484790016,1,0.041,ppm,38,Laredo College,17,100.0,...,Ozone,87,29700.0,"Laredo, TX",48,Texas,479,Webb,27.507904,-99.523949
111655,12/27/2020,AQS,484790016,1,0.038,ppm,35,Laredo College,17,100.0,...,Ozone,87,29700.0,"Laredo, TX",48,Texas,479,Webb,27.507904,-99.523949
111656,12/28/2020,AQS,484790016,1,0.039,ppm,36,Laredo College,17,100.0,...,Ozone,87,29700.0,"Laredo, TX",48,Texas,479,Webb,27.507904,-99.523949
111657,12/29/2020,AQS,484790016,1,0.035,ppm,32,Laredo College,17,100.0,...,Ozone,87,29700.0,"Laredo, TX",48,Texas,479,Webb,27.507904,-99.523949


In [51]:
dataframes = []

for file in pm25_data_files:
    df = pd.read_csv(co_data_path+file)
    dataframes.append(df)

merged_df_pm25 = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
merged_df_pm25.to_csv(co_data_path+"PM25_2020_combined.csv", index=False)

In [52]:
merged_df_pm25

Unnamed: 0,Date,Source,Site ID,POC,Daily Mean PM2.5 Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,Method Code,Method Description,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,01/01/2020,AQS,60010007,3,8.6,ug/m3 LC,48,Livermore,1,100.0,...,170,Met One BAM-1020 Mass Monitor w/VSCC,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
1,01/02/2020,AQS,60010007,3,4.5,ug/m3 LC,25,Livermore,1,100.0,...,170,Met One BAM-1020 Mass Monitor w/VSCC,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
2,01/03/2020,AQS,60010007,3,14.2,ug/m3 LC,61,Livermore,1,100.0,...,170,Met One BAM-1020 Mass Monitor w/VSCC,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
3,01/04/2020,AQS,60010007,3,10.9,ug/m3 LC,54,Livermore,1,100.0,...,170,Met One BAM-1020 Mass Monitor w/VSCC,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
4,01/05/2020,AQS,60010007,3,7.8,ug/m3 LC,43,Livermore,1,100.0,...,170,Met One BAM-1020 Mass Monitor w/VSCC,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.687526,-121.784217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119514,12/27/2020,AQS,484790313,1,8.3,ug/m3 LC,46,World Trade Bridge,1,100.0,...,209,Met One BAM-1022 Mass Monitor w/ VSCC or TE-PM...,29700.0,"Laredo, TX",48,Texas,479,Webb,27.599444,-99.533333
119515,12/28/2020,AQS,484790313,1,8.4,ug/m3 LC,47,World Trade Bridge,1,100.0,...,209,Met One BAM-1022 Mass Monitor w/ VSCC or TE-PM...,29700.0,"Laredo, TX",48,Texas,479,Webb,27.599444,-99.533333
119516,12/29/2020,AQS,484790313,1,8.1,ug/m3 LC,45,World Trade Bridge,1,100.0,...,209,Met One BAM-1022 Mass Monitor w/ VSCC or TE-PM...,29700.0,"Laredo, TX",48,Texas,479,Webb,27.599444,-99.533333
119517,12/30/2020,AQS,484790313,1,10.5,ug/m3 LC,54,World Trade Bridge,1,100.0,...,209,Met One BAM-1022 Mass Monitor w/ VSCC or TE-PM...,29700.0,"Laredo, TX",48,Texas,479,Webb,27.599444,-99.533333


In [59]:
dataframes = []

for file in so2_data_files:
    df = pd.read_csv(co_data_path+file)
    dataframes.append(df)

merged_df_so2 = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame to a new CSV file
merged_df_so2.to_csv(co_data_path+"SO2_2020_combined.csv", index=False)

In [60]:
merged_df_so2

Unnamed: 0,Date,Source,Site ID,POC,Daily Max 1-hour SO2 Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,AQS Parameter Description,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,01/01/2020,AQS,60010011,1,0.5,ppb,0,Oakland West,22,92.0,...,Sulfur dioxide,60,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.814781,-122.282347
1,01/02/2020,AQS,60010011,1,1.2,ppb,1,Oakland West,22,92.0,...,Sulfur dioxide,60,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.814781,-122.282347
2,01/03/2020,AQS,60010011,1,1.8,ppb,1,Oakland West,19,79.0,...,Sulfur dioxide,60,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.814781,-122.282347
3,01/04/2020,AQS,60010011,1,1.4,ppb,1,Oakland West,22,92.0,...,Sulfur dioxide,60,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.814781,-122.282347
4,01/05/2020,AQS,60010011,1,0.7,ppb,0,Oakland West,22,92.0,...,Sulfur dioxide,60,41860.0,"San Francisco-Oakland-Hayward, CA",6,California,1,Alameda,37.814781,-122.282347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34560,12/27/2020,AQS,484530014,2,0.7,ppb,0,Austin North Hills Drive,24,100.0,...,Sulfur dioxide,100,12420.0,"Austin-Round Rock, TX",48,Texas,453,Travis,30.354944,-97.761803
34561,12/28/2020,AQS,484530014,2,0.0,ppb,0,Austin North Hills Drive,24,100.0,...,Sulfur dioxide,100,12420.0,"Austin-Round Rock, TX",48,Texas,453,Travis,30.354944,-97.761803
34562,12/29/2020,AQS,484530014,2,0.1,ppb,0,Austin North Hills Drive,24,100.0,...,Sulfur dioxide,100,12420.0,"Austin-Round Rock, TX",48,Texas,453,Travis,30.354944,-97.761803
34563,12/30/2020,AQS,484530014,2,0.1,ppb,0,Austin North Hills Drive,24,100.0,...,Sulfur dioxide,100,12420.0,"Austin-Round Rock, TX",48,Texas,453,Travis,30.354944,-97.761803


In [64]:
state_county_names.head()

Unnamed: 0,State Code,County Code,FIPS,State Name,County Name,RUCC,State Name Code,County Full Name
0,6,37,6037,California,Los Angeles,1.0,CA,Los Angeles County
1,6,29,6029,California,Kern,2.0,CA,Kern County
2,6,25,6025,California,Imperial,3.0,CA,Imperial County
3,6,33,6033,California,Lake,4.0,CA,Lake County
4,6,23,6023,California,Humboldt,5.0,CA,Humboldt County


In [70]:
filtered_df_co = pd.merge(merged_df_co, state_county_names, left_on=['State', 'County'], right_on=['State Name', 'County Name'], how='inner')
filtered_df_pb = pd.merge(merged_df_pb, state_county_names, left_on=['State', 'County'], right_on=['State Name', 'County Name'], how='inner')
filtered_df_no2 = pd.merge(merged_df_no2, state_county_names, left_on=['State', 'County'], right_on=['State Name', 'County Name'], how='inner')
filtered_df_pm10 = pd.merge(merged_df_pm10, state_county_names, left_on=['State', 'County'], right_on=['State Name', 'County Name'], how='inner')
filtered_df_pm25 = pd.merge(merged_df_pm25, state_county_names, left_on=['State', 'County'], right_on=['State Name', 'County Name'], how='inner')
filtered_df_o3 = pd.merge(merged_df_o3, state_county_names, left_on=['State', 'County'], right_on=['State Name', 'County Name'], how='inner')
filtered_df_so2 = pd.merge(merged_df_so2, state_county_names, left_on=['State', 'County'], right_on=['State Name', 'County Name'], how='inner')

In [81]:
filtered_df_pb.head()

Unnamed: 0,Date,Source,Site ID,POC,Daily Mean Pb Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,Site Latitude,Site Longitude,State Code,County Code,FIPS,State Name,County Name,RUCC,State Name Code,County Full Name
0,01/04/2020,AQS,60250005,7,0.0138,ug/m3 SC,.,Calexico-Ethel Street,1,100.0,...,32.67618,-115.48307,6,25,6025,California,Imperial,3.0,CA,Imperial County
1,01/16/2020,AQS,60250005,7,0.0195,ug/m3 SC,.,Calexico-Ethel Street,1,100.0,...,32.67618,-115.48307,6,25,6025,California,Imperial,3.0,CA,Imperial County
2,01/28/2020,AQS,60250005,7,0.00946,ug/m3 SC,.,Calexico-Ethel Street,1,100.0,...,32.67618,-115.48307,6,25,6025,California,Imperial,3.0,CA,Imperial County
3,02/09/2020,AQS,60250005,7,0.00143,ug/m3 SC,.,Calexico-Ethel Street,1,100.0,...,32.67618,-115.48307,6,25,6025,California,Imperial,3.0,CA,Imperial County
4,02/21/2020,AQS,60250005,7,0.00846,ug/m3 SC,.,Calexico-Ethel Street,1,100.0,...,32.67618,-115.48307,6,25,6025,California,Imperial,3.0,CA,Imperial County


In [82]:
filtered_df_co = filtered_df_co[['Date', 'Source', 'Site ID', 'Daily Max 8-hour CO Concentration', 'Units', 'Daily AQI Value', 'State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name']]
filtered_df_pb = filtered_df_pb[['Date', 'Source', 'Site ID', 'Daily Mean Pb Concentration', 'Units', 'Daily AQI Value', 'State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name']]
filtered_df_no2 = filtered_df_no2[['Date', 'Source', 'Site ID', 'Daily Max 1-hour NO2 Concentration', 'Units', 'Daily AQI Value', 'State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name']]
filtered_df_pm10 = filtered_df_pm10[['Date', 'Source', 'Site ID', 'Daily Mean PM10 Concentration', 'Units', 'Daily AQI Value', 'State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name']]
filtered_df_pm25 = filtered_df_pm25[['Date', 'Source', 'Site ID', 'Daily Mean PM2.5 Concentration', 'Units', 'Daily AQI Value', 'State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name']]
filtered_df_o3 = filtered_df_o3[['Date', 'Source', 'Site ID', 'Daily Max 8-hour Ozone Concentration', 'Units', 'Daily AQI Value', 'State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name']]
filtered_df_so2 = filtered_df_so2[['Date', 'Source', 'Site ID', 'Daily Max 1-hour SO2 Concentration', 'Units', 'Daily AQI Value', 'State Code', 'County Code', 'FIPS', 'State Name', 'County Name', 'RUCC', 'State Name Code', 'County Full Name']]

In [112]:
filtered_df_so2[['State Name', 'County Name']]

Unnamed: 0,State Name,County Name
0,California,Humboldt
1,California,Humboldt
2,California,Humboldt
3,California,Humboldt
4,California,Humboldt
...,...,...
3089,Texas,Dallas
3090,Texas,Dallas
3091,Texas,Dallas
3092,Texas,Dallas


In [93]:
filtered_df_so2['Date'] = pd.to_datetime(filtered_df_so2['Date'], format='%m/%d/%Y')

# Filter for the specific year if needed (e.g., 2020)
filtered_df_so2_2020 = filtered_df_so2[filtered_df_so2['Date'].dt.year == 2020]

# Group by State Name and County Name, then calculate the mean of Daily Max 1-hour SO2 Concentration
mean_concentration_so2 = filtered_df_so2_2020.groupby(['State Name', 'County Name', 'County Code', 'State Code', 'RUCC', 'FIPS'])['Daily Max 1-hour SO2 Concentration'].mean().reset_index()

# Rename columns for clarity
mean_concentration_so2.rename(columns={'Daily Max 1-hour SO2 Concentration': 'Mean Daily Max 1-hour SO2 Concentration'}, inplace=True)

#merged_df_so2 = pd.merge(filtered_df_so2, mean_concentration_so2, on=['State Name', 'County Name'], suffixes=('', '_Mean'))

#final_df_so2 = merged_df_so2.drop_duplicates(subset=['State Name', 'County Name'])
# Display the result
mean_concentration_so2

Unnamed: 0,State Name,County Name,County Code,State Code,RUCC,FIPS,Mean Daily Max 1-hour SO2 Concentration
0,California,Humboldt,23,6,5.0,6023,0.397935
1,California,Imperial,25,6,3.0,6025,1.496927
2,California,Los Angeles,37,6,1.0,6037,0.757349
3,New York,Essex,31,36,6.0,36031,0.254622
4,New York,Hamilton,41,36,8.0,36041,0.093239
5,Pennsylvania,Blair,13,42,3.0,42013,0.539548
6,Pennsylvania,Indiana,63,42,4.0,42063,2.469101
7,Texas,Dallas,113,48,1.0,48113,0.82242


In [96]:
final_df_so2 = pd.merge(mean_concentration_so2, population_df, on=['State Code', 'County Code', 'State Name', 'County Name', 'RUCC', 'FIPS'])
final_df_so2

Unnamed: 0,State Name,County Name,County Code,State Code,RUCC,FIPS,Mean Daily Max 1-hour SO2 Concentration,State Name Code,County Full Name,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE2020,POPESTIMATE2021,POPESTIMATE2022,POPESTIMATE2023
0,California,Humboldt,23,6,5.0,6023,0.397935,CA,Humboldt County,136502,135839,136262,134897,134944,133985
1,California,Imperial,25,6,3.0,6025,1.496927,CA,Imperial County,181062,180439,179612,179138,178940,179057
2,California,Los Angeles,37,6,1.0,6037,0.757349,CA,Los Angeles County,10061533,10011602,9992813,9809462,9719765,9663345
3,New York,Essex,31,36,6.0,36031,0.254622,NY,Essex County,37309,36987,37275,37233,36763,36775
4,New York,Hamilton,41,36,8.0,36041,0.093239,NY,Hamilton County,4453,4438,5079,5122,5121,5082
5,Pennsylvania,Blair,13,42,3.0,42013,0.539548,PA,Blair County,122567,121942,122666,122150,120734,120273
6,Pennsylvania,Indiana,63,42,4.0,42063,2.469101,PA,Indiana County,84578,84022,83151,83201,83064,83094
7,Texas,Dallas,113,48,1.0,48113,0.82242,TX,Dallas County,2629764,2635603,2610112,2588235,2601993,2606358


In [108]:
# df_filtered_pb = filtered_df_pb.melt(id_vars=['State', 'County', 'Year', 'RUCC', 'Max AQI'], 
#                     value_vars=['POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE2020', 
#                                 'POPESTIMATE2021', 'POPESTIMATE2022', 'POPESTIMATE2023'],
#                     var_name='Population_Year', value_name='Population')
 
# df_melted['Population_Year'] = df_melted['Population_Year'].str.extract('(\d{4})').astype(int)
 
# check if it matches 
# df_filtered = df_melted[df_melted['Year'] == df_melted['Population_Year']]
 
# Group the data by RUCC and Year, then calculate the average Max AQI and average Population for each group
# df_grouped_avg = df_filtered.groupby(['RUCC', 'Year']).agg({
#     'Max AQI': 'mean',
#     'Population': 'mean'
# }).reset_index()
 
# Calculate correlation between average Max AQI and population for each RUCC
correlation_results_filtered_so2= {}
for state_code, county_code in final_df_so2[['State Code', 'County Code']].drop_duplicates().values:
    # Filter the DataFrame for each state-county pair
    print(state_code, county_code)
    df_filtered_so2 = final_df_so2[(final_df_so2['State Code'] == state_code) & (final_df_so2['County Code'] == county_code)]
    #print(df_filtered_so2)
    
    # Calculate the correlation
    print(df_filtered_so2['POPESTIMATE2020'][0])
    correlation = df_filtered_so2['POPESTIMATE2020'].corr(df_filtered_so2['Mean Daily Max 1-hour SO2 Concentration'])
    print(correlation)
    # Store the correlation in the dictionary
    correlation_results_filtered_so2[(state_code, county_code)] = correlation
    break

#correlation = final_df_so2['POPESTIMATE2020'].corr(final_df_so2['Mean Daily Max 1-hour SO2 Concentration'])

# Display the result
#print("Correlation between POPESTIMATE2020 and Mean Daily Max 1-hour SO2 Concentration:", correlation)

correlation_filtered_df = pd.DataFrame.from_dict(correlation_results_filtered_so2, orient='index', columns=['Correlation'])
 
#print(correlation_filtered_df)

06 023
136262


AttributeError: 'numpy.int64' object has no attribute 'corr'