# Loading data

In [2]:
import pandas as pd
import numpy as np
import geopy
import geopandas
import csv, json
from geojson import Feature, FeatureCollection, Point

data = pd.read_csv('usgs_phos.csv', low_memory=False)

In [3]:
# just a toy to sample the data
# data_little = pd.read_csv('usgs_phos.csv', nrows=12)
# data_little

In [4]:
# exploratory data analysis
# data.columns
# data.shape[0] #1758481

In [5]:
# df['column2'] = np.where((df['column2'] == 'Null') | (df['column2'] == 0), df['column1'], df['column2'])

# what's going on in minnesota

In [6]:
ohio = data[data['OrganizationIdentifier'] == 'MNPCA']

In [7]:
ohio.to_csv('mnpca.csv')

# Shaping Data

In [8]:
# creating a new dataframe that is condensed and only containing the fields we need, starting with 1758481 rows

data_condensed = data[['OrganizationIdentifier','ActivityMediaSubdivisionName','ActivityStartDate','ResultMeasureValue','ResultMeasure/MeasureUnitCode','ActivityLocation/LatitudeMeasure','ActivityLocation/LongitudeMeasure','HydrologicEvent']].copy()
len(data_condensed)

1758481

In [9]:
data_condensed['ActivityMediaSubdivisionName'].value_counts()

ActivityMediaSubdivisionName
Surface Water                          1015046
Stormwater                                 728
Groundwater                                672
Interstitial                                55
Drinking Water                              41
Effluent                                    37
Hyporheic zone                              15
Leachate                                    12
Wet Fall Material                            8
Snowmelt                                     8
Surface Soil/Sediment                        5
Industrial Effluent                          4
Interstitial Water                           3
Ambient Air                                  1
Wastewater Treatment Plant Influent          1
Name: count, dtype: int64

In [10]:
# if there is an NaN in any field besides HydrologicEvent or ActivityMediaSubdivisionName, drop the row. ending with 630448 rows

data_condensed.dropna(subset=['OrganizationIdentifier','ActivityStartDate','ResultMeasureValue','ResultMeasure/MeasureUnitCode','ActivityLocation/LatitudeMeasure','ActivityLocation/LongitudeMeasure'],inplace=True)
data_condensed.shape #1304819
data_condensed

Unnamed: 0,OrganizationIdentifier,ActivityMediaSubdivisionName,ActivityStartDate,ResultMeasureValue,ResultMeasure/MeasureUnitCode,ActivityLocation/LatitudeMeasure,ActivityLocation/LongitudeMeasure,HydrologicEvent
375300,21VASWCB,Surface Water,2010-10-28,.01,mg/L,38.870300,-78.684000,
375301,21KAN001_WQX,,2010-05-04,0.22000,mg/L,38.901090,-97.222520,
375302,MNPCA,Surface Water,2010-09-01,0.061,mg/L,44.926900,-93.362400,
375303,21VASWCB,Surface Water,2010-04-28,.02,mg/L,37.257500,-78.486700,
375304,21AWIC,,2010-09-09,.028,mg/L,33.575000,-88.083400,
...,...,...,...,...,...,...,...,...
1758476,21VASWCB,Surface Water,2024-02-12,.03,mg/L,36.899400,-81.981700,
1758477,21VASWCB,Surface Water,2024-01-10,.03,mg/L,36.639200,-82.491400,
1758478,21VASWCB,Surface Water,2024-01-22,.03,mg/L,36.589400,-79.420300,
1758479,ESTO_WQX,,2024-01-30,0.04100000,mg/L,36.830115,-94.627585,


In [11]:
# convert the ResultMeasureValue field to numeric, coerce errors so that the values that look like "<0.05" show up as 0

data_condensed['ResultMeasureValue'] = pd.to_numeric(data_condensed['ResultMeasureValue'], errors='coerce')
data_condensed['ResultMeasureValue'] = data_condensed['ResultMeasureValue'].fillna(0)
data_condensed

Unnamed: 0,OrganizationIdentifier,ActivityMediaSubdivisionName,ActivityStartDate,ResultMeasureValue,ResultMeasure/MeasureUnitCode,ActivityLocation/LatitudeMeasure,ActivityLocation/LongitudeMeasure,HydrologicEvent
375300,21VASWCB,Surface Water,2010-10-28,0.010,mg/L,38.870300,-78.684000,
375301,21KAN001_WQX,,2010-05-04,0.220,mg/L,38.901090,-97.222520,
375302,MNPCA,Surface Water,2010-09-01,0.061,mg/L,44.926900,-93.362400,
375303,21VASWCB,Surface Water,2010-04-28,0.020,mg/L,37.257500,-78.486700,
375304,21AWIC,,2010-09-09,0.028,mg/L,33.575000,-88.083400,
...,...,...,...,...,...,...,...,...
1758476,21VASWCB,Surface Water,2024-02-12,0.030,mg/L,36.899400,-81.981700,
1758477,21VASWCB,Surface Water,2024-01-10,0.030,mg/L,36.639200,-82.491400,
1758478,21VASWCB,Surface Water,2024-01-22,0.030,mg/L,36.589400,-79.420300,
1758479,ESTO_WQX,,2024-01-30,0.041,mg/L,36.830115,-94.627585,


# Mitigating Unit Measure Issues

In [12]:
# dealing with differing units

# mg (milligrams) per liter = mg per 1,000cc and 1000 cc of water weighs 1 kg. Therefore, 1 mg/L is the same as 1 mg/kg if you are talking about water. (ResearchGate, https://www.researchgate.net/post/Converting_mg_l_to_mg_kg#:~:text=mg%20(milligrams)%20per%20liter%20%3D,you%20are%20talking%20about%20water.)

# conversion for ppb and ppm to mg/L chrome-extension://efaidnbmnnnibpcajpcglclefindmkaj/https://cfpub.epa.gov/ncer_abstracts/index.cfm/fuseaction/display.files/fileid/14285

# convert ug/L to mg/L --> divide by 1000
# convert ppb to mg/L --> divide by 1000
# convert ppm to mg/L --> just change units
# convert mg/kg to mg/L --> divide by 1000

data_condensed['ResultMeasure/MeasureUnitCode'].unique()
data_condensed['ResultMeasure/MeasureUnitCode'].value_counts()

ResultMeasure/MeasureUnitCode
mg/L         1162932
ug/L          128348
ppm             8611
ppb             3979
mg/kg            634
ug               158
%                144
count              6
mL/L               4
MPN/100mL          2
#/100mL            1
Name: count, dtype: int64

In [13]:
### --- investigation 'ug' only --- ###

# 0.325 to about 1.6, surface water for the 'ug'
# looks like this UG code is in OrganizationIdentifier called "21VASWCB", which is the VIRGINIA DEPARTMENT OF ENVIRONMENTAL QUALITY. confirmed via value_counts that this is the only org using 'ug'
ug = data_condensed.loc[data_condensed['ResultMeasure/MeasureUnitCode'] == 'ug']
ug['OrganizationIdentifier'].value_counts() #158
ug['ResultMeasureValue'].max() #1.62
ug['ResultMeasureValue'].min() #0.325

0.325

In [14]:
# how comparable are ug and ug/L? are they of the same scale? we can treat these differently
ugL = data_condensed.loc[data_condensed['ResultMeasure/MeasureUnitCode'] == 'ug/L']
# ugL['ResultMeasureValue'].max() #2971.9
# ugL['ResultMeasureValue'].min() #-0.948print(ugL.iloc[2])

In [15]:
# looking at one specific row, 2, current value is 1.234
print(ugL.iloc[2])

OrganizationIdentifier                 1VTDECWQ
ActivityMediaSubdivisionName                NaN
ActivityStartDate                    2010-08-19
ResultMeasureValue                         13.4
ResultMeasure/MeasureUnitCode              ug/L
ActivityLocation/LatitudeMeasure        44.5819
ActivityLocation/LongitudeMeasure      -73.2811
HydrologicEvent                             NaN
Name: 375362, dtype: object


In [16]:
# convertsion, divide all cells by 1000 if ResultMeasureUnit is indicated above else ResultMeasureValue

data_condensed['ResultMeasureValue'] = data_condensed.apply(lambda x: x['ResultMeasureValue']/1000 if x['ResultMeasure/MeasureUnitCode']=='ug/L' else x['ResultMeasureValue'], axis=1)

data_condensed['ResultMeasureValue'] = data_condensed.apply(lambda x: x['ResultMeasureValue']/1000 if x['ResultMeasure/MeasureUnitCode']=='ppb' else x['ResultMeasureValue'], axis=1)

data_condensed['ResultMeasureValue'] = data_condensed.apply(lambda x: x['ResultMeasureValue']/1000 if x['ResultMeasure/MeasureUnitCode']=='mg/kg' else x['ResultMeasureValue'], axis=1)

In [17]:
# confirmed! value was divided by 1000

ugL = data_condensed.loc[data_condensed['ResultMeasure/MeasureUnitCode'] == 'ug/L']
print(ugL.iloc[2])

OrganizationIdentifier                 1VTDECWQ
ActivityMediaSubdivisionName                NaN
ActivityStartDate                    2010-08-19
ResultMeasureValue                       0.0134
ResultMeasure/MeasureUnitCode              ug/L
ActivityLocation/LatitudeMeasure        44.5819
ActivityLocation/LongitudeMeasure      -73.2811
HydrologicEvent                             NaN
Name: 375362, dtype: object


In [18]:
# now fixing units for the values that were changed, from ug/L to mg/L & mg/kg to mg/L

data_condensed['ResultMeasure/MeasureUnitCode'].replace('ug/L','mg/L',inplace=True)
data_condensed['ResultMeasure/MeasureUnitCode'].replace('mg/kg','mg/L',inplace=True)
data_condensed['ResultMeasure/MeasureUnitCode'].replace('ppb','mg/L',inplace=True)
data_condensed['ResultMeasure/MeasureUnitCode'].replace('ppm','mg/L',inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_condensed['ResultMeasure/MeasureUnitCode'].replace('ug/L','mg/L',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_condensed['ResultMeasure/MeasureUnitCode'].replace('mg/kg','mg/L',inplace=True)
The behavior will change in pandas 3.0. This inplace method wil

In [19]:
# confirming that worked

data_condensed['ResultMeasure/MeasureUnitCode'].value_counts()

ResultMeasure/MeasureUnitCode
mg/L         1304504
ug               158
%                144
count              6
mL/L               4
MPN/100mL          2
#/100mL            1
Name: count, dtype: int64

In [20]:
#TODO: what do we want to do with the "ug"? drop them? think that they're a typo in VA?

In [21]:
# include only rows with mg/L; the remaining rows represent an insignificant number of rows
data_condensed = data_condensed[data_condensed['ResultMeasure/MeasureUnitCode'] == 'mg/L']

In [22]:
data_condensed['ResultMeasure/MeasureUnitCode'].value_counts() #1304504

ResultMeasure/MeasureUnitCode
mg/L    1304504
Name: count, dtype: int64

# Using GeoPandas to spatially join larger geographical mappings to lat-long usgs data

In [23]:
#creating a further condensed data set with just the lat and lon points
df2 = pd.DataFrame()
df2['Lat'] = data_condensed['ActivityLocation/LatitudeMeasure']
df2['Lon'] = data_condensed['ActivityLocation/LongitudeMeasure']

In [24]:
# there is ratelimiting on this package according to its terms of use. about 2000 requests take 15 mintues... we have 630K rows. showing what *didn't* work

# def get_zipcode(df, geolocator, lat_field, lon_field, attempt=1, max_attempts=100):
#     try:
#         location = geolocator.reverse((df[lat_field], df[lon_field]), timeout=None)
#         return location.raw['address']['postcode']
#     except KeyError:
#         pass
#     except GeocoderTimedOut:
#         if attempt <= max_attempts:
#             return get_zipcode (df, attempt=attempt+1)
#         raise
#
# geolocator = geopy.Nominatim(user_agent='cara-umsi')
# zipcodes = df2.apply(get_zipcode, axis=1, geolocator=geolocator, lat_field='Lat', lon_field='Lon')

In [25]:
# note that in this csv, i renamed the fields to 'latitutde', 'longitude', and 'position', copying the index. later, to make the geopandas work, i had to delete the field names
df2.to_csv('latlong.csv', header=False)

In [26]:
# reading the csv, transforming it to geojason (remember i deleted column labels, probably could've just started to read at row 1...
features = []
with open('latlong.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for position, latitude, longitude in reader:
        latitude, longitude = map(float, (latitude, longitude))
        features.append(
            Feature(
                geometry = Point((longitude, latitude)),
                properties = {
                    'position': position,
                }
            )
        )

collection = FeatureCollection(features)
with open("GeoObs.json", "w") as f:
    f.write('%s' % collection)

# Sjoining for ZIP codes

In [27]:
# this chunk takes the geojson file generated above and sjoins it (spatial joins of two geojson files, finding intersections) with zipcode data
# https://stackoverflow.com/questions/48586647/python-script-to-convert-csv-to-geojson
# get zip code data from ArcGIS https://www.arcgis.com/home/item.html?id=8d2012a2016e484dafaac0451f9aea24
# first need to extract the .lpk zip codes file locally, then run the below

points = geopandas.read_file('GeoObs.json')
zipcodes = geopandas.read_file("zip_poly.gdb")
zip_points = points.sjoin(zipcodes, how='left', )

KeyboardInterrupt: 

In [None]:
zip_points

In [None]:
zip_points.to_csv('zip_points.csv')

# SJoining for counties

https://hub.arcgis.com/datasets/esri::usa-counties/about

In [28]:
# points = geopandas.read_file('GeoObs.json')
# counties = geopandas.read_file("USA_counties_gdb.gdb")
# county_points = points.sjoin(counties, how='left', )

points = geopandas.read_file('GeoObs.json')
counties = geopandas.read_file("USA_counties_gdb.gdb").to_crs(4326)
county_points = points.sjoin(counties, how='left', )

In [29]:
county_points

Unnamed: 0,position,geometry,index_right,NAME,STATE_NAME,STATE_FIPS,CNTY_FIPS,FIPS,POPULATION,POP_SQMI,...,CROP_ACR12,AVE_SALE12,SQMI,NO_FARMS17,AVE_SIZE17,CROP_ACR17,AVE_SALE17,Shape_Leng,SHAPE_Length,SHAPE_Area
0,375300,POINT (-78.68400 38.87030),2900.0,Shenandoah County,Virginia,51,171,51171,44315.0,86.6,...,52726.0,131394.0,511.83,965.0,135.0,61237.0,148006.0,2.090674,263387.947604,0.137575
1,375301,POINT (-97.22252 38.90109),907.0,Dickinson County,Kansas,20,041,20041,19258.0,22.6,...,335353.0,155342.0,851.95,919.0,565.0,364449.0,162723.0,2.083000,266212.614050,0.229020
2,375302,POINT (-93.36240 44.92690),1339.0,Hennepin County,Minnesota,27,053,27053,1270988.0,2094.6,...,54284.0,102821.0,606.79,467.0,98.0,36623.0,125418.0,2.043827,269395.417723,0.179370
3,375303,POINT (-78.48670 37.25750),2889.0,Prince Edward County,Virginia,51,147,51147,23610.0,66.7,...,19680.0,39993.0,353.84,341.0,204.0,20082.0,69768.0,1.705060,213470.965578,0.093043
4,375304,POINT (-88.08340 33.57500),37.0,Lamar County,Alabama,01,075,01075,14722.0,24.3,...,13868.0,21337.0,605.47,269.0,274.0,12006.0,38507.0,1.640622,206364.489224,0.152639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304499,1758476,POINT (-81.98170 36.89940),2898.0,Russell County,Virginia,51,167,51167,31537.0,66.2,...,34004.0,32325.0,476.44,918.0,185.0,32705.0,25277.0,2.020264,250196.412852,0.124811
1304500,1758477,POINT (-82.49140 36.63920),2899.0,Scott County,Virginia,51,169,51169,23132.0,42.9,...,33436.0,10892.0,538.96,1138.0,110.0,30463.0,13672.0,1.810793,216507.753914,0.140791
1304501,1758478,POINT (-79.42030 36.58940),2921.0,Danville city,Virginia,51,590,51590,41339.0,946.0,...,-99.0,-99.0,43.70,0.0,0.0,0.0,0.0,0.742117,90094.456012,0.011397
1304502,1758479,POINT (-94.62758 36.83011),2187.0,Ottawa County,Oklahoma,40,115,40115,31406.0,64.8,...,87881.0,115291.0,484.61,947.0,217.0,92775.0,147558.0,1.502228,185329.803987,0.126790


# Merging DFs Together

In [30]:
# first change dtype from object to int for position
# zip_points['position']=zip_points['position'].astype(int)
county_points['position']=county_points['position'].astype(int)

In [31]:
# traditional inner join to merge the 2 data frames
# data_usgs = pd.merge(data_condensed,zip_points,left_index=True, right_on='position')
data_usgs = pd.merge(data_condensed,county_points,left_index=True, right_on='position')

In [None]:
# voila, chef's kiss
data_usgs #1304504

In [None]:
# data_usgs.to_csv('data_usgs.csv')

# Separating Out by State (Great Lakes Region Only)

In [32]:
great_lakes_states = ['MI','OH','PA','IL','IN','MN','NY','WI']
great_lakes_state_names = ['Michigan','Ohio','Pennsylvania','Illinois','Indiana','Minnesota','New York', 'Wisconsin']

In [33]:
# data_usgs_GL = data_usgs[data_usgs.STATE.isin(great_lakes_states)]
data_usgs_GL = data_usgs[data_usgs.STATE_NAME.isin(great_lakes_state_names)]


In [None]:
len(data_usgs_GL) #455566

# Analysis on dataset

In [None]:
# total observations in GL region (for ZIPs) = 456708
# total observations in GL region (for counties) = 455566

# data_usgs_GL.to_csv('data_usgs_GL2.csv')

In [None]:
# total zip codes is 3968
# total counties is 450

# data_usgs_GL['ZIP_CODE'].value_counts()
data_usgs_GL['NAME'].value_counts() #450

In [None]:
stats_by_state = data_usgs_GL.groupby('STATE_NAME')['ResultMeasureValue'].agg(['count','mean', 'median', 'std', 'var', 'min', 'max'])

In [None]:
stats_by_state
# stats_by_state.to_csv('stats_by_state.csv')

# Aggregating data

In [35]:
data_usgs_GL.columns

Index(['OrganizationIdentifier', 'ActivityMediaSubdivisionName',
       'ActivityStartDate', 'ResultMeasureValue',
       'ResultMeasure/MeasureUnitCode', 'ActivityLocation/LatitudeMeasure',
       'ActivityLocation/LongitudeMeasure', 'HydrologicEvent', 'position',
       'geometry', 'index_right', 'NAME', 'STATE_NAME', 'STATE_FIPS',
       'CNTY_FIPS', 'FIPS', 'POPULATION', 'POP_SQMI', 'POP2010', 'POP10_SQMI',
       'WHITE', 'BLACK', 'AMERI_ES', 'ASIAN', 'HAWN_PI', 'HISPANIC', 'OTHER',
       'MULT_RACE', 'MALES', 'FEMALES', 'AGE_UNDER5', 'AGE_5_9', 'AGE_10_14',
       'AGE_15_19', 'AGE_20_24', 'AGE_25_34', 'AGE_35_44', 'AGE_45_54',
       'AGE_55_64', 'AGE_65_74', 'AGE_75_84', 'AGE_85_UP', 'MED_AGE',
       'MED_AGE_M', 'MED_AGE_F', 'HOUSEHOLDS', 'AVE_HH_SZ', 'HSEHLD_1_M',
       'HSEHLD_1_F', 'MARHH_CHD', 'MARHH_NO_C', 'MHH_CHILD', 'FHH_CHILD',
       'FAMILIES', 'AVE_FAM_SZ', 'HSE_UNITS', 'VACANT', 'OWNER_OCC',
       'RENTER_OCC', 'NO_FARMS12', 'AVE_SIZE12', 'CROP_ACR12', 'AVE_SA

In [45]:
# rename columns, pull out desired, separate out the imputing data frame from the descriptive dataframe... intent to merge back together later on
data_usgs_GL.rename(columns={'NAME': 'CountyName'}, inplace=True)
data_usgs_GL['CountyState'] = data_usgs_GL['CountyName']+'-'+ data_usgs_GL['STATE_NAME']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_usgs_GL.rename(columns={'NAME': 'CountyName'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_usgs_GL['CountyState'] = data_usgs_GL['CountyName']+'-'+ data_usgs_GL['STATE_NAME']


In [46]:
data_usgs_GL

Unnamed: 0,OrganizationIdentifier,ActivityMediaSubdivisionName,ActivityStartDate,ResultMeasureValue,ResultMeasure/MeasureUnitCode,ActivityLocation/LatitudeMeasure,ActivityLocation/LongitudeMeasure,HydrologicEvent,position,geometry,...,AVE_SALE12,SQMI,NO_FARMS17,AVE_SIZE17,CROP_ACR17,AVE_SALE17,Shape_Leng,SHAPE_Length,SHAPE_Area,CountyState
2,MNPCA,Surface Water,2010-09-01,0.0610,mg/L,44.926900,-93.362400,,375302,POINT (-93.36240 44.92690),...,102821.0,606.79,467.0,98.0,36623.0,125418.0,2.043827,269395.417723,0.179370,Hennepin County-Minnesota
5,MNPCA,Surface Water,2010-06-24,0.4540,mg/L,43.513917,-93.268611,,375305,POINT (-93.26861 43.51392),...,370784.0,722.42,1076.0,366.0,374911.0,338289.0,1.895490,240696.187148,0.208823,Freeborn County-Minnesota
8,MNPCA,Surface Water,2010-08-16,0.0140,mg/L,46.931197,-94.772603,,375308,POINT (-94.77260 46.93120),...,113475.0,999.49,384.0,246.0,53391.0,115220.0,2.265708,315677.698853,0.306781,Hubbard County-Minnesota
14,MNPCA,Surface Water,2010-09-07,0.3670,mg/L,43.590170,-94.681010,,375315,POINT (-94.68101 43.59017),...,690708.0,729.58,911.0,493.0,434319.0,697611.0,1.909775,242215.618493,0.210894,Martin County-Minnesota
17,MNPCA,Surface Water,2010-07-27,0.0190,mg/L,45.705203,-94.747231,,375318,POINT (-94.74723 45.70520),...,230933.0,1390.02,2951.0,221.0,515928.0,253466.0,3.195917,401039.073367,0.414828,Stearns County-Minnesota
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304457,WIDNR_WQX,,2024-01-09,0.0213,mg/L,45.371741,-91.208534,,1758434,POINT (-91.20853 45.37174),...,121366.0,930.86,501.0,272.0,70661.0,107354.0,2.417757,302007.435056,0.277423,Rusk County-Wisconsin
1304459,HO_CHUNK_WQX,,2024-01-04,0.0340,mg/L,44.037100,-89.722700,,1758436,POINT (-89.72270 44.03710),...,337599.0,687.48,308.0,381.0,84544.0,425646.0,2.125677,286964.278008,0.199696,Adams County-Wisconsin
1304467,WIDNR_WQX,,2024-01-12,0.0409,mg/L,43.650870,-89.612390,,1758444,POINT (-89.61239 43.65087),...,337599.0,687.48,308.0,381.0,84544.0,425646.0,2.125677,286964.278008,0.199696,Adams County-Wisconsin
1304470,WIDNR_WQX,,2024-01-29,0.0226,mg/L,43.953910,-88.952560,,1758447,POINT (-88.95256 43.95391),...,168614.0,380.13,502.0,252.0,99674.0,180513.0,1.497150,197190.068599,0.110108,Green Lake County-Wisconsin


In [48]:



desired_columns = ['ActivityStartDate', 'ResultMeasureValue', 'CountyState', 'ActivityLocation/LatitudeMeasure','ActivityLocation/LongitudeMeasure']
modeling = data_usgs_GL.loc[:,desired_columns]

# convert to year
modeling['ActivityStartDate'] = pd.to_datetime(modeling['ActivityStartDate'])
modeling['ActivityYear'] = modeling['ActivityStartDate'].dt.year

#drop the full date
modeling.drop('ActivityStartDate', axis=1, inplace=True)

# finding mean by county and year, not resetting index because want to sew back together after imputation
modeling = modeling.groupby(['CountyState', 'ActivityYear']).agg({'ResultMeasureValue': 'mean', 'ActivityLocation/LatitudeMeasure':'mean','ActivityLocation/LongitudeMeasure':'mean'})
# modeling = modeling.groupby(['CountyName', 'Year']).agg({'ResultMeasureValue': 'mean'}).reset_index()
modeling

Unnamed: 0_level_0,Unnamed: 1_level_0,ResultMeasureValue,ActivityLocation/LatitudeMeasure,ActivityLocation/LongitudeMeasure
CountyState,ActivityYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adams County-Illinois,2010,0.212773,40.055899,-91.371083
Adams County-Illinois,2011,0.083278,40.007116,-91.242255
Adams County-Illinois,2012,0.079962,40.069261,-91.365847
Adams County-Illinois,2013,0.088600,39.970719,-91.140114
Adams County-Illinois,2014,0.184000,40.023338,-91.334644
...,...,...,...,...
York County-Pennsylvania,2018,0.109343,40.035358,-76.678542
York County-Pennsylvania,2019,0.091269,40.014245,-76.630262
York County-Pennsylvania,2020,0.124969,40.034533,-76.638578
York County-Pennsylvania,2021,0.081930,40.033449,-76.640127


In [43]:
modeling

Unnamed: 0_level_0,Unnamed: 1_level_0,ResultMeasureValue,ActivityLocation/LatitudeMeasure,ActivityLocation/LongitudeMeasure
CountyName,ActivityYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adams County,2010,0.217240,40.136769,-90.594442
Adams County,2011,0.070196,39.954160,-87.093354
Adams County,2012,0.167473,39.956151,-82.143727
Adams County,2013,0.088600,39.970719,-91.140114
Adams County,2014,0.169891,40.102348,-90.394133
...,...,...,...,...
York County,2018,0.109343,40.035358,-76.678542
York County,2019,0.091269,40.014245,-76.630262
York County,2020,0.124969,40.034533,-76.638578
York County,2021,0.081930,40.033449,-76.640127


In [40]:
modeling_pivot = modeling.pivot_table(index='CountyName', columns='ActivityYear', values='ResultMeasureValue', aggfunc='mean')
modeling_pivot


Unnamed: 0_level_0,Unnamed: 1_level_0,ResultMeasureValue
CountyName,ActivityYear,Unnamed: 2_level_1
Adams County,2010,0.217240
Adams County,2011,0.070196
Adams County,2012,0.167473
Adams County,2013,0.088600
Adams County,2014,0.169891
...,...,...
York County,2018,0.109343
York County,2019,0.091269
York County,2020,0.124969
York County,2021,0.081930


In [None]:
modeling.isna().sum()


# Imputation using KNN

In [53]:
import numpy as np

from sklearn.impute import KNNImputer
X = modeling
imputer = KNNImputer(n_neighbors=2)
imputer.fit_transform(X)

array([[ 2.12772727e-01,  4.00558991e+01, -9.13710827e+01],
       [ 8.32777778e-02,  4.00071160e+01, -9.12422545e+01],
       [ 7.99615385e-02,  4.00692608e+01, -9.13658469e+01],
       ...,
       [ 1.24968553e-01,  4.00345333e+01, -7.66385780e+01],
       [ 8.19303797e-02,  4.00334494e+01, -7.66401266e+01],
       [ 9.20442478e-02,  4.00492301e+01, -7.66582195e+01]])

In [54]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,ResultMeasureValue,ActivityLocation/LatitudeMeasure,ActivityLocation/LongitudeMeasure
CountyState,ActivityYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adams County-Illinois,2010,0.212773,40.055899,-91.371083
Adams County-Illinois,2011,0.083278,40.007116,-91.242255
Adams County-Illinois,2012,0.079962,40.069261,-91.365847
Adams County-Illinois,2013,0.088600,39.970719,-91.140114
Adams County-Illinois,2014,0.184000,40.023338,-91.334644
...,...,...,...,...
York County-Pennsylvania,2018,0.109343,40.035358,-76.678542
York County-Pennsylvania,2019,0.091269,40.014245,-76.630262
York County-Pennsylvania,2020,0.124969,40.034533,-76.638578
York County-Pennsylvania,2021,0.081930,40.033449,-76.640127
