In [None]:
# The purpose of this program is to combine any rows that have the same barcode (SHEETBAR). While filtering by the categorical river depth (CALCZCD) removed many duplicate barcodes that consisted of samples at the same site with varying CALCZCD values, there are still several samples that have the same barcode.

# It is important that our data sets consists only of unqiue barcodes because it removes issues when interpolating the data and predicting our missing continuous variable values. For example, if two samples have the same barcode, then their latitude and longitude points are going to be equal. Say one of these two records has a missing TP value. When the interpolation algorithm tries to predict the missing TP value, it will search for the closest samples and create a new TP value as a linear combination of those closets points. Since we have two identcal lat and long points, the distance will be 0 and thus, by definition of our weights in the linear combination, we will be diving by 0.

In [124]:
# Import libraries
import pandas as pd
from geopy import distance
pd.set_option('display.max_columns', None)
import numpy as np

In [125]:
# The data at this point will have already been filtered by its QF code and by its CALCZCD
# The data set should consist of only surface level points 
data = pd.read_csv("../pool data/water_data_filtered.csv")


In [135]:
# Filter out the continous variables that we want
# Wont need this step since the columns were filtered earlier
data = data[['SHEETBAR','TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI', 'LONGITUDE', 'LATITUDE', 'DATE', 'FLDNUM', 'LOCATCD', 'STRATUM']]

In [136]:
# Determine which columns are duplicates and count how many duplicates there are
duplicates = data.groupby(['SHEETBAR']).size()

In [137]:
# Reset index of duplicates and rename the column to "count" 
duplicates = pd.DataFrame(duplicates, columns = ['count']).reset_index()
duplicates = duplicates[duplicates['count'] > 1]

In [138]:
# Create empty dataframe that will store the new collapsed data set
collapsed_data = pd.DataFrame()

In [139]:
# Since we only need to examine the data that has duplicate barcodes, we can set aside the unique barcodes
collapsed_data = data[-pd.Series(data["SHEETBAR"]).isin(duplicates["SHEETBAR"])]

# Store the duplicated data in its own data set called data_dups
data_dups = data[pd.Series(data["SHEETBAR"]).isin(duplicates["SHEETBAR"])]


In [140]:
# We know based on inspection that there may be negative TP values. We are going to remove these samples
data_dups = data_dups.drop(data_dups.index[data_dups['TP'] < 0])

In [141]:
# Find the average of each variable by barcode
# If there is only na values, na will be returned
# For any number of values per variable, the average will be found
temp = data_dups.groupby(['SHEETBAR'], as_index = False).mean()

In [142]:
# Create a separate dataframe that stores the date of each barcode 
# Drops duplicates so that we will only have one date per barcode
identifiers = pd.DataFrame(data_dups, columns = ['SHEETBAR','DATE', 'STRATUM', 'LOCATCD']).drop_duplicates()

In [143]:
# Adds the date to the collapsed data by the sheetbar
temp = temp.merge(identifiers, on = ['SHEETBAR'])

In [144]:
# Add the combined duplicate samples to the single row samples
collapsed_data.append(temp)

Unnamed: 0,SHEETBAR,TN,TP,TEMP,DO,TURB,COND,VEL,SS,WDP,CHLcal,SECCHI,LONGITUDE,LATITUDE,DATE,FLDNUM,LOCATCD,STRATUM,STRATUM_x,STRATUM_y
0,41004753,2.897000,0.121000,0.1,11.600000,4.0,490.000000,,2.900000,7.82,,119.0,-92.226798,44.447461,12/30/1996,1,M771.2P,,,
1,41004755,3.054000,0.112000,0.1,11.900000,4.0,484.000000,,2.400000,5.50,,187.0,-92.100458,44.412100,12/30/1996,1,M764.3A,,,
2,41004757,1.282000,0.066000,0.1,11.000000,4.0,175.000000,,2.400000,0.71,,89.0,-92.084185,44.410152,12/30/1996,1,CH00.1M,,,
3,41004761,3.266000,0.115000,0.1,11.700000,3.0,478.000000,,2.200000,5.33,,123.0,-92.134004,44.423271,12/30/1996,1,M766.0I,,,
4,41004763,2.908000,0.105000,0.1,10.600000,3.0,446.000000,0.07,1.800000,1.10,,119.0,-91.932921,44.326994,12/30/1996,1,M753.1X,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59149,47001268,1.430000,0.094000,23.4,11.000000,16.0,280.000000,,28.000000,,74.572080,,-91.049424,43.000181,06/16/2006,7,WS05.0Y,,,
59150,47001269,8.620000,0.193000,17.3,8.800000,28.0,727.000000,,24.300000,,3.917360,,-91.197341,43.093663,06/16/2006,7,YL01.5Y,,,
0,46015274,1.599000,0.664000,24.5,6.200000,140.0,461.000000,0.01,164.900000,0.57,113.298125,13.0,-90.103645,40.253465,08/07/2009,6,0962021,,3.0,3.0
1,47000506,1.705333,0.116667,10.4,12.166667,11.0,353.666667,0.05,14.933333,,12.040600,45.0,-91.205630,43.370779,10/29/1999,7,M663.4E,,,


In [145]:
collapsed_data.to_csv("../pool data/cleaned_data.csv", index = False)