In [42]:
import pickle
import pandas as pd
from geopy import distance
pd.set_option('display.max_columns', None)
# useful functions and classes

# This class stores the latitude and longitude of a sample, and indicates 
# if this location has the desired variable we are estimating
class Location:
    def __init__(self,latitude,longitude,hasv,ID,value):
        self.ID = ID
        self.latitude = latitude
        self.longitude = longitude
        self.hasv = hasv
        self.value = value
        
    def __str__(self):
        return str(self.ID)

# Calculates the distance between 2 samples in km
def getdist(S1,S2):
    # radius of earth in km
    coords_1 = (S1.latitude, S1.longitude)
    coords_2 = (S2.latitude, S2.longitude)
    dist = distance.distance(coords_1, coords_2).km
    return dist

# filters out data if a point is missing in one of the colunns
def filterblanks(columns,data,blank):
    # if blank is true, rows with blanks in these columns
    # if blank is false, remove rows with non blanks or non zeros in these columns
    for c in columns:
        if blank:
            data = data[data[c].notnull()]
        else:
            data = data[data[c].isnull()]
    return data

# PRE: all locations in the dataframe are
# unique
def DistanceMatrix(dataframe,variable):
    # the list of location objects
    locations = []
    # the list of indexes where the the row is located in the dataframe
    #indexes = []
    for index,row in dataframe.iterrows():
        # make a location object on this row
        if pd.isnull(row[variable]):
            hasv = False
        else:
            hasv = True
        locations.append(Location(row["LATITUDE"],row["LONGITUDE"],hasv,row["LOCATCD"],row[variable]))
        #indexes.append(index)
        
    matrix = pd.DataFrame(0,index=locations,columns=locations)
    for ci,column in enumerate(locations):
        for ri,row in enumerate(locations):
            if ri>ci:
                # compute distance between column and row
                dist = getdist(row,column)
            elif ci>ri:
                dist = matrix.iloc[ci,ri]
            # put this distance in the dataframe
            else:
                continue
            matrix.iloc[ri,ci] = dist
    return matrix

def changeVar(DM,data,variable):
    locations = DM.index
    # loop through each location
    for i,loc in enumerate(locations):
        ID = loc.ID
        row = data[data["LOCATCD"]==ID]
        if pd.isnull(row[variable]):
            locations[i].hasv = False
            locations[i].value = None
        else:
            locations[i].hasv = True
            locations[i].value = row[variable]
            
    DM.index = locations
    DM.columns = locations
        
def getclosest(numclosest,distancematrix,location):
    # Make a set of the closest locations that contain variable
    closest = {}
    column = distancematrix.loc[:,location].copy()
    #print(type(distancematrix.index[0]))
    # Filter the locations that dont have the desired variable
    doesnthavev = []
    for i in range(len(column)):
        if not column.index[i].hasv:
            doesnthavev.append(column.index[i])
    column.drop(doesnthavev,inplace = True)
    #print(type(column))
    column.sort_values(inplace = True)
    # The current location wouldnt be in column because
    # it doesnt have the variable
    
    return column.iloc[0:numclosest]

# Key: Location Code
# Value: List of tuples (locatcd,distance,value)
def makeDict(data,variable,numclosest=2):
    D = DistanceMatrix(data,variable)
    # Loop through each location without a value for variable
    closestDict = {}
    for loc in D.columns:
        if not loc.hasv:
            # Get the closest locations to loc
            closest = getclosest(numclosest,D,loc)
            # The list of tuples that contain location id, the distance, and the value for variable
            tuples = []
            for i,dist in enumerate(closest):
                ID = closest.index[i].ID
                val = closest.index[i].value
                tuples.append((ID,dist,val))
            closestDict[loc.ID] = tuples
    return closestDict

def predict(tuples,numclosest = 2):
    loc2 = tuples[0]
    loc3 = tuples[1]
    d12 = loc2[1]
    val2 = loc2[2]
    d13 = loc3[1]
    val3 = loc3[2]
    
    c2 = d12/(d12+d13)
    c3 = d13/(d12+d13)
    
    predicted = c2*val2+c3*val3
    
    return predicted

# NEEDS WORK
def addpredictions(df,variables,numclosest):
    # make prediction and insert for each variable
    first = True
    for var in variables:
        if first:
            DM = DistanceMatrix(df,var)
        else:
            changeVar(DM,df,var)
            
        for loc in DM.columns:
            if not loc.hasv:
                # Get the closest locations to loc
                closest = getclosest(numclosest,DM,loc)
                # The list of tuples that contain location id, the distance, and the value for variable
                tuples = []
                for i,dist in enumerate(closest):
                    ID = closest.index[i].ID
                    val = closest.index[i].value
                    tuples.append((ID,dist,val))
                closestDict[loc.ID] = tuples
                
def run():
    DictTN = makeDict(data,"TN")
    DictTP = makeDict(data,"TP")
    #put in predicted TN
    data["PredictedTN"] = 0
    for index,row in data.iterrows():
        if pd.isnull(row["TN"]):
            data.loc[index,"PredictedTN"] = predict(DictTN[row["LOCATCD"]])
        else:
            data.loc[index,"PredictedTN"] = row["TN"]

    data["PredictedTP"] = 0
    for index,row in data.iterrows():
        if pd.isnull(row["TP"]):
            data.loc[index,"PredictedTP"] = predict(DictTP[row["LOCATCD"]])
        else:
            data.loc[index,"PredictedTP"] = row["TP"]   
        
    print(data.shape)
    print("Filtering out points with blank entries in at least one of the columns")
    cols = ['PredictedTN','PredictedTP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
    #for col in cols:
    #    print(col,data[col].isna().sum())
    qualdata_prediction = filterblanks(cols,data,True)
    print(qualdata_prediction.shape)

    
print("Success")

Success


In [14]:
# FILE PATH FOR ORIGINAL LTRM WATER DATA
ltrm_water_path =  r"C:\Users\forre\Desktop\REU\TDA\Data\ltrm_water_data_lat_long_06072021.csv"
df = pd.read_csv(ltrm_water_path, low_memory = False)
df['YEAR'] = pd.DatetimeIndex(df['DATE']).year

# Pool Names
df.loc[df['FLDNUM'] == 1, 'POOL_NAME'] = 'Pool 4'
df.loc[df['FLDNUM'] == 2, 'POOL_NAME'] = 'Pool 8'
df.loc[df['FLDNUM'] == 3, 'POOL_NAME'] = 'Pool 13'
df.loc[df['FLDNUM'] == 4, 'POOL_NAME'] = 'Pool 26'
df.loc[df['FLDNUM'] == 5, 'POOL_NAME'] = 'Open River'
df.loc[df['FLDNUM'] == 6, 'POOL_NAME'] = 'LaGrange'
df.loc[df['FLDNUM'] == 7, 'POOL_NAME'] = 'Pool 9'
# Stratum Type
df.loc[df['STRATUM'] == 1, 'STRATUM_NAME'] = 'Main Channel'
df.loc[df['STRATUM'] == 2, 'STRATUM_NAME'] = 'Side Channel'
df.loc[df['STRATUM'] == 3, 'STRATUM_NAME'] = 'Backwater Area'
df.loc[df['STRATUM'] == 4, 'STRATUM_NAME'] = 'Pepin/Swan Lake'
df.loc[df['STRATUM'] == 5, 'STRATUM_NAME'] = 'Impounded'
df.loc[df['STRATUM'] == 6, 'STRATUM_NAME'] = 'Isolated'
df.loc[df['STRATUM'] == 7, 'STRATUM_NAME'] = 'New Terrestrial'
df.loc[df['STRATUM'] == 9, 'STRATUM_NAME'] = 'Pool 13'

print("done")

done


In [15]:
print(data["TP"].isna().sum())

129205


In [41]:
data.drop(data.columns.difference(['SHEETBAR','TN','TP','TPQF','TNQF','SS','SSQF',
                                         'TURB','TURBQF','WDP',
                                         'TEMP','TEMPQF','DO','DOQF','COND',
                                         'CONDQF','VEL','VELQF','FLDEAST',
                                         'FLDNORTH','PROJCD','FLDNUM','DATE',
                                  'LOCATCD','STRATUM','CHLcal','SECCHI','SECCHIQF','LATITUDE','LONGITUDE']), 1, inplace=True)
print("After filtering columns: ",data.shape)
data = data[(data.PROJCD == "M-")]
print("After filtering sampling design: ",data.shape)
data = data[(data.FLDNUM == 4)]
print("After filtering Pool 26: ",data.shape)
print("Now adding a year column")
data["YEAR"] = pd.DatetimeIndex(data["DATE"]).year
print(data.shape)
print("Adding a timecode column")
data["TIME CODE"] = data["LOCATCD"].astype(str).apply(lambda x: x[3])
print(data.shape)
print("Filtering by backwater lakes")
data = data[data.STRATUM == 3]
print(data.shape)
#print("Filtering by summer")
#data = data[data["TIME CODE"] == '2']
#print(data.shape)
print("Dropping data with SSQF=8 or 64")
qualdata = data[(data["SSQF"]!=8)&(data["SSQF"]!=64)]
print(qualdata.shape)
print("Dropping all blank columns")
qualdata.drop(['PROJCD','FLDEAST','FLDNORTH','TPQF','TNQF','SSQF','TURBQF','TEMPQF','DOQF',
                                        'CONDQF','VELQF','SECCHIQF'], 1, inplace=True)
print(qualdata.shape)
print("Filtering out points with blank entries in columns other than TP and TN")
f_cols = ['SS','CHLcal']
s_cols = ['VEL','TEMP']
all_cols = ['TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
qualdata_noprediction = filterblanks(all_cols,qualdata,True)
print(qualdata_noprediction.shape)

After filtering columns:  (0, 30)
After filtering sampling design:  (0, 30)
After filtering Pool 26:  (0, 30)
Now adding a year column
(0, 31)
Adding a timecode column
(0, 32)
Filtering by backwater lakes
(0, 32)
Dropping data with SSQF=8 or 64
(0, 32)
Dropping all blank columns
(0, 20)
Filtering out points with blank entries in columns other than TP and TN
(0, 20)


In [39]:
qualdata_noprediction.columns

Index(['SHEETBAR', 'FLDNUM', 'DATE', 'LOCATCD', 'WDP', 'SECCHI', 'STRATUM',
       'TEMP', 'DO', 'TURB', 'COND', 'VEL', 'TP', 'TN', 'SS', 'CHLcal',
       'LATITUDE', 'LONGITUDE', 'YEAR', 'TIME CODE'],
      dtype='object')

In [40]:
print("Building a new dataframe with predicted TP and TN values")
#s = qualdata_noprediction["LOCATCD"].duplicated(keep=False)
# get the years and timecodes for this dataset
#predictions can only be made if the point is in the same year and time code
years = qualdata_noprediction["YEAR"].unique()
timecodes = qualdata_noprediction["TIME CODE"].unique()
qualdata_prediction = pd.DataFrame()
for year in years:
    for timecode in timecodes:
        print("Appending predicted data for ",year," timecode ",timecode)
        # curset is the current set of rows we are predicting for
        curset = qualdata_noprediction[qualdata_noprediction["YEAR"]==year]
        curset = curset[curset["TIME CODE"]==timecode]
        curset["PredictedTN"] = 0
        curset["PredictedTP"] = 0
        print(curset.shape)
        DictTN = makeDict(curset,"TN")
        DictTP = makeDict(curset,"TP")
        #check to see if there are valid locations
        # that can be used to predict
        bad = bool((curset["TN"].isnull().sum()>(curset.shape[0]-2))|(curset["TP"].isnull().sum()>(curset.shape[0]-2)))
        print(curset["TN"].isnull().sum(),curset["TP"].isnull().sum())
        if(bad):
            print("Less than 2 locations have the variables in this set, dropping rows without variable")
            curset = curset[(curset["TP"].notnull())&(curset["TN"].notnull())]
            curset["PredictedTN"] = curset["TN"]
            curset["PredictedTP"] = curset["TP"]
            print("Cur set is now ",curset.shape)
        else:
            #put in predicted TN
            for index,row in curset.iterrows():
                if pd.isnull(row["TN"]):
                    #print("Predicting ",row["LOCATCD"])
                    curset.loc[index,"PredictedTN"] = predict(DictTN[row["LOCATCD"]])
                else:
                    curset.loc[index,"PredictedTN"] = row["TN"]
            #put in predicted TP
            for index,row in curset.iterrows():
                if pd.isnull(row["TP"]):
                    curset.loc[index,"PredictedTP"] = predict(DictTP[row["LOCATCD"]])
                else:
                    curset.loc[index,"PredictedTP"] = row["TP"] 
                    
        qualdata_prediction = qualdata_prediction.append(curset,ignore_index=True)  
print("Final data set size is ",qualdata_prediction.shape)

Building a new dataframe with predicted TP and TN values
Final data set size is  (0, 0)


In [37]:
qualdata_prediction.drop(qualdata_prediction.columns.difference(['TEMP','VEL','PredictedTN']), 1, inplace=True)
print("done")

AttributeError: 'pandas._libs.properties.AxisProperty' object has no attribute 'difference'

In [20]:
qualdata_prediction.isna().sum()

TEMP           0
VEL            0
PredictedTN    0
dtype: int64

In [24]:

qualdata_prediction.to_excel(r"C:\Users\forre\Desktop\REU\TDA\Data\Predicted_allyear_backwater_Barcode.xlsx")
print("done")

done


In [25]:
qualdata_prediction.shape

(5670, 3)

In [26]:
data["TN"].min(),data["TN"].max()

(0.185, 46.989)

In [30]:
qualdata_prediction = pd.DataFrame
print("done")

done


In [33]:
cols = ['TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
pure = filterblanks(cols,data,True)
pure.shape

(2219, 32)

TypeError: 'type' object is not subscriptable