In [None]:
# Purpose: estimate missing Total Nitrogen (TN) and Total Phosphorus (TP) values
# for a sample from the geographically closest samples that do have a measurement.

In [2]:
# Packages
import pickle
import pandas as pd
from geopy import distance
pd.set_option('display.max_columns', None)

In [3]:
# Classes
# This class stores the latitude and longitude of a sample, and indicates 
# if this location has the desired variable we are estimating
class Location:
    """One sample site plus the state of the variable being estimated.

    Public fields:
      ID        -- identifier of the site; this is what ``str()`` shows
      latitude  -- latitude of the sample
      longitude -- longitude of the sample
      hasv      -- whether this site has a value for the variable
      value     -- the stored value of that variable
    """

    def __init__(self, latitude, longitude, hasv, ID, value):
        self.ID = ID
        self.latitude, self.longitude = latitude, longitude
        self.hasv, self.value = hasv, value

    def __str__(self):
        # A location prints as its identifier only.
        identifier = self.ID
        return str(identifier)

# Calculates the distance between 2 samples in km
def getdist(S1,S2):
    """Return the distance between two samples, in kilometres.

    S1 and S2 must expose ``latitude`` and ``longitude`` attributes. Their
    (latitude, longitude) pairs are passed to ``distance.distance`` and the
    ``.km`` attribute of the result is returned.
    """
    endpoints = [(sample.latitude, sample.longitude) for sample in (S1, S2)]
    return distance.distance(*endpoints).km

# Filters rows by whether the given columns hold missing values
def filterblanks(columns,data,blank):
    """Return the rows of ``data`` selected by null-ness of ``columns``.

    blank truthy -- keep only rows that are non-null in every listed column
    blank falsy  -- keep only rows that are null in every listed column

    The columns are applied one after another; ``data`` itself is not
    modified, and it is returned as-is when ``columns`` is empty.
    """
    for name in columns:
        keep = data[name].notnull() if blank else data[name].isnull()
        data = data[keep]
    return data

# PRE: all locations in the dataframe are
# unique
def DistanceMatrix(dataframe,variable):
    """Build the symmetric matrix of pairwise distances between all rows.

    One ``Location`` is created per row of ``dataframe`` (in row order) from
    its "LATITUDE", "LONGITUDE" and "LOCATCD" columns; ``hasv`` is True when
    the row's ``variable`` entry is not null and ``value`` is that entry.

    Returns a DataFrame whose index and columns are those same Location
    objects. Off-diagonal cells hold ``getdist`` of the two locations and the
    diagonal is 0.0. The frame is float-valued from the start: the previous
    version created an integer frame and wrote floats into it cell by cell,
    which pandas deprecates as an incompatible-dtype assignment.
    """
    locations = []
    for _, row in dataframe.iterrows():
        value = row[variable]
        locations.append(
            Location(row["LATITUDE"], row["LONGITUDE"],
                     not pd.isnull(value), row["LOCATCD"], value)
        )

    count = len(locations)
    distances = [[0.0] * count for _ in range(count)]
    for low in range(count):
        for high in range(low + 1, count):
            # Each pair is measured once and mirrored across the diagonal.
            dist = getdist(locations[high], locations[low])
            distances[high][low] = dist
            distances[low][high] = dist

    return pd.DataFrame(distances, index=locations, columns=locations)

def changeVar(DM,data,variable):
    """Re-point the locations of a distance matrix at another variable.

    For every Location in ``DM.index`` the row of ``data`` whose "LOCATCD"
    equals the location's ID is looked up, and the location's ``hasv`` and
    ``value`` are updated in place from that row's ``variable`` entry.
    A location with a null entry, or with no matching row at all, gets
    ``hasv = False`` and ``value = None``. When several rows share the ID the
    first one is used (the matrix is built assuming unique locations).

    The previous version tested a whole pandas Series in an ``if``, which
    raises ValueError, and stored a Series instead of a scalar in ``value``.

    Returns None; ``DM`` is modified in place.
    """
    locations = DM.index
    for loc in locations:
        matches = data.loc[data["LOCATCD"] == loc.ID, variable]
        # Reduce the lookup to a single scalar before testing it.
        value = matches.iloc[0] if len(matches) else None
        if pd.isnull(value):
            loc.hasv = False
            loc.value = None
        else:
            loc.hasv = True
            loc.value = value

    # Kept from the original: both axes are labelled with the same objects.
    DM.index = locations
    DM.columns = locations
        
def getclosest(numclosest,distancematrix,location):
    """Return the nearest locations that have a value for the variable.

    Takes the column of ``distancematrix`` labelled ``location``, discards the
    entries whose index object has a falsy ``hasv``, sorts the remaining
    distances in ascending order and returns the first ``numclosest`` of them
    as a Series (index: Location objects, values: distances).
    ``distancematrix`` is left unchanged.
    """
    distances = distancematrix.loc[:, location]
    usable = [bool(candidate.hasv) for candidate in distances.index]
    # ``location`` itself is only excluded when its own hasv is falsy, which
    # is the situation this function is called in.
    ranked = distances[usable].sort_values()
    return ranked.iloc[:numclosest]

# Key: Location Code
# Value: List of tuples (locatcd,distance,value)
def makeDict(data,variable,numclosest=2):
    """Map every location lacking ``variable`` to its nearest valued neighbours.

    A distance matrix is built from ``data``; for each of its locations whose
    ``hasv`` is falsy, the ``numclosest`` nearest locations that do have a
    value are collected.

    Returns a dict keyed by location ID whose values are lists of
    ``(neighbour ID, distance, neighbour value)`` tuples, nearest first.
    """
    matrix = DistanceMatrix(data, variable)
    neighbours_by_id = {}
    for target in matrix.columns:
        if target.hasv:
            continue
        nearest = getclosest(numclosest, matrix, target)
        neighbours_by_id[target.ID] = [
            (neighbour.ID, dist, neighbour.value)
            for neighbour, dist in zip(nearest.index, nearest)
        ]
    return neighbours_by_id

def predict(tuples,numclosest = 2):
    """Estimate a missing value from its nearest neighbours.

    tuples     -- list of ``(location ID, distance, value)`` tuples, nearest
                  first (the lists produced by ``makeDict``)
    numclosest -- how many of the leading tuples to use (default 2)

    The estimate is the inverse-distance-weighted mean of the neighbours'
    values, so a closer neighbour contributes more. For two neighbours this
    is linear interpolation: weight d13/(d12+d13) on the first value and
    d12/(d12+d13) on the second. The previous version had these two weights
    swapped, which gave the farther neighbour the larger weight.

    Special cases:
      * a neighbour at distance 0 determines the result on its own (the mean
        of all zero-distance neighbours), avoiding a division by zero;
      * a single neighbour yields that neighbour's value;
      * no neighbours yields NaN.
    """
    neighbours = list(tuples)[:numclosest]
    if not neighbours:
        return float("nan")

    coincident = [value for _, dist, value in neighbours if dist == 0]
    if coincident:
        return sum(coincident) / len(coincident)

    weights = [1.0 / dist for _, dist, _ in neighbours]
    weighted_sum = sum(weight * value
                       for weight, (_, _, value) in zip(weights, neighbours))
    return weighted_sum / sum(weights)

# Adds a "Predicted<variable>" column to df for each requested variable
def addpredictions(df,variables,numclosest):
    """Fill in missing values of several variables from nearby samples.

    df         -- DataFrame with the columns ``DistanceMatrix`` reads plus
                  one column per entry of ``variables``
    variables  -- names of the columns to complete
    numclosest -- number of nearest valued neighbours used per prediction

    For each variable a new column ``"Predicted" + variable`` is written to
    ``df``: rows that already have a value keep it, rows without one get
    ``predict`` applied to their nearest valued neighbours (NaN when no
    neighbour has a value). ``df`` is modified in place and also returned.

    The distance matrix is computed once, for the first variable. For later
    variables only the ``hasv``/``value`` fields of its Location objects are
    refreshed; this relies on ``DistanceMatrix`` creating one Location per
    row, in row order.

    The previous version referenced an undefined ``closestDict``, never
    cleared its ``first`` flag and produced no output.
    """
    DM = None
    for var in variables:
        if DM is None:
            DM = DistanceMatrix(df, var)
        else:
            # Same sites, different variable: reuse the distances.
            for loc, observed in zip(DM.index, df[var]):
                missing = pd.isnull(observed)
                loc.hasv = not missing
                loc.value = None if missing else observed

        predictions = []
        for loc in DM.index:
            if loc.hasv:
                predictions.append(loc.value)
                continue
            closest = getclosest(numclosest, DM, loc)
            # (location id, distance, value) for every neighbour, nearest first
            tuples = [(near.ID, dist, near.value)
                      for near, dist in zip(closest.index, closest)]
            predictions.append(predict(tuples, numclosest) if tuples else float("nan"))

        df["Predicted" + var] = predictions
    return df
                
def _fill_predictions(frame, variable, closest):
    """Write the "Predicted<variable>" column of ``frame`` in one pass.

    Rows with a value for ``variable`` keep it; rows without one get
    ``predict`` applied to the neighbour list stored in ``closest`` under the
    row's "LOCATCD".
    """
    frame["Predicted" + variable] = [
        predict(closest[code]) if pd.isnull(observed) else observed
        for code, observed in zip(frame["LOCATCD"], frame[variable])
    ]


def run():
    """Add PredictedTN / PredictedTP to the global ``data`` and filter it.

    Missing "TN" and "TP" entries are replaced by predictions from the
    nearest valued neighbours (``makeDict`` + ``predict``); existing entries
    are copied unchanged. The shape of ``data`` is printed, rows with a blank
    in any of the analysis columns are removed with ``filterblanks``, and the
    shape of the filtered frame is printed.

    Returns the filtered frame (the previous version computed it and threw it
    away). The prediction columns are now built directly as whole columns
    instead of being initialised with integer 0 and overwritten row by row
    with floats.
    """
    DictTN = makeDict(data,"TN")
    DictTP = makeDict(data,"TP")
    _fill_predictions(data, "TN", DictTN)
    _fill_predictions(data, "TP", DictTP)

    print(data.shape)
    print("Filtering out points with blank entries in at least one of the columns")
    cols = ['PredictedTN','PredictedTP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
    qualdata_prediction = filterblanks(cols,data,True)
    print(qualdata_prediction.shape)
    return qualdata_prediction

In [17]:
# Load the data
# Reads the CSV into the notebook-global DataFrame `data`, which run() and the
# cells below read and modify. The path is relative to the notebook's working
# directory.
# NOTE(review): the stray warning text left under this cell suggests pandas
# complained while parsing (possibly mixed column dtypes) — TODO confirm and,
# if so, pass explicit dtypes here.
data = pd.read_csv("../pool data/ltrm_water_data_lat_long.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [12]:
# Filter out the columns we want
# Every column of `data` that is NOT in the list below is dropped in place.
# The axis is selected with the `columns=` keyword: passing it positionally
# (`drop(labels, 1, ...)`) is rejected by pandas 2.0 and later, where all
# arguments of DataFrame.drop except `labels` are keyword-only.
data.drop(columns=data.columns.difference(['SHEETBAR','TN','TP','TPQF','TNQF','SS','SSQF',
                                         'TURB','TURBQF','WDP',
                                         'TEMP','TEMPQF','DO','DOQF','COND',
                                         'CONDQF','VEL','VELQF','FLDEAST',
                                         'FLDNORTH','PROJCD','FLDNUM','DATE',
                                  'LOCATCD','STRATUM','CHLcal','SECCHI','SECCHIQF','LATITUDE','LONGITUDE']), inplace=True)

In [16]:
# Filter just the pool 4 upper
# A row is kept only when it satisfies all three conditions:
# FLDNUM equal to 1, LONGITUDE greater than -92.2, and STRATUM different
# from the string "4".
# NOTE(review): if STRATUM is stored as a number rather than text, the
# comparison with "4" never excludes anything — TODO confirm the dtype.
data = data[
    (data.FLDNUM == 1)
    & (data.LONGITUDE > - 92.2)
    & (data.STRATUM != "4")
]

In [14]:
# Add a year column
# The "DATE" column is parsed as datetimes and only the calendar year is kept.
data["YEAR"] = pd.DatetimeIndex(data["DATE"]).year
# Add a time code
# "TIME CODE" is the 4th character (index 3) of the location code, taken after
# converting it to text. A code shorter than 4 characters raises IndexError.
# NOTE(review): presumably that character encodes the sampling period —
# TODO confirm against the LOCATCD format.
data["TIME CODE"] = data["LOCATCD"].astype(str).apply(lambda x: x[3])

In [None]:
if ()