# Imports
This is the script for preprocessing the data so that it is ready for the TDA mapper algorithm.

In [None]:
import sys
import pandas as pd
from geopy import distance
import math
pd.set_option('display.max_columns', None)
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.preprocessing import PolynomialFeatures
print("imports done")
pd.set_option("display.max_rows", None)

# File
Start from the cleaned data file produced by the `R` script written by Amber and the `python` script written by Alaina. The `R` file is called `water cleaning data.Rmd` and can be found in the WaterCleaning folder of the GitHub repository. Next, run its output through the `python` notebook called `Data_Collapse.ipynb`. This results in the `cleaned_data.csv` file, which can be found in the GitHub repo as well. Download it, and edit the file path if necessary to read in the dataframe.

In [None]:
# Location of the input CSV, written relative to the notebook's working directory
# with Windows-style separators; edit this if the data lives elsewhere.
filePath = r"..\LTRM data\water_data_qfneg.csv"
# low_memory=False disables pandas' chunked parsing so column dtypes are inferred
# from the whole file rather than chunk by chunk.
dataFrame = pd.read_csv(filePath, low_memory = False)
print("dataFrame Made")

# Filter for your pool
Resets the index as well. This is done through the `FLDNUM` parameter, and can be found in accompanying documentation for the correct pool number. 

In [None]:
# Keep only the samples whose FLDNUM is 4, report how many rows survive,
# then renumber the rows from zero so later label-based lookups are contiguous.
in_pool = dataFrame['FLDNUM'] == 4
dataFrame = dataFrame[in_pool]
print(dataFrame.shape)
dataFrame = dataFrame.reset_index(drop=True)
dataFrame.head()

# Functions
Here, we interpolate missing data values, i.e. rows where the variable of interest was not recorded. The computation uses a $k$-nearest neighbors algorithm: a distance-weighted average of the $k$ nearest points that do have a value is used as the estimate, and the result is stored in a new column of the data set called `"PREDICTED_" + variable`, where `variable` is what we wish to interpolate (`TN` or `TP`, for example).

In [None]:
"""
Params:
df = the dataframe filtered for the pool
hashtable = the hash table of distances of each point for the data frame. (Created from construct_hashtable)
naVar = the variable we wish to interpolate
year = the year we wish to predict for
k = the number of terms in the weighted average for interpolation

NOTE: for now, set k = 2 due to potential bug for larger k

This is one of two predict functions. 
Here, multiple years worth of data can put in, and only the specified year will be predicted
and added to the dataframe. Note that this function will find the k nearnest neighbors using df,
regardless of year.
"""
def predict_years(df, hashtable, naVar, year, k):
    """Fill missing naVar values for one year, searching all of df for neighbours.

    Params:
    df = the dataframe filtered for the pool; it is modified in place
    hashtable = the spatial grid built by construct_hashtable for df
    naVar = name of the column to interpolate
    year = only rows whose YEAR equals this value are interpolated
    k = number of neighbours used in the weighted average

    A column "PREDICTED_" + naVar is created as a copy of naVar, and every row
    of the requested year that is missing naVar gets an interpolated value
    written into that column. Neighbours are taken from every year in df.
    Nothing is returned.

    NOTE: the notebook only calls this with k = 2; the neighbour search carries
    a "possible bug for k > 2" warning.
    """
    target = "PREDICTED_" + naVar
    df[target] = df[naVar]

    # Row labels that belong to the requested year and lack a measurement.
    needs_value = (df["YEAR"] == year) & df[naVar].isnull()
    missing_labels = df.index[needs_value]
    print("For " + naVar + " we will interpolate " + str(len(missing_labels)) + " points.")

    for label in missing_labels:
        distances, neighbors = k_nearest_neighbors(df, label, naVar, hashtable, k)
        df.loc[label, target] = interpolate(df, distances, neighbors, naVar)
    print(naVar + " interpolation success")
"""
Params:
df = the dataframe filtered for the pool
hashtable = the hash table of distances of each point for the data frame.
naVar = the variable we wish to interpolate
k = the number of terms in the weighted average for interpolation

NOTE: for now, keep k = 2 due to potential bug for k > 2

This predict function is more crude than predict_years. It will predict using missing values of naVar for 
the entire dataframe, using the entire dataframe to locate the k nearnest neighbors.
"""
def predict(df, hashtable, naVar, k):
    """Fill every missing naVar value in df using neighbours from the whole of df.

    Params:
    df = the dataframe filtered for the pool; it is modified in place
    hashtable = the spatial grid built by construct_hashtable for df
    naVar = name of the column to interpolate
    k = number of neighbours used in the weighted average

    A column "PREDICTED_" + naVar is created as a copy of naVar, then each row
    that is missing naVar receives an interpolated value in that column.
    Nothing is returned.

    NOTE: the notebook only calls this with k = 2; the neighbour search carries
    a "possible bug for k > 2" warning.
    """
    target = "PREDICTED_" + naVar
    df[target] = df[naVar]

    missing_labels = df.index[df[naVar].isnull()]
    print("For " + naVar + " we will interpolate " + str(len(missing_labels)) + " points.")

    for label in missing_labels:
        distances, neighbors = k_nearest_neighbors(df, label, naVar, hashtable, k)
        df.loc[label, target] = interpolate(df, distances, neighbors, naVar)
    print(naVar + " interpolation success")

    
"""
params:

minimums = lower bound of data, typically latitude or longitude
maximums = upper bound of data, typically latitude or longitude
x = data we wish to transfrom in to [0,1], typically a latitude or a longitude

This is a helper function for construct_hashtable and k_nearest_neighbors. It takes in a latitude or
a longitude and maps to in to [0,1] so the point can be plotted properly in the hashtable
"""    
    
def transform(minimum, maximum, x):
    """Linearly rescale x so that minimum maps to 0 and maximum maps to 1.

    Params:
    minimum = lower bound of the data (e.g. smallest latitude or longitude)
    maximum = upper bound of the data (e.g. largest latitude or longitude)
    x = the value to rescale

    Raises ZeroDivisionError when maximum equals minimum.
    """
    scale = 1 / (maximum - minimum)
    offset = x - minimum
    return scale * offset
"""
Params:
point 1 = First point (latitude and longitude)
point 2 = Second point (latitude and longitude)

Returns the distance in kilometers between two points in space, using
the geopy distance function.
"""
def dist(point1, point2):
    """Return the distance in kilometers between two (latitude, longitude) points.

    The computation is delegated to geopy's distance.distance.
    """
    separation = distance.distance(point1, point2)
    return separation.km
"""
params:
df = the dataframe

returns: hashtable (list of lists of lists of tuples(index, latitude, longitude))

Constructs a hash table of locations of points (the position where data is recorded)
This is used in the k nearest neighbors algorithm. Locations that are near each other in space
are near each other in the hashtable

"""    
def construct_hashtable(df):
    """Bucket every row of df into a square grid of nearby sampling locations.

    Params:
    df = the dataframe; must have LATITUDE and LONGITUDE columns

    Returns: hashtable, a list of lists of lists of tuples
    (index, latitude, longitude). hashtable[i][j] holds the rows whose scaled
    latitude falls in band i and whose scaled longitude falls in band j, so
    rows that are close in space land in the same or adjacent cells.

    The grid has int(sqrt(n)) + 1 cells per axis for n rows. Latitude and
    longitude bounds are padded by one degree on each side before scaling, so
    every scaled coordinate lies strictly inside (0, 1).

    Raises ZeroDivisionError when df has no rows.
    """
    # get hashtable information
    data_length = math.sqrt(df.shape[0])
    interval_length = 1 / data_length
    # Select the column as a Series so min()/max() return scalars directly;
    # indexing a label-indexed Series with [0] relies on deprecated positional
    # fallback.
    lat_minimum = df["LATITUDE"].min() - 1
    lat_maximum = df["LATITUDE"].max() + 1
    long_minimum = df["LONGITUDE"].min() - 1
    long_maximum = df["LONGITUDE"].max() + 1

    # construct hashtable
    side = int(data_length) + 1
    hashtable = [[[] for _ in range(side)] for _ in range(side)]

    # populate hashtable
    for index, r_lat, r_long in zip(df.index, df["LATITUDE"], df["LONGITUDE"]):
        lat = math.floor(transform(lat_minimum, lat_maximum, r_lat) / interval_length)
        long = math.floor(transform(long_minimum, long_maximum, r_long) / interval_length)
        hashtable[lat][long].append((index, r_lat, r_long))

    return hashtable
"""
Params:
df = dataframe
index = index of variable we wish to find k nearest neighbors of
naVar = variable to predict
hashtable = data structure created from construct_hashtable
k = number of nearest neighbors

Returns: (distances, indices) of k nearnest neighbors

This algorithm will find the k nearest neighbors of a desired point using the hashtable, if possible.
If there are no valid points near the given point, then the algorithm will use brute force
"""
def k_nearest_neighbors(df, index, naVar, hashtable, k):
    """Find up to k nearest rows to df.loc[index] that have a value for naVar.

    Params:
    df = dataframe the hashtable was built from (LATITUDE and LONGITUDE columns)
    index = row label of the point whose neighbours are wanted
    naVar = variable to predict; neighbours must have a non-NaN value for it
    hashtable = grid created by construct_hashtable(df)
    k = number of nearest neighbours

    Returns: (distances, indices), two lists of equal length sorted by
    increasing distance in km; distances[i] belongs to indices[i].

    The point's own grid cell and the (up to) eight surrounding cells are
    searched first. If that yields fewer than two usable rows, every row of df
    is scanned instead. Rows at distance exactly 0 (the point itself and any
    co-located sample) are never used, which keeps the inverse-distance
    weights finite.
    """
    data_length = math.sqrt(df.shape[0])
    interval_length = 1 / data_length
    # Must reproduce the padded bounds used by construct_hashtable so the point
    # maps to the same cell it was stored in.
    lat_minimum = df["LATITUDE"].min() - 1
    lat_maximum = df["LATITUDE"].max() + 1
    long_minimum = df["LONGITUDE"].min() - 1
    long_maximum = df["LONGITUDE"].max() + 1

    # The SEASON column is deliberately not read here: it was never used, and
    # requiring it broke callers whose frames have no SEASON column.
    row_na = df.loc[index]
    point_na = (row_na['LATITUDE'], row_na['LONGITUDE'])
    lat = math.floor(transform(lat_minimum, lat_maximum, point_na[0]) / interval_length)
    long = math.floor(transform(long_minimum, long_maximum, point_na[1]) / interval_length)

    def nearest(points):
        # Rank usable points by distance and keep the k closest. Storing
        # (distance, label) pairs keeps each distance tied to its own row, so
        # tied distances can no longer duplicate or drop neighbours.
        ranked = []
        for inx, latitude, longitude in points:
            distance_km = dist(point_na, (latitude, longitude))
            if distance_km != 0 and not np.isnan(df.loc[inx, naVar]):
                ranked.append((distance_km, inx))
        # Stable sort on distance only: ties keep their visiting order.
        ranked.sort(key=lambda pair: pair[0])
        return ranked[:k]

    # Own cell first, then the neighbouring cells, in the order the search has
    # always visited them (this order decides ties).
    side = len(hashtable)
    offsets = ((0, 0), (-1, 0), (-1, -1), (-1, 1), (1, 0), (1, -1), (1, 1), (0, -1), (0, 1))
    nearby = [
        point
        for d_lat, d_long in offsets
        if 0 <= lat + d_lat < side and 0 <= long + d_long < side
        for point in hashtable[lat + d_lat][long + d_long]
    ]

    found = nearest(nearby)
    if 0 < len(found) < k:
        print("INTERPOLATING WITH " + str(len(found)) + " POINTS INSTEAD OF " + str(k) + " POINTS")
    if len(found) < 2:
        # Too few usable points nearby: fall back to scanning every row.
        found = nearest(zip(df.index, df["LATITUDE"], df["LONGITUDE"]))

    return ([d for d, _ in found], [i for _, i in found])
    
"""
Params:
df = dataframe
distances = distances of the points being used to interpolate for the missing value 
neighbors  = the points for prediction
naVar = variable to predict

Returns the interpolated data value for a missing datapoint.
"""
def interpolate(df, distances, neighbors, naVar):
    """Return the inverse-distance-weighted average of naVar over the neighbours.

    Params:
    df = dataframe holding the neighbour rows
    distances = distances of the points used for the estimate (all non-zero)
    neighbors = row labels of those points, in the same order as distances
    naVar = variable to predict

    Each neighbour is weighted by (1 / distance) divided by the sum of all
    such terms, so closer points count more and the weights sum to 1.

    Returns NaN when there are no neighbours, so that a value which cannot be
    estimated stays missing instead of being recorded as a measured 0.
    """
    if len(distances) == 0:
        return np.nan
    inverse = [1 / d for d in distances]
    total = sum(inverse)
    return sum((weight / total) * df.loc[label, naVar]
               for weight, label in zip(inverse, neighbors))
print("Functions have been loaded")

# Season by Season Interpolation
Here, we begin the process of interpolating for missing data based upon the season. To do this, the dataframe we input needs a season column. To obtain this, we create a copy of the dataframe, and use this copy throughout the rest of the work. To obtain the season, we utilize the date recorded to get the season. We create a column for the particular month, and then use a dictionary to replace that value with the appropriate season.

In [None]:
# Work on a copy so the pool-filtered dataFrame itself is left untouched.
sample_months = pd.DatetimeIndex(dataFrame["DATE"]).month
newData = dataFrame.copy()
newData["MONTH"] = sample_months
newData["SEASON"] = newData["MONTH"]

# Month number -> season name, built from the three months of each season.
months_by_season = {
    'SPRING': (3, 4, 5),
    'SUMMER': (6, 7, 8),
    'FALL': (9, 10, 11),
    'WINTER': (12, 1, 2),
}
seasons = {
    month: season_name
    for season_name, season_months in months_by_season.items()
    for month in season_months
}

# Swap each month number in the SEASON column for its season name.
newData = newData.replace({"SEASON": seasons})
newData.head()

# Prediction Group by Season and several years

Here, we interpolate missing data values for a certain year, grouping data by season. For example, we can use spring data from 2001, 2002, and 2003 to predict spring data for 2002. In the list below, `x` represents the year we predict. Modify the lower bound to be the earliest year of data that you have.

Then, we create a year column on the data frame so that the predict function can select the correct year for prediction. Next, we interpolate data values year by year, season by season. When this is done, the output is written to a `.csv` file, so modify the path as necessary. The `if (seasonalFrame.shape[0] > 1)` check makes sure there is enough data present to do any interpolating, since we use `2` points to predict a missing third here.


In [None]:
# Variables to interpolate, and the sliding three-year windows: the middle
# year of each window is the one whose missing values get predicted.
continuous = ['TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
years = [[x-1, x, x+1] for x in range(1995, 2021)]
newData["YEAR"] = pd.DatetimeIndex(dataFrame["DATE"]).year

# Collect the per-season frames and concatenate once at the end;
# DataFrame.append no longer exists in pandas 2.x.
interpolatedFrames = []
for setOfYears in years:
    print("Set of years: " + str(setOfYears))
    print("Year to interpolate missing data: " + str(setOfYears[1]))
    threeYearFrame = newData[newData['YEAR'].isin(setOfYears)]
    seasons = ['SPRING','SUMMER','FALL','WINTER']
    for season in seasons:
        print(season)
        # Copy the slice: predict_years adds columns and writes values in place.
        seasonalFrame = threeYearFrame[threeYearFrame['SEASON'] == season].copy()
        # At least two rows are needed before any interpolation can happen.
        if (seasonalFrame.shape[0] > 1):
            seasonalHash = construct_hashtable(seasonalFrame)
            for var in continuous:
                predict_years(seasonalFrame, seasonalHash, var,setOfYears[1],2)

            # Only the middle year of the window is kept in the output.
            yearToAdd = seasonalFrame[seasonalFrame['YEAR'] == setOfYears[1]]
            interpolatedFrames.append(yearToAdd)

    print("Predicted for " + str(setOfYears[1]))

if interpolatedFrames:
    result = pd.concat(interpolatedFrames, ignore_index = True)
else:
    result = pd.DataFrame()
result.to_csv(r"..\pools_specific_EDA\Open River\allvars_interpolated_3yearsxseason.csv")
print("Done")

# Prediction group year by year

Here, we do predictions for missing data for a given year using the entire year's worth of data. As such, we use the `predict` function instead of the `predict_years` function. Here as well, we create a year column from the date in the data frame, which we then use to generate a list of years to predict data for. 

In [None]:
# Start again from the pool-filtered data and derive the sampling year.
newData = dataFrame.copy()
newData["YEAR"] = pd.DatetimeIndex(dataFrame["DATE"]).year
years = newData["YEAR"].unique()

# Collect each year's frame and concatenate once at the end;
# DataFrame.append no longer exists in pandas 2.x.
yearlyFrames = []
for year in years:
    # reset_index returns a new frame, so predict() does not write into newData.
    currentSet = newData[(newData["YEAR"] == year)]
    currentSet = currentSet.reset_index(drop = True)
    hashTable = construct_hashtable(currentSet)
    predict(currentSet, hashTable, "TN",2)
    predict(currentSet, hashTable, "TP",2)
    print("Predicted for " + str(year))
    yearlyFrames.append(currentSet)

if yearlyFrames:
    result = pd.concat(yearlyFrames, ignore_index = True)
else:
    result = pd.DataFrame()
result.to_csv(r"C:\Users\forre\Desktop\REU\TDA\Data\predicted_tn_tp_years.csv")
print("Done")

# Spatial interpolation by year, by season (Casey)

Load functions - parameter definitions and correct documentation still need to be completed

In [None]:
import pandas as pd
import numpy as np
import os
from geopy import distance
import time
import pickle
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler,PolynomialFeatures
import sklearn.metrics
pd.set_option('display.max_columns', None)

# useful functions and classes

# A single sampling location: where it is, which sample it came from, and
# whether it carries a value for the variable currently being estimated.
class Location:
    """Record for one sample used as a row/column label of the distance matrix.

    Attributes set by the constructor:
    latitude, longitude = coordinates of the sample
    hasv = whether this sample has a value for the variable being estimated
    ID = identifier shown when the location is printed
    value = the sample's value for the variable being estimated
    SHEET = key used to look the sample up again in the dataframe
    """

    def __init__(self,latitude,longitude,hasv,ID,value,SHEET):
        self.latitude, self.longitude = latitude, longitude
        self.ID, self.SHEET = ID, SHEET
        self.hasv, self.value = hasv, value

    def __str__(self):
        # Printing a location shows only its ID.
        return f"{self.ID!s}"

# Distance in km between two samples, computed by geopy from their coordinates.
def getdist(S1,S2):
    """Return the distance in kilometers between the locations S1 and S2.

    Both arguments must expose latitude and longitude attributes.
    """
    start = (S1.latitude, S1.longitude)
    end = (S2.latitude, S2.longitude)
    return distance.distance(start, end).km


# PRE: all locations in the dataframe are
# unique
def DistanceMatrix(dataframe,variable):
    """Build the symmetric matrix of pairwise distances (km) between all rows.

    dataframe - must have LATITUDE, LONGITUDE, LOCATCD and SHEETBAR columns
    variable - column being estimated; recorded on each Location as value/hasv

    RETURN - a float dataframe whose index and columns are the same list of
    Location objects (one per row of dataframe, in row order); entry [r, c] is
    the distance between location r and location c, with zeros on the diagonal.
    """
    # One Location per row; hasv is False when the row lacks the variable.
    locations = [
        Location(row["LATITUDE"], row["LONGITUDE"], not pd.isnull(row[variable]),
                 row["LOCATCD"], row[variable], row["SHEETBAR"])
        for _, row in dataframe.iterrows()
    ]

    # Fill a float array and wrap it once at the end: writing floats cell by
    # cell into an integer-initialised dataframe is slow and depends on
    # implicit dtype upcasting.
    count = len(locations)
    distances = np.zeros((count, count))
    for ci in range(count):
        for ri in range(ci + 1, count):
            # Each pair is measured once and mirrored across the diagonal.
            distances[ri, ci] = distances[ci, ri] = getdist(locations[ri], locations[ci])

    return pd.DataFrame(distances, index=locations, columns=locations)

def changeVar(DM,dataframe,variable):
    """Re-point an existing distance matrix at a different variable.

    DM - distance matrix whose index labels are Location objects
    dataframe - the rows the matrix was built from; must have a SHEETBAR column
    variable - the column whose values should now be recorded on the locations

    Each Location in DM.index is updated in place: hasv becomes False and value
    None when the matching row has no value for variable, otherwise hasv
    becomes True and value the row's value. Nothing is returned.

    A diagnostic is printed when a location's SHEETBAR does not match exactly
    one row. If it matches no row at all, the lookup below raises IndexError.
    """
    for loc in DM.index:
        row = dataframe.loc[dataframe["SHEETBAR"]==loc.SHEET]

        # Explicit check instead of assert, so it still runs under python -O.
        if row.shape[0] != 1:
            print(dataframe[dataframe["SHEETBAR"].duplicated(keep=False)])
            print("Multiple rows with same SHEETBAR")

        # Pull value of desired variable from the first matching row
        val = row.loc[row.index[0],variable]
        if pd.isnull(val):
            loc.hasv = False
            loc.value = None
        else:
            loc.hasv = True
            loc.value = val
        
def getclosest(numclosest,distancematrix,location):
    """Return the numclosest nearest usable locations to location.

    numclosest - how many neighbours to return at most
    distancematrix - distance matrix whose labels are Location objects
    location - the column label (a Location) to find neighbours for

    RETURN - a Series of distances, sorted ascending and labelled by Location,
    that excludes location itself and every location whose hasv is false.
    The distance matrix is not modified.
    """
    candidates = distancematrix.loc[:,location].copy()

    # Locations without a value for the variable cannot be used to predict.
    lacking_value = [label for label in candidates.index if not label.hasv]
    candidates = candidates.drop(lacking_value)

    # The target may already be gone (if it lacks a value), hence errors="ignore".
    candidates = candidates.drop(location,errors="ignore")

    return candidates.sort_values().iloc[0:numclosest]

# Key: SHEET of each location that needs predicting
# Value: List of tuples (SHEET of neighbour, distance, value of neighbour)
def makeDict(DM,numclosest,testing):
    """Map every location that needs a prediction to its nearest neighbours.

    DM - distance matrix whose column labels are Location objects
    numclosest - number of neighbours to record per location
    testing - when true, every location is included, not only those lacking a value

    RETURN - dict keyed by the location's SHEET; each value is a list of
    (neighbour SHEET, distance, neighbour value) tuples, nearest first.
    """
    closestDict = {}
    for loc in DM.columns:
        # Locations that already have a value are skipped unless testing.
        if loc.hasv and not testing:
            continue
        # The closest usable locations to loc, never including loc itself
        closest = getclosest(numclosest,DM,loc)
        closestDict[loc.SHEET] = [
            (neighbor.SHEET, separation, neighbor.value)
            for neighbor, separation in closest.items()
        ]
    return closestDict

def predict(tuples,numclosest = 2):
    """Estimate a missing value from its two nearest neighbours.

    tuples - list of (identifier, distance, value) tuples, nearest first; only
             the first two entries are used
    numclosest - accepted for interface compatibility; two neighbours are
                 always used

    RETURN - the inverse-distance-weighted average of the two neighbour
    values: the closer neighbour gets the larger weight, a neighbour at
    distance 0 determines the result on its own, and equidistant neighbours
    are averaged equally.

    Raises IndexError when fewer than two tuples are supplied.
    """
    _, d12, val2 = tuples[0]
    _, d13, val3 = tuples[1]

    if d12 == d13:
        # Equidistant (including both at distance 0): plain mean of the VALUES.
        return 0.5*val2+0.5*val3
    elif d12 == 0:
        return val2
    elif d13 == 0:
        return val3

    # Inverse-distance weights (1/d) / (1/d12 + 1/d13): each value is scaled
    # by the OTHER point's distance, so the nearer point dominates.
    c2 = d13/(d12+d13)
    c3 = d12/(d12+d13)
    return c2*val2+c3*val3

      
'''
data - the pandas dataframe that is ready to interpolate missing values
MUST HAVE "LATITUDE", "LONGITUDE","YEAR", "TIME CODE", "LOCATCD" columns

missing_vars - the list of column names (as strings) of the dataframe that we should attempt to fill in

numlocations - the number of locations used to predict the new value, default is 2 (currently the only option implemented)

RETURN - a dataframe with extra columns saying the predicted values of the missing_vars
'''
def linear_interpolate(data,missing_vars,numlocations = 2,testing = False,verbosity = 0):
    """Fill missing values from the nearest samples of the same pool, year and season.

    data - dataframe with YEAR, SEASON, FLDNUM, LATITUDE, LONGITUDE, LOCATCD
           and SHEETBAR columns plus every column named in missing_vars
    missing_vars - list of column names to fill in
    numlocations - number of neighbours used per prediction (the prediction
                   itself currently always uses two)
    testing - when true, a prediction is made for every row, not only rows
              with a missing value, and one extra valid location is required
    verbosity - 0 to 3; higher values print more progress information

    RETURN - a new dataframe with one extra "PREDICTED_<var>" column per
    variable. Rows are grouped by pool, year and season. Within a group, if a
    variable has too few valid locations to predict from, the rows missing
    that variable are DROPPED from the group and the predicted column just
    copies the measured values.
    """
    print("Building a new dataframe with predicted values")
    start_time = time.time()
    # predictions are only made from points in the same pool, year and season
    years = data["YEAR"].unique()
    seasons = data["SEASON"].unique()
    pools = data["FLDNUM"].unique()
    # In testing mode rows that have a value are predicted too, and a row is
    # never its own neighbour, so one more valid location is required.
    required = numlocations + 1 if testing else numlocations

    # Groups are collected here and concatenated once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    predicted_sets = []
    for pool in pools:
        for year in years:
            for season in seasons:
                if verbosity > 0:
                    print(f"Appending predicted data for {year}  {season}  FLDNUM {pool}")
                # curset is the current set of rows we are predicting for
                curset = data[(data["YEAR"]==year) & (data["SEASON"]==season) & (data["FLDNUM"]==pool)].copy()

                if verbosity > 1:
                    print("Size of this year and season:", curset.shape)

                # True until a distance matrix has been built for this group;
                # afterwards the matrix is reused and only its variable changes.
                first = True
                for var in missing_vars:
                    newcolumn = "PREDICTED_"+var
                    # Float placeholder; every row is overwritten below.
                    curset[newcolumn] = np.nan

                    # check to see if there are enough valid locations
                    # that can be used to predict
                    bad = bool(curset[var].notnull().sum() < required)

                    if bad:
                        if verbosity > 2:
                            print("Less than "+str(numlocations)+" locations have "+var+" in this set, dropping rows without "+var)
                        curset = curset[curset[var].notnull()].copy()
                        curset[newcolumn] = curset[var]
                        if verbosity > 2:
                            print("Current set is now ",curset.shape)
                    else:
                        if first:
                            if verbosity > 2:
                                print("Creating DM with ",var)
                            DM = DistanceMatrix(curset,var)
                            first = False
                        else:
                            if verbosity > 2:
                                print("Changing to ",var)
                            changeVar(DM,curset,var)

                        # Maps each SHEETBAR needing a prediction to its neighbour tuples
                        Dict = makeDict(DM,numlocations,testing)

                        # put in predicted variable
                        for index,row in curset.iterrows():
                            if pd.isnull(row[var]) or testing:
                                try:
                                    prediction = predict(Dict[row["SHEETBAR"]])
                                    curset.loc[index,newcolumn] = prediction
                                except ZeroDivisionError:
                                    print("Couldn't predict for ", str(row["SHEETBAR"]))
                                    print(Dict[row["SHEETBAR"]])
                                    curset.loc[index,newcolumn] = None
                            else:
                                curset.loc[index,newcolumn] = row[var]

                predicted_sets.append(curset)

    if predicted_sets:
        data_prediction = pd.concat(predicted_sets,ignore_index=True)
    else:
        data_prediction = pd.DataFrame()

    if verbosity > 0:
        print("Final data set size is ",data_prediction.shape)
    print(f"Interpolating took {(time.time()-start_time)/60} minutes")
    return data_prediction

#### Interpolating missing data

In [None]:
# Columns whose missing values should be filled in.
missing_vars = ['TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI']
# linear_interpolate reads the YEAR, SEASON and FLDNUM columns of its input.
# NOTE(review): the year-by-year cell rebuilds newData without a SEASON column -
# confirm newData has one (re-run the season cell) before running this.
water_interpolated = linear_interpolate(newData,missing_vars)

# Multivariate Regression

Filter for non missing data

In [None]:
# The two response variables (TP, TN) followed by the nine predictors.
cols = ['TP','TN','CHLcal','SS','VEL','DO','COND','WDP','TURB','TEMP','SECCHI']
print("Filtering out all rows with missing data")
# Keep only rows where every column in cols is present. The defaults already
# mean axis=0 / how='any'; passing how and thresh together is rejected by
# pandas 2.x.
# NOTE(review): water_data is not defined in the cells visible here - confirm
# which frame it is meant to be.
qualdata = water_data.dropna(subset=cols).copy()
print(qualdata.shape)
print("Filtering out colums that we dont need")
# Drop every column not listed in cols; the axis is given through the
# columns= keyword because pandas 2.x no longer accepts it positionally.
qualdata.drop(columns=qualdata.columns.difference(cols), inplace=True)
print(qualdata.shape)

In [None]:
# Model hyperparameter: the degree of the polynomial feature expansion.
d = 5


# Predictor matrix (nine water-quality variables) and the two targets.
X = np.array(qualdata[['CHLcal','SS','VEL','DO','COND','WDP','TURB','TEMP','SECCHI']])
TP = np.array(qualdata['TP'])
TN = np.array(qualdata['TN'])

# Rescale the predictors before expanding them into polynomial terms.
# NOTE(review): the scaler is fit on ALL rows, before the train/test split
# below, so test rows influence the scaling - confirm this is acceptable.
scaler = RobustScaler().fit(X)
X_standard = scaler.transform(X)


# The PolynomialFeatures class in sklearn.preprocessing can be used to transform a data matrix by
# adding higher-order and interaction terms for the existing features. It also adds a "zeroth"
# column consisting of all 1's that corresponds to the weight w_0 in a regression model.
poly = PolynomialFeatures(d)

# We "fit" the poly object to our data matrix to allow it to identify the structure of the data
# (notably the number of attributes, or columns, in the data matrix).
poly.fit(X_standard)
#poly.fit(X)


# Now we use poly.transform to add any higher-order terms to the data matrix.
# This also adds a zeroth attribute which is set to all 1's.
augmented_X = poly.transform(X_standard)
#augmented_X = poly.transform(X)


# Next we create one linear regression object per target (lm = "linear model").
# Because our augmented data matrix includes an all 1's column, we don't
# need to fit the intercept (w_0) here.
TP_lm = LinearRegression(fit_intercept=False)
TN_lm = LinearRegression(fit_intercept=False)


# Split data into 70% train / 30% test for each target.
# NOTE(review): no random_state is passed, so the two splits are drawn
# independently (TP and TN are trained/tested on different rows) and the
# reported errors change from run to run.
TPX_train, TPX_test, TPy_train, TPy_test = train_test_split(augmented_X, TP, train_size=0.7)
TNX_train, TNX_test, TNy_train, TNy_test = train_test_split(augmented_X, TN, train_size=0.7)

# Fit models using training data
TP_lm.fit(TPX_train, TPy_train)
TN_lm.fit(TNX_train, TNy_train)

# After fitting the regression model, we can estimate the error
# Training errors: mean squared residual on the rows used for fitting
TP_train_err = np.mean((TPy_train - TP_lm.predict(TPX_train)) ** 2)
TN_train_err = np.mean((TNy_train - TN_lm.predict(TNX_train)) ** 2)

# Test errors: mean squared residual on the held-out rows
TP_test_err = np.mean((TPy_test - TP_lm.predict(TPX_test)) ** 2)
TN_test_err = np.mean((TNy_test - TN_lm.predict(TNX_test)) ** 2)

# Report each mean squared error together with its square root (the RMSE),
# which is in the same units as the target.
print("TP training set mean squared error: {:.6f}".format(TP_train_err)," on average off {:.6f}".format(np.sqrt(TP_train_err)))
print("TN training set mean squared error: {:.6f}".format(TN_train_err)," on average off {:.6f}".format(np.sqrt(TN_train_err)),"\n")
print("TP test set mean squared error: {:.6f}".format(TP_test_err)," on average off {:.6f}".format(np.sqrt(TP_test_err)))
print("TN test set mean squared error: {:.6f}".format(TN_test_err)," on average off {:.6f}".format(np.sqrt(TN_test_err)),"\n")