# Imports
This is the script for preprocessing the data to create data ready for the TDA mapper algorithm. It implements the IDW (inverse distance weighting) interpolation methods, both with a single season and with multiple years adjacent to it.

Written by: Killian Davis & Frederick "Forrest" Miller (alphabetical order)

In [1]:
import sys
import pandas as pd
from geopy import distance
import math
pd.set_option('display.max_columns', None)
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.preprocessing import PolynomialFeatures
print("imports done")
pd.set_option("display.max_rows", None)

imports done


# File
This uses the `water_data_qfneg.csv` file that is cleaned and ready to have missing values interpolated. It creates a pandas dataframe.

In [2]:
# Location of the cleaned LTRM water-quality CSV, relative to the directory
# the notebook is run from (Windows-style separators).
filePath = r"..\LTRM data\water_data_qfneg.csv"
# Load the whole file into a pandas dataframe; low_memory = False asks pandas
# to parse the file in one pass rather than in chunks.
dataFrame = pd.read_csv(filePath, low_memory = False)
print("dataFrame Made")

dataFrame Made


# Filter for your pool
Filters the data to a single pool and resets the index. The pool is selected through the `FLDNUM` column; the correct pool number can be found in the accompanying documentation.

In [4]:
# Keep only the rows recorded for pool number 4 (the FLDNUM column).
dataFrame = dataFrame[dataFrame['FLDNUM'] == 4]
print(dataFrame.shape)
# Renumber the rows 0..n-1 so later cells can rely on a clean, unique index.
dataFrame  = dataFrame.reset_index(drop = True)
# Notebook display of the first few filtered rows.
dataFrame.head()

(11449, 18)


Unnamed: 0,SHEETBAR,DATE,LATITUDE,LONGITUDE,FLDNUM,STRATUM,LOCATCD,TN,TP,TEMP,DO,TURB,COND,VEL,SS,WDP,CHLcal,SECCHI
0,44000159,10/19/1993,38.873994,-90.189663,4,5,9343058,2.769,0.145,15.7,8.5,37.0,424.0,0.0,40.3,0.5,24.29702,27.0
1,44000160,10/19/1993,38.873772,-90.180452,4,5,9343059,3.049,0.146,15.7,8.3,43.0,424.0,0.0,48.6,1.1,26.6471,30.0
2,44000161,10/19/1993,38.869837,-90.166778,4,5,9343063,3.267,0.158,15.2,8.2,39.0,444.0,0.0,36.5,0.7,32.82694,32.0
3,44000162,10/19/1993,38.864269,-90.160085,4,5,9343065,3.345,0.161,15.2,8.4,37.0,456.0,0.0,43.3,1.5,42.57542,28.0
4,44000163,10/19/1993,38.877205,-90.173401,4,1,9343010,3.661,0.183,14.9,9.1,54.0,457.0,0.55,79.3,10.4,68.25222,26.0


# Functions
Here, we interpolate missing data values. The computation uses a $k$-nearest neighbors algorithm: a distance-weighted average of the $k$ nearest points is used to compute each missing value, which is stored in a new column of the data set called `"PREDICTED_" + variable`, where `variable` is what we wish to interpolate (`TN` or `TP`, for example).

In [5]:
"""
Params:
df = the dataframe filtered for the pool
hashtable = the hash table of distances of each point for the data frame. (Created from construct_hashtable)
naVar = the variable we wish to interpolate
year = the year we wish to predict for
k = the number of terms in the weighted average for interpolation

NOTE: for now, set k = 2 due to potential bug for larger k

This is one of two predict functions. 
Here, multiple years' worth of data can be put in, and only the specified year will be predicted
and added to the dataframe. Note that this function will find the k nearest neighbors using df,
regardless of year.
"""
def predict_years(df, hashtable, naVar, year, k):
    """Fill in missing naVar values for one year, in place.

    Params:
    df = the dataframe filtered for the pool; must have a YEAR column
    hashtable = spatial grid for df (created from construct_hashtable)
    naVar = the variable we wish to interpolate
    year = only rows of this year with a missing naVar are predicted
    k = the number of terms in the weighted average for interpolation

    A column "PREDICTED_" + naVar is added to df as a copy of naVar, and
    each missing entry belonging to `year` is overwritten with the
    interpolated value. Neighbors are searched for in all of df,
    regardless of year. Nothing is returned.

    NOTE: the original authors recommend k = 2 because of a suspected bug
    in the neighbor search for larger k.
    """
    predicted_col = "PREDICTED_" + naVar
    df[predicted_col] = df[naVar]

    # Labels of the rows that belong to the target year and lack a value.
    in_year = df[df["YEAR"] == year]
    missing_labels = in_year[in_year[naVar].isnull()].index

    print(f"For {naVar} we will interpolate {len(missing_labels)} points.")
    for label in missing_labels:
        distances, neighbors = k_nearest_neighbors(df, label, naVar, hashtable, k)
        df.loc[label, predicted_col] = interpolate(df, distances, neighbors, naVar)
    print(naVar + " interpolation success")
"""
Params:
df = the dataframe filtered for the pool
hashtable = the hash table of distances of each point for the data frame.
naVar = the variable we wish to interpolate
k = the number of terms in the weighted average for interpolation

NOTE: for now, keep k = 2 due to potential bug for k > 2

This predict function is more crude than predict_years. It will predict using missing values of naVar for 
the entire dataframe, using the entire dataframe to locate the k nearnest neighbors.
"""
def predict(df, hashtable, naVar, k):
    """Fill in every missing naVar value of df, in place.

    Params:
    df = the dataframe filtered for the pool
    hashtable = spatial grid for df (created from construct_hashtable)
    naVar = the variable we wish to interpolate
    k = the number of terms in the weighted average for interpolation

    A column "PREDICTED_" + naVar is added to df as a copy of naVar, and
    every missing entry is overwritten with the value interpolated from
    its nearest neighbors in df. Nothing is returned.

    NOTE: the original authors recommend k = 2 because of a suspected bug
    in the neighbor search for larger k.
    """
    predicted_col = "PREDICTED_" + naVar
    df[predicted_col] = df[naVar]

    # Labels of all rows that lack a recorded value for naVar.
    missing_labels = df[df[naVar].isnull()].index

    print(f"For {naVar} we will interpolate {len(missing_labels)} points.")
    for label in missing_labels:
        distances, neighbors = k_nearest_neighbors(df, label, naVar, hashtable, k)
        df.loc[label, predicted_col] = interpolate(df, distances, neighbors, naVar)
    print(naVar + " interpolation success")

    
"""
params:

minimum = lower bound of data, typically latitude or longitude
maximum = upper bound of data, typically latitude or longitude
x = data we wish to transform into [0,1], typically a latitude or a longitude

This is a helper function for construct_hashtable and k_nearest_neighbors. It takes in a latitude or
a longitude and maps to in to [0,1] so the point can be plotted properly in the hashtable
"""    
    
def transform(minimum, maximum, x):
    """Linearly rescale x so that minimum maps to 0 and maximum maps to 1.

    Params:
    minimum = lower bound of the data (typically a latitude or longitude)
    maximum = upper bound of the data; must differ from minimum
    x = the value to rescale

    Returns the rescaled value; it lies in [0, 1] whenever x lies between
    minimum and maximum.
    """
    span = maximum - minimum
    scale = 1 / span
    return scale * (x - minimum)
"""
Params:
point 1 = First point (latitude and longitude)
point 2 = Second point (latitude and longitude)

Returns the distance in kilometers between two points in space, using
the geopy distance function.
"""
def dist(point1, point2):
    """Return the distance in kilometers between two points.

    Params:
    point1 = first point, passed by callers as (latitude, longitude)
    point2 = second point, passed by callers as (latitude, longitude)

    The computation is delegated to geopy's distance.distance.
    """
    separation = distance.distance(point1, point2)
    return separation.km
"""
params:
df = the dataframe

returns: hashtable (list of lists of lists of tuples(index, latitude, longitude))

Constructs a hash table of locations of points (the position where data is recorded)
This is used in the k nearest neighbors algorithm. Locations that are near each other in space
are near each other in the hashtable

"""    
def construct_hashtable(df):
    """Bucket every row of df into a square grid by its location.

    Params:
    df = the dataframe; must be non-empty and have LATITUDE and LONGITUDE
         columns

    Returns: hashtable, a list of lists of lists of tuples
    (index, latitude, longitude). hashtable[i][j] holds the rows whose
    rescaled latitude falls in grid row i and rescaled longitude in grid
    column j, so rows that are close in space land in the same or in
    adjacent cells. The grid has int(sqrt(n)) + 1 cells per side, where n
    is the number of rows.

    Raises ValueError if df has no rows.
    """
    row_count = df.shape[0]
    if row_count == 0:
        # Without this guard the interval computation below would fail with
        # an unhelpful ZeroDivisionError.
        raise ValueError("construct_hashtable requires a non-empty dataframe")

    # get hashtable information
    data_length = math.sqrt(row_count)
    interval_length = 1 / data_length
    # Pad the bounding box by one degree on every side so the rescaled
    # coordinates stay strictly inside (0, 1) and always map to a valid cell.
    lat_minimum = df["LATITUDE"].min() - 1
    lat_maximum = df["LATITUDE"].max() + 1
    long_minimum = df["LONGITUDE"].min() - 1
    long_maximum = df["LONGITUDE"].max() + 1

    # construct hashtable
    grid_size = int(data_length) + 1
    hashtable = [[[] for _ in range(grid_size)] for _ in range(grid_size)]

    # populate hashtable
    for index, r_lat, r_long in zip(df.index, df["LATITUDE"], df["LONGITUDE"]):
        lat = math.floor(transform(lat_minimum, lat_maximum, r_lat) / interval_length)
        long = math.floor(transform(long_minimum, long_maximum, r_long) / interval_length)
        hashtable[lat][long].append((index, r_lat, r_long))

    return hashtable
"""
Params:
df = dataframe
index = index of variable we wish to find k nearest neighbors of
naVar = variable to predict
hashtable = data structure created from construct_hashtable
k = number of nearest neighbors

Returns: (distances, indices) of k nearnest neighbors

This algorithm will find the k nearest neighbors of a desired point using the hashtable, if possible.
If there are no valid points near the given point, then the algorithm will use brute force
"""
def k_nearest_neighbors(df, index, naVar, hashtable, k):
    """Find the nearest rows to `index` that have a recorded naVar value.

    Params:
    df = dataframe the hashtable was built from; needs LATITUDE, LONGITUDE
         and naVar columns
    index = index label of the row whose neighbors are wanted
    naVar = variable to predict; rows where it is missing are never returned
    hashtable = data structure created from construct_hashtable(df)
    k = maximum number of neighbors to return

    Returns: (distances, indices), two parallel lists sorted by ascending
    distance in kilometers and at most k entries long. Both are empty when
    no other row has a value for naVar.

    The grid cell containing the point and its eight surrounding cells are
    searched first. If that yields at least two usable neighbors the result
    is returned; otherwise every row of df is scanned by brute force.
    Rows at distance 0 (which includes the row itself) are skipped because
    a zero distance cannot be inverted for inverse-distance weighting.

    Each distance stays paired with its own row index, so rows that are
    tied at the same distance are each returned once, for any k.
    """
    # Recompute the grid geometry exactly as construct_hashtable does, so the
    # point is looked up in the cell it would have been stored in.
    data_length = math.sqrt(df.shape[0])
    interval_length = 1 / data_length
    lat_minimum = df["LATITUDE"].min() - 1
    lat_maximum = df["LATITUDE"].max() + 1
    long_minimum = df["LONGITUDE"].min() - 1
    long_maximum = df["LONGITUDE"].max() + 1

    row_na = df.loc[index]
    point_na = (row_na['LATITUDE'], row_na['LONGITUDE'])
    lat = math.floor(transform(lat_minimum, lat_maximum, point_na[0]) / interval_length)
    long = math.floor(transform(long_minimum, long_maximum, point_na[1]) / interval_length)

    # Gather the rows stored in the 3x3 block of cells around the point,
    # skipping cells that fall outside the grid.
    grid_size = len(hashtable)
    local_candidates = []
    for lat_offset, long_offset in _NEIGHBOR_CELL_OFFSETS:
        cell_lat = lat + lat_offset
        cell_long = long + long_offset
        if 0 <= cell_lat < grid_size and 0 <= cell_long < grid_size:
            local_candidates.extend(hashtable[cell_lat][cell_long])

    distances, neighbor_indices = _rank_candidates(df, point_na, naVar, local_candidates, k)
    if len(neighbor_indices) >= 2:
        _report_short_result(neighbor_indices, k)
        return (distances, neighbor_indices)

    # Too few usable points nearby: fall back to scanning the whole frame.
    all_rows = zip(df.index, df["LATITUDE"], df["LONGITUDE"])
    distances, neighbor_indices = _rank_candidates(df, point_na, naVar, all_rows, k)
    _report_short_result(neighbor_indices, k)
    return (distances, neighbor_indices)


# Grid cells to inspect, relative to the cell holding the point: the cell
# itself first, then the row below, the row above, and the left/right cells.
_NEIGHBOR_CELL_OFFSETS = (
    (0, 0),
    (-1, 0), (-1, -1), (-1, 1),
    (1, 0), (1, -1), (1, 1),
    (0, -1), (0, 1),
)


def _rank_candidates(df, point_na, naVar, candidates, k):
    """Return (distances, indices) of the k closest usable candidates."""
    values = df[naVar]
    scored = []
    for inx, latitude, longitude in candidates:
        # A row without a recorded value cannot contribute to the average;
        # checking this first avoids a needless geodesic computation.
        if pd.isna(values.at[inx]):
            continue
        distance_km = dist(point_na, (latitude, longitude))
        if distance_km != 0:
            scored.append((distance_km, inx))
    # Stable sort on distance only: ties keep the order they were found in.
    scored.sort(key=lambda pair: pair[0])
    nearest = scored[:k]
    return [pair[0] for pair in nearest], [pair[1] for pair in nearest]


def _report_short_result(neighbor_indices, k):
    """Print a notice when fewer than k (but some) neighbors are returned."""
    if 0 < len(neighbor_indices) < k:
        print("INTERPOLATING WITH " + str(len(neighbor_indices)) + " POINTS INSTEAD OF " + str(k) + " POINTS")
    
"""
Params:
df = dataframe
distances = distances of the points being used to interpolate for the missing value 
neighbors  = the points for prediction
naVar = variable to predict

Returns the interpolated data value for a missing datapoint.
"""
def interpolate(df, distances, neighbors, naVar):
    """Return the inverse-distance-weighted average of the neighbors' values.

    Params:
    df = dataframe holding the neighbors
    distances = distances of the points being used to interpolate; every
                entry must be non-zero
    neighbors = index labels in df of those points, parallel to distances
    naVar = variable to predict

    Each neighbor is weighted by 1 / distance, and the weights are
    normalised to sum to 1.

    Returns the interpolated value, or NaN when there are no neighbors to
    interpolate from, so that the value stays missing instead of being
    recorded as 0.
    """
    # Pair each distance with its neighbor so both lists are consumed in step.
    pairs = list(zip(distances, neighbors))
    if not pairs:
        return float("nan")

    weights = [1 / distance_km for distance_km, _ in pairs]
    total_weight = sum(weights)
    result = 0
    for weight, (_, neighbor) in zip(weights, pairs):
        result += (weight / total_weight) * df.loc[neighbor, naVar]
    return result
print("Functions have been loaded")

Functions have been loaded


# Season by Season Interpolation
Here, we begin the process of interpolating for missing data based upon the season. To do this, the dataframe we input needs a season column. To obtain this, we create a copy of the dataframe, and use this copy throughout the rest of the work. To obtain the season, we utilize the date recorded. We create a column for the particular month, and then use a dictionary to replace that value with the appropriate season.

In [6]:
# Work on a copy so the filtered pool data in dataFrame stays untouched.
df = dataFrame.copy()

# Parse the DATE column once and derive the calendar month and year from it.
sample_dates = pd.DatetimeIndex(dataFrame["DATE"])
df["MONTH"] = sample_dates.month
df["YEAR"] = sample_dates.year

# Month number -> season name.
seasons = {
    12: 'WINTER', 1: 'WINTER', 2: 'WINTER',
    3: 'SPRING', 4: 'SPRING', 5: 'SPRING',
    6: 'SUMMER', 7: 'SUMMER', 8: 'SUMMER',
    9: 'FALL', 10: 'FALL', 11: 'FALL',
}
df["SEASON"] = df["MONTH"].map(seasons)
df.head()

Unnamed: 0,SHEETBAR,DATE,LATITUDE,LONGITUDE,FLDNUM,STRATUM,LOCATCD,TN,TP,TEMP,DO,TURB,COND,VEL,SS,WDP,CHLcal,SECCHI,MONTH,YEAR,SEASON
0,44000159,10/19/1993,38.873994,-90.189663,4,5,9343058,2.769,0.145,15.7,8.5,37.0,424.0,0.0,40.3,0.5,24.29702,27.0,10,1993,FALL
1,44000160,10/19/1993,38.873772,-90.180452,4,5,9343059,3.049,0.146,15.7,8.3,43.0,424.0,0.0,48.6,1.1,26.6471,30.0,10,1993,FALL
2,44000161,10/19/1993,38.869837,-90.166778,4,5,9343063,3.267,0.158,15.2,8.2,39.0,444.0,0.0,36.5,0.7,32.82694,32.0,10,1993,FALL
3,44000162,10/19/1993,38.864269,-90.160085,4,5,9343065,3.345,0.161,15.2,8.4,37.0,456.0,0.0,43.3,1.5,42.57542,28.0,10,1993,FALL
4,44000163,10/19/1993,38.877205,-90.173401,4,1,9343010,3.661,0.183,14.9,9.1,54.0,457.0,0.55,79.3,10.4,68.25222,26.0,10,1993,FALL


# Prediction Group by Season and several years

Here, we interpolate missing data values for a certain year, grouping data by season. For example, we can use spring data from 2001, 2002, and 2003 to predict spring data for 2002. In the list below, `x` represents the year we predict. Modify the lower bound to be the earliest year of data that you have.

Then, we create a year column on the data frame to allow the predict function to get the correct year for prediction purposes. Then, we interpolate data values year by year, season by season. After it is done, it sends the output to a `.csv` file, so modify the path as necessary. The `if (seasonalFrame.shape[0] > 1)` is a check to make sure there is enough data present to do any interpolating, since we use `2` points to predict a missing third here.


In [None]:
continuous = ['TN','TP','TEMP','DO','TURB','COND','VEL','SS','WDP','CHLcal','SECCHI'] # variables to interpolate
years = [[x-1, x, x+1] for x in range(1995, 2021)] # start and end years from LTRM dataset

# Start from df, the copy that carries the SEASON column this cell filters on.
# (newData is not defined yet when the notebook is run top to bottom.)
newData = df.copy()
newData["YEAR"] = pd.DatetimeIndex(newData["DATE"]).year

seasons = ['SPRING','SUMMER','FALL','WINTER']
# Collect the per-season frames and concatenate once at the end;
# DataFrame.append no longer exists in pandas 2.x.
interpolatedFrames = []
for setOfYears in years:
    targetYear = setOfYears[1]  # the middle year is the one being predicted
    print("Set of years: " + str(setOfYears))
    print("Year to interpolate missing data: " + str(targetYear))
    threeYearFrame = newData[newData['YEAR'].isin(setOfYears)]
    for season in seasons:
        print(season)
        # Copy the slice: predict_years adds PREDICTED_* columns in place.
        seasonalFrame = threeYearFrame[threeYearFrame['SEASON'] == season].copy()
        # At least two rows are needed to interpolate anything.
        if seasonalFrame.shape[0] > 1:
            seasonalHash = construct_hashtable(seasonalFrame)
            for var in continuous:
                predict_years(seasonalFrame, seasonalHash, var, targetYear, 2)

            # Only the target year's rows are kept; the adjacent years were
            # present solely to supply neighbors.
            interpolatedFrames.append(seasonalFrame[seasonalFrame['YEAR'] == targetYear])

    print("Predicted for " + str(targetYear))

if interpolatedFrames:
    result = pd.concat(interpolatedFrames, ignore_index = True)
else:
    result = pd.DataFrame()

result.to_csv(r"..\pools_specific_EDA\Open River\allvars_interpolated_3yearsxseason.csv") # can change outputs
print("Done")

# Prediction group year by year

Here, we do predictions for missing data for a given year using the entire year's worth of data. As such, we use the `predict` function instead of the `predict_years` function. Here as well, we create a year column from the date in the data frame, which we then use to generate a list of years to predict data for. 

In [None]:
# Work on a copy of the filtered pool data and tag every row with its year.
newData = dataFrame.copy()
newData["YEAR"] = pd.DatetimeIndex(dataFrame["DATE"]).year
years = newData["YEAR"].unique()

# Collect one interpolated frame per year and concatenate once at the end;
# DataFrame.append no longer exists in pandas 2.x.
yearlyFrames = []
for year in years:
    currentSet = newData[(newData["YEAR"] == year)]
    # A fresh 0..n-1 index gives predict() an independent frame to write to.
    currentSet = currentSet.reset_index(drop = True)
    hashTable = construct_hashtable(currentSet)
    predict(currentSet, hashTable, "TN",2)
    predict(currentSet, hashTable, "TP",2)
    print("Predicted for " + str(year))
    yearlyFrames.append(currentSet)

if yearlyFrames:
    result = pd.concat(yearlyFrames, ignore_index = True)
else:
    result = pd.DataFrame()
result.to_csv(r"C:\Users\forre\Desktop\REU\TDA\Data\predicted_tn_tp_years.csv") # can change
print("Done")