# Imports
This is the script for preprocessing the data so that it is ready for the TDA Mapper algorithm.

In [6]:
# Dependencies used throughout the notebook, plus notebook-wide display settings.
import sys
import pandas as pd
from geopy import distance
import math
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas deprecation and chained-assignment warnings raised by the cells below.
warnings.filterwarnings('ignore')

print("imports done")
# Show every row when a DataFrame is displayed (no row truncation).
pd.set_option("display.max_rows", None)


imports done


# File
Load the cleaned data file produced by the `R` script written by Amber and the `python` script written by Alaina.

In [7]:
# Absolute path to the cleaned CSV on the original author's machine.
# NOTE(review): edit this path to point at your own copy of cleaned_data.csv before running.
filePath = r"C:\Users\forre\Desktop\REU\TDA\github\UMR-TDA-2021\LTRM data\cleaned_data.csv"
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# which avoids mixed-type inference within a column.
dataFrame = pd.read_csv(filePath, low_memory = False)
print("dataFrame Made")

dataFrame Made


# Filter for your pool
Resets the index as well.


In [9]:
# Keep only the rows whose FLDNUM equals 4 (change the number to select another
# pool), report the filtered shape, then renumber the rows from zero.
dataFrame = dataFrame.loc[dataFrame['FLDNUM'].eq(4)]
print(dataFrame.shape)
dataFrame = dataFrame.reset_index(drop=True)
dataFrame.head()

(14953, 18)


Unnamed: 0,SHEETBAR,TN,TP,TEMP,DO,TURB,COND,VEL,SS,WDP,CHLcal,SECCHI,LONGITUDE,LATITUDE,DATE,FLDNUM,LOCATCD,STRATUM
0,44002201,,,2.4,14.2,15.0,455.0,,-0.2,9.5,,53.0,-90.683117,39.004773,11/29/1995,4,M241.4K,
1,44002202,,,2.2,14.1,11.0,459.0,0.0,-2.0,0.94,,64.0,-90.673138,38.940668,11/29/1995,4,M237.2G,
2,44002759,1.323,,28.6,3.8,66.0,306.0,,76.9,3.2,,,-90.549129,38.852878,06/24/1996,4,DC01.0M,
3,44002760,2.562,,30.6,6.3,93.0,270.0,,54.6,,,,-90.748878,38.924233,06/24/1996,4,CU11.6M,
4,44002761,2.625,,30.1,10.0,29.0,337.0,,35.6,,,,-90.657982,38.883755,06/24/1996,4,PE01.8M,


# Functions
Here, we interpolate missing data values, which occur wherever the LTRM data set has no recorded value for a variable. Each missing value is computed with a $k$-nearest neighbors approach: it is replaced by an inverse-distance-weighted average of the values at the $k$ nearest points.

In [11]:
def predict_years(df, hashtable, naVar, year, k):
    """Interpolate the missing values of ``naVar`` for the rows of a single year.

    A column named ``"PREDICTED_" + naVar`` is added to (or overwritten on)
    ``df`` in place. It starts as a copy of ``df[naVar]``; every row whose
    ``YEAR`` equals ``year`` and whose ``naVar`` is null then receives the
    value returned by ``interpolate`` for its ``k`` nearest neighbours.
    Neighbours are searched in the whole of ``df``, not only in ``year``.

    Parameters:
        df: DataFrame with ``YEAR`` and ``naVar`` columns, plus the columns
            that ``k_nearest_neighbors`` reads.
        hashtable: spatial grid built by ``construct_hashtable`` from ``df``.
        naVar: name of the column to interpolate.
        year: the only year whose missing values are filled.
        k: number of nearest neighbours to use.

    Returns:
        None; ``df`` is modified in place.
    """
    predicted_column = "PREDICTED_" + naVar
    df[predicted_column] = df[naVar]

    # Select the target labels with a boolean mask; the original deep-copied the
    # whole frame on every call just to filter it.
    needs_value = (df["YEAR"] == year) & df[naVar].isnull()
    target_labels = df.index[needs_value]
    print("For " + naVar + " we will interpolate " + str(len(target_labels)) + " points.")

    # Only the labels are needed, so iterate over them instead of iterrows().
    for index in target_labels:
        distances, neighbors = k_nearest_neighbors(df, index, naVar, hashtable, k)
        df.loc[index, predicted_column] = interpolate(df, distances, neighbors, naVar)
    print(naVar + " interpolation success")

def predict(df, hashtable, naVar, k):
    """Interpolate every missing value of ``naVar`` in ``df``.

    Writes a ``"PREDICTED_" + naVar`` column onto ``df`` in place: a copy of
    ``df[naVar]`` in which each null entry is replaced by the value that
    ``interpolate`` computes from the row's ``k`` nearest neighbours.
    Returns None.
    """
    target_column = "PREDICTED_" + naVar
    df[target_column] = df[naVar]

    # Labels of the rows that have no recorded value for naVar.
    missing_labels = df.index[df[naVar].isnull()]
    print("For " + naVar + " we will interpolate " + str(len(missing_labels)) + " points.")

    for label in missing_labels:
        distances, neighbors = k_nearest_neighbors(df, label, naVar, hashtable, k)
        df.loc[label, target_column] = interpolate(df, distances, neighbors, naVar)
    print(naVar + " interpolation success")

def transform(minimum, maximum, x):
    """Linearly rescale ``x`` so that ``minimum`` maps to 0 and ``maximum`` maps to 1."""
    span = maximum - minimum
    offset = x - minimum
    # Multiply by the reciprocal (rather than dividing) so the rounding is
    # unchanged; callers floor the scaled result to pick a grid cell.
    return (1 / span) * offset

def dist(point1, point2):
    """Return the distance between two points in kilometres.

    Both points are passed straight to geopy's ``distance.distance``; callers in
    this notebook supply ``(latitude, longitude)`` tuples.
    """
    separation = distance.distance(point1, point2)
    return separation.km
    
def construct_hashtable(df):
    """Bucket the rows of ``df`` into a square spatial grid by latitude/longitude.

    The grid has ``int(sqrt(len(df))) + 1`` cells per side. Latitude and
    longitude are each rescaled to the unit interval using the column's range
    padded by one degree on both sides, and the scaled value is divided by the
    cell width ``1 / sqrt(len(df))`` and floored to give the cell coordinate.

    Parameters:
        df: DataFrame with ``LATITUDE`` and ``LONGITUDE`` columns.

    Returns:
        A list of lists of lists; ``grid[lat_cell][long_cell]`` holds one
        ``(index_label, latitude, longitude)`` tuple per row falling in that
        cell. An empty ``df`` yields a single empty cell, ``[[[]]]``.
    """
    row_count = df.shape[0]
    if row_count == 0:
        # Nothing to bucket; the sizing rule gives a 1 x 1 grid for zero rows.
        # Without this guard the cell width below divides by zero.
        return [[[]]]

    #get hashtable information
    data_length = math.sqrt(row_count)
    interval_length = 1 / data_length
    # Reduce the column directly to a scalar. The previous form,
    # df[["LATITUDE"]].min()[0], looked up position 0 on a label-indexed Series,
    # which pandas has deprecated.
    lat_minimum = df["LATITUDE"].min() - 1
    lat_maximum = df["LATITUDE"].max() + 1
    long_minimum = df["LONGITUDE"].min() - 1
    long_maximum = df["LONGITUDE"].max() + 1

    #construct hashtable
    cells_per_side = int(data_length) + 1
    hashtable = [[[] for _ in range(cells_per_side)] for _ in range(cells_per_side)]

    #populate hashtable
    # Only the label and the two coordinates are needed, so walk those columns
    # directly instead of building a full row Series per record with iterrows().
    for index, r_lat, r_long in zip(df.index, df["LATITUDE"], df["LONGITUDE"]):
        lat = math.floor(transform(lat_minimum, lat_maximum, r_lat) / interval_length)
        long = math.floor(transform(long_minimum, long_maximum, r_long) / interval_length)
        hashtable[lat][long].append((index, r_lat, r_long))

    return hashtable

def _closest_candidates(candidates, k):
    """Split the ``k`` nearest ``(distance_km, index)`` pairs into two parallel lists."""
    # sorted() is stable, so equidistant candidates keep the order they were found in.
    nearest = sorted(candidates, key=lambda pair: pair[0])[:k]
    return [d for d, _ in nearest], [inx for _, inx in nearest]


def k_nearest_neighbors(df, index, naVar, hashtable, k):
    """Find up to ``k`` nearest rows to row ``index`` that have a value for ``naVar``.

    The search first looks only at the grid cell containing the row and the
    (up to eight) cells around it. If that yields fewer than two usable
    neighbours, every row of ``df`` is scanned instead. Rows whose ``naVar`` is
    missing, and rows at distance exactly 0 (the row itself and any co-located
    row), are never returned.

    Parameters:
        df: DataFrame with ``LATITUDE``, ``LONGITUDE`` and ``naVar`` columns.
            It must be the frame ``hashtable`` was built from, because the grid
            geometry is recomputed here from ``df``.
        index: label of the row whose neighbours are wanted.
        naVar: name of the column that neighbours must have a value for.
        hashtable: grid returned by ``construct_hashtable(df)``.
        k: maximum number of neighbours to return.

    Returns:
        ``(distances, neighbor_indices)``: two lists of equal length (at most
        ``k``), ordered from nearest to farthest, with distances in kilometres
        and neighbours given as index labels of ``df``. Both lists are empty
        when no usable row exists.
    """
    # Recompute the grid geometry with the same formulas construct_hashtable uses
    # so that the row lands in the same cell it was stored in.
    data_length = math.sqrt(df.shape[0])
    interval_length = 1 / data_length
    # Reduce each column directly to a scalar; positional [0] lookup on the
    # label-indexed result of df[[col]].min() is deprecated in pandas.
    lat_minimum = df["LATITUDE"].min() - 1
    lat_maximum = df["LATITUDE"].max() + 1
    long_minimum = df["LONGITUDE"].min() - 1
    long_maximum = df["LONGITUDE"].max() + 1

    # Only the coordinates of the target row are read. The former unused
    # row['SEASON'] lookup raised KeyError on frames without a SEASON column.
    point_na = (df.loc[index, 'LATITUDE'], df.loc[index, 'LONGITUDE'])
    lat = math.floor(transform(lat_minimum, lat_maximum, point_na[0]) / interval_length)
    long = math.floor(transform(long_minimum, long_maximum, point_na[1]) / interval_length)

    # Own cell first, then the surrounding cells. The visiting order matters only
    # for breaking ties between equidistant candidates.
    cell_offsets = ((0, 0), (-1, 0), (-1, -1), (-1, 1), (1, 0), (1, -1), (1, 1), (0, -1), (0, 1))
    grid_size = len(hashtable)

    candidates = []
    for lat_step, long_step in cell_offsets:
        cell_lat = lat + lat_step
        cell_long = long + long_step
        # Skip cells that fall off the edge of the (square) grid.
        if not (0 <= cell_lat < grid_size and 0 <= cell_long < grid_size):
            continue
        for inx, latitude, longitude in hashtable[cell_lat][cell_long]:
            # Check for a missing value first; it is far cheaper than a distance computation.
            if pd.isna(df.at[inx, naVar]):
                continue
            distance_km = dist(point_na, (latitude, longitude))
            # Distance 0 would make the inverse-distance weight infinite.
            if distance_km != 0:
                candidates.append((distance_km, inx))

    # Ranking (distance, index) pairs keeps the two lists aligned and returns
    # each neighbour once, even when several candidates are equidistant.
    distances, neighbor_indices = _closest_candidates(candidates, k)
    if len(neighbor_indices) >= 2:
        if len(neighbor_indices) < k:
            print("INTERPOLATING WITH " + str(len(neighbor_indices)) + " POINTS INSTEAD OF " + str(k) + " POINTS")
        return (distances, neighbor_indices)

    # Too few neighbours nearby: fall back to scanning every row of df.
    candidates = []
    for inx, latitude, longitude, value in zip(df.index, df['LATITUDE'], df['LONGITUDE'], df[naVar]):
        if pd.isna(value):
            continue
        distance_km = dist(point_na, (latitude, longitude))
        if distance_km != 0:
            candidates.append((distance_km, inx))

    distances, neighbor_indices = _closest_candidates(candidates, k)
    if len(neighbor_indices) < k and len(neighbor_indices) != 0:
        print("INTERPOLATING WITH " + str(len(neighbor_indices)) + " POINTS INSTEAD OF " + str(k) + " POINTS")
    return (distances, neighbor_indices)
    

def interpolate(df, distances, neighbors, naVar):
    """Return the inverse-distance-weighted average of ``naVar`` over ``neighbors``.

    Each neighbour's weight is ``(1 / distance)`` divided by the sum of all
    ``1 / distance`` terms, so the weights add up to 1.

    Parameters:
        df: DataFrame holding the ``naVar`` column.
        distances: non-zero distances to the neighbours.
        neighbors: index labels of ``df``, parallel to ``distances``.
        naVar: name of the column to average.

    Returns:
        The weighted average, or NaN when there are no neighbours.
    """
    if len(distances) == 0:
        # With no neighbours the weighted sum is empty; returning its value (0)
        # would record a fabricated measurement, so report "still missing".
        return float("nan")

    inverse_distances = [1 / x for x in distances]
    denominator = sum(inverse_distances)
    result = 0
    for inverse_distance, neighbor in zip(inverse_distances, neighbors):
        result += (inverse_distance / denominator) * df.loc[neighbor, naVar]
    return result
print("Functions have been loaded")

Functions have been loaded


# Creates a season data column
Uses the recorded date to determine the season. This allows predictions to be made within the same season (across several years).

In [12]:
# Month number -> season name, built from one (season, months) entry per season.
seasons = {
    month: season_name
    for season_name, months in (
        ('SPRING', (3, 4, 5)),
        ('SUMMER', (6, 7, 8)),
        ('FALL', (9, 10, 11)),
        ('WINTER', (12, 1, 2)),
    )
    for month in months
}

# Work on a copy of dataFrame: add the sampling month, seed SEASON with the
# month number, then swap each month number for its season name.
newData = (
    dataFrame
    .assign(MONTH=pd.DatetimeIndex(dataFrame["DATE"]).month)
    .assign(SEASON=lambda frame: frame["MONTH"])
    .replace({"SEASON": seasons})
)
newData.head()

Unnamed: 0,SHEETBAR,TN,TP,TEMP,DO,TURB,COND,VEL,SS,WDP,CHLcal,SECCHI,LONGITUDE,LATITUDE,DATE,FLDNUM,LOCATCD,STRATUM,MONTH,SEASON
0,44002201,,,2.4,14.2,15.0,455.0,,-0.2,9.5,,53.0,-90.683117,39.004773,11/29/1995,4,M241.4K,,11,FALL
1,44002202,,,2.2,14.1,11.0,459.0,0.0,-2.0,0.94,,64.0,-90.673138,38.940668,11/29/1995,4,M237.2G,,11,FALL
2,44002759,1.323,,28.6,3.8,66.0,306.0,,76.9,3.2,,,-90.549129,38.852878,06/24/1996,4,DC01.0M,,6,SUMMER
3,44002760,2.562,,30.6,6.3,93.0,270.0,,54.6,,,,-90.748878,38.924233,06/24/1996,4,CU11.6M,,6,SUMMER
4,44002761,2.625,,30.1,10.0,29.0,337.0,,35.6,,,,-90.657982,38.883755,06/24/1996,4,PE01.8M,,6,SUMMER


# Prediction Group by Season and several years
Edit the years range start based upon the particular pool.



In [None]:
# Sliding three-year windows; only the middle year of each window is interpolated
# and kept, the outer years just supply extra neighbours.
years = [[x-1, x, x+1] for x in range(1995, 2021)]
newData["YEAR"] = pd.DatetimeIndex(dataFrame["DATE"]).year
seasons = ['SPRING','SUMMER','FALL','WINTER']

# Collect the per-season pieces and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all accumulated rows on every iteration.
predictedFrames = []
for setOfYears in years:
    print("Set of years: " + str(setOfYears))
    print("Year to interpolate missing data: " + str(setOfYears[1]))
    threeYearFrame = newData[newData['YEAR'].isin(setOfYears)]
    for season in seasons:
        print(season)
        # .copy() gives predict_years an independent frame to add its
        # PREDICTED_* columns to, rather than a slice of newData.
        seasonalFrame = threeYearFrame[threeYearFrame['SEASON'] == season].copy()
        if (seasonalFrame.shape[0] > 1):
            seasonalHash = construct_hashtable(seasonalFrame)
            predict_years(seasonalFrame, seasonalHash, "TN", setOfYears[1], 2)
            predict_years(seasonalFrame, seasonalHash, "TP", setOfYears[1], 2)
            yearToAdd = seasonalFrame[seasonalFrame['YEAR'] == setOfYears[1]]
            # Empty pieces contribute no rows, so leave them out of the concat.
            if not yearToAdd.empty:
                predictedFrames.append(yearToAdd)

    print("Predicted for " + str(setOfYears[1]))

# pd.concat raises on an empty list, so fall back to an empty frame.
result = pd.concat(predictedFrames, ignore_index=True) if predictedFrames else pd.DataFrame()
result.to_csv(r"C:\Users\forre\Desktop\REU\TDA\Data\predicted_tn_tp_OverlappingYearsAndSeasons.csv")
print("Done")

Set of years: [1994, 1995, 1996]
Year to interpolate missing data: 1995
SPRING
SUMMER
For TN we will interpolate 0 points.
TN interpolation success
For TP we will interpolate 0 points.
TP interpolation success
FALL
For TN we will interpolate 2 points.
TN interpolation success
For TP we will interpolate 2 points.
TP interpolation success
WINTER
For TN we will interpolate 0 points.
TN interpolation success
For TP we will interpolate 0 points.
TP interpolation success
Predicted for 1995
Set of years: [1995, 1996, 1997]
Year to interpolate missing data: 1996
SPRING
For TN we will interpolate 0 points.
TN interpolation success
For TP we will interpolate 0 points.
TP interpolation success
SUMMER
For TN we will interpolate 0 points.
TN interpolation success
For TP we will interpolate 35 points.
TP interpolation success
FALL
For TN we will interpolate 0 points.
TN interpolation success
For TP we will interpolate 145 points.
TP interpolation success
WINTER
For TN we will interpolate 0 points.
T

# Prediction group year by year


Here, we filter for the pool we are curious about, then run the interpolation code for the missing values. To interpolate additional variables, add another call to the `predict` function for each new variable; the final argument is the number of nearest neighbors that you want. When it is done, the data is displayed. To save the output, call `to_csv` on the resulting data frame (for example `result.to_csv(path)`), with the path set to where you want the data frame to be saved.

If you want to predict for one specific year using only data from that year, filter your data frame by the desired year, then pass that year's data to the predict function as follows.

In [8]:
# Interpolate each year independently, using only that year's rows as neighbours.
newData = dataFrame.copy()
newData["YEAR"] = pd.DatetimeIndex(dataFrame["DATE"]).year
years = newData["YEAR"].unique()

# Collect one frame per year and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies all accumulated rows on every iteration.
yearlyFrames = []
for year in years:
    currentSet = newData[(newData["YEAR"] == year)]
    # Renumber from zero so each year's frame has its own 0..n-1 index.
    currentSet = currentSet.reset_index(drop = True)
    hashTable = construct_hashtable(currentSet)
    predict(currentSet, hashTable, "TN",2)
    predict(currentSet, hashTable, "TP",2)
    print("Predicted for " + str(year))
    yearlyFrames.append(currentSet)

# ignore_index=True renumbers the combined rows from zero; pd.concat raises on
# an empty list, so fall back to an empty frame.
result = pd.concat(yearlyFrames, ignore_index=True) if yearlyFrames else pd.DataFrame()
result.to_csv(r"C:\Users\forre\Desktop\REU\TDA\Data\predicted_tn_tp_years.csv")
print("Done")

For TN we will interpolate 2 points.
TN interpolation success
For TP we will interpolate 2 points.
TP interpolation success
Predicted for 1995
For TN we will interpolate 0 points.
TN interpolation success
For TP we will interpolate 180 points.
TP interpolation success
Predicted for 1996
For TN we will interpolate 200 points.
TN interpolation success
For TP we will interpolate 200 points.
TP interpolation success
Predicted for 1997
Done
