In [1]:
import sys
import pandas as pd
from geopy import distance
import math
import sklearn.metrics
import statistics
pd.set_option('display.max_columns', None)
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned samples and keep only the rows whose FLDNUM is 4.
filePath = r"C:\Users\killiad\Documents\Senior\REU\Data\cleaned_data.csv"
df = pd.read_csv(filePath, low_memory = False)
# .copy() makes the filtered rows an independent frame, so the column
# assignments below never write into a slice of the frame read from disk
# (the warning that would normally flag this is silenced in the import cell).
df = df[df['FLDNUM']==4].copy()

# Parse DATE once and derive both MONTH and YEAR from the same index.
sample_dates = pd.DatetimeIndex(df["DATE"])
df["MONTH"] = sample_dates.month
df["SEASON"] = df["MONTH"]
# Month number -> season label; applied to the SEASON column only.
seasons = {3 : 'SPRING',
           4 : 'SPRING',
           5 : 'SPRING',
           6 : 'SUMMER',
           7 : 'SUMMER',
           8 : 'SUMMER',
           9 : 'FALL',
           10 : 'FALL',
           11: 'FALL',
           12: 'WINTER',
           1: 'WINTER',
           2: 'WINTER'}
df = df.replace({"SEASON" : seasons})
df["YEAR"] = sample_dates.year

In [3]:
def predict_years(df, hashtable, naVar, year, k):
    """Interpolate the missing `naVar` values of a single year, in place.

    A column named ``"PREDICTED_" + naVar`` is (re)created on `df` as a copy
    of `naVar`; every row of `year` whose `naVar` is null then has that
    column overwritten with the value returned by `interpolate` for the
    neighbours found by `k_nearest_neighbors`.

    Parameters:
        df: frame with at least the `naVar` and "YEAR" columns; mutated.
        hashtable: spatial grid forwarded to `k_nearest_neighbors`.
        naVar: name of the column to fill.
        year: value of "YEAR" selecting the rows to fill.
        k: number of neighbours requested per point.

    Returns:
        None. Progress is reported with `print`.
    """
    predicted_col = "PREDICTED_" + naVar
    df[predicted_col] = df[naVar]

    # Rows of the requested year that have no observation for naVar.
    missing_rows = df[(df["YEAR"] == year) & df[naVar].isnull()]
    print("For {} we will interpolate {} points.".format(naVar, len(missing_rows)))

    for index in missing_rows.index:
        distances, neighbors = k_nearest_neighbors(df, index, naVar, hashtable, k)
        df.loc[index, predicted_col] = interpolate(df, distances, neighbors, naVar)

    print("{} interpolation success".format(naVar))

def predict(df, hashtable, naVar, k):
    """Interpolate every missing `naVar` value of `df`, in place.

    A column named ``"PREDICTED_" + naVar`` is (re)created on `df` as a copy
    of `naVar`; each row whose `naVar` is null then has that column
    overwritten with the value returned by `interpolate` for the neighbours
    found by `k_nearest_neighbors`.

    Parameters:
        df: frame containing the `naVar` column; mutated.
        hashtable: spatial grid forwarded to `k_nearest_neighbors`.
        naVar: name of the column to fill.
        k: number of neighbours requested per point.

    Returns:
        None. Progress is reported with `print`.
    """
    predicted_col = "PREDICTED_" + naVar
    df[predicted_col] = df[naVar]

    missing_rows = df[df[naVar].isnull()]
    print("For {} we will interpolate {} points.".format(naVar, len(missing_rows)))

    for index in missing_rows.index:
        distances, neighbors = k_nearest_neighbors(df, index, naVar, hashtable, k)
        df.loc[index, predicted_col] = interpolate(df, distances, neighbors, naVar)

    print("{} interpolation success".format(naVar))

def transform(minimum, maximum, x):
    """Linearly rescale `x` so that `minimum` maps to 0 and `maximum` to 1."""
    # Multiply by the reciprocal of the span instead of dividing by the span:
    # the two forms can differ in the last floating-point bit, and callers
    # floor the scaled result to pick a grid cell.
    reciprocal_span = 1 / (maximum - minimum)
    offset = x - minimum
    return reciprocal_span * offset

def dist(point1, point2):
    """Return the distance between two points in kilometres.

    The computation is delegated to geopy's ``distance.distance``; callers in
    this file pass each point as a (latitude, longitude) tuple.
    """
    separation = distance.distance(point1, point2)
    return separation.km
    
def construct_hashtable(df):
    """Bucket the rows of `df` into a square latitude/longitude grid.

    The grid has ``int(sqrt(len(df))) + 1`` cells per side. Coordinates are
    rescaled to the unit interval with `transform`, using the observed
    minimum and maximum padded by one degree on each side, and then floored
    to a cell number.

    Parameters:
        df: frame with "LATITUDE" and "LONGITUDE" columns.

    Returns:
        A nested list where ``hashtable[lat_cell][long_cell]`` is a list of
        ``(index, latitude, longitude)`` tuples, one per row of `df` that
        falls in that cell. An empty frame yields a single empty cell.
    """
    # An empty frame would otherwise divide by zero below.
    if df.shape[0] == 0:
        return [[[]]]

    #get hashtable information
    data_length = math.sqrt(df.shape[0])
    interval_length = 1 / data_length
    # Reduce the column itself to a scalar. Reducing a one-column frame and
    # then indexing the result with [0] relies on positional fallback for a
    # label-indexed Series, which pandas has deprecated.
    lat_minimum = df["LATITUDE"].min() - 1
    lat_maximum = df["LATITUDE"].max() + 1
    long_minimum = df["LONGITUDE"].min() - 1
    long_maximum = df["LONGITUDE"].max() + 1

    #construct hashtable
    cells_per_side = int(data_length) + 1
    hashtable = [[[] for _ in range(cells_per_side)] for _ in range(cells_per_side)]

    #populate hashtable
    for index, r_lat, r_long in zip(df.index, df["LATITUDE"], df["LONGITUDE"]):
        lat = math.floor(transform(lat_minimum, lat_maximum, r_lat) / interval_length)
        long = math.floor(transform(long_minimum, long_maximum, r_long) / interval_length)
        hashtable[lat][long].append((index, r_lat, r_long))

    return hashtable

def _collect_candidates(df, naVar, point_na, entries, candidates):
    """Append a (distance_km, index) pair to `candidates` for each usable entry.

    An entry is usable when its row has a non-NaN `naVar` and lies at a
    non-zero distance from `point_na`.
    """
    for inx, latitude, longitude in entries:
        # Check the value first so no distance is computed for unusable rows.
        if np.isnan(df.loc[inx, naVar]):
            continue
        distance_km = dist(point_na, (latitude, longitude))
        if distance_km != 0:
            candidates.append((distance_km, inx))


def _closest_k(candidates, k):
    """Return parallel (distances, indices) lists for the k nearest candidates."""
    # sorted() is stable, so equally distant candidates keep encounter order.
    nearest = sorted(candidates, key=lambda pair: pair[0])[:k]
    return ([pair[0] for pair in nearest], [pair[1] for pair in nearest])


def k_nearest_neighbors(df, index, naVar, hashtable, k):
    """Find up to `k` nearest rows that have a value for `naVar`.

    The search first looks only at the grid cell containing the query row
    and the eight cells around it. If that yields fewer than two neighbours,
    every row of `df` is scanned instead. Rows whose `naVar` is NaN, and rows
    at distance zero from the query point, are never returned.

    Parameters:
        df: frame with "LATITUDE", "LONGITUDE" and `naVar` columns.
        index: label of the query row in `df`.
        naVar: name of the column the neighbours must have a value for.
        hashtable: grid as returned by `construct_hashtable`. The cell of the
            query row is recomputed from `df`, so this presumably has to be
            the grid built from the same frame -- verify against callers.
        k: maximum number of neighbours to return.

    Returns:
        A tuple ``(distances, neighbor_indices)`` of two equally long lists,
        sorted by increasing distance in km; element i of each list describes
        the same neighbour.
    """
    # Grid geometry, computed the same way construct_hashtable computes it.
    data_length = math.sqrt(df.shape[0])
    interval_length = 1 / data_length
    lat_minimum = df["LATITUDE"].min() - 1
    lat_maximum = df["LATITUDE"].max() + 1
    long_minimum = df["LONGITUDE"].min() - 1
    long_maximum = df["LONGITUDE"].max() + 1

    row_na = df.loc[index]
    point_na = (row_na['LATITUDE'], row_na['LONGITUDE'])
    lat = math.floor(transform(lat_minimum, lat_maximum, point_na[0]) / interval_length)
    long = math.floor(transform(long_minimum, long_maximum, point_na[1]) / interval_length)

    # Own cell first, then the surrounding eight. The visiting order decides
    # which of several equally distant candidates is kept.
    cell_offsets = [(0, 0),
                    (-1, 0), (-1, -1), (-1, 1),
                    (1, 0), (1, -1), (1, 1),
                    (0, -1), (0, 1)]
    candidates = []
    for lat_step, long_step in cell_offsets:
        cell_lat = lat + lat_step
        cell_long = long + long_step
        # Skip cells that fall off the edge of the grid.
        if 0 <= cell_lat < len(hashtable) and 0 <= cell_long < len(hashtable[cell_lat]):
            _collect_candidates(df, naVar, point_na, hashtable[cell_lat][cell_long], candidates)

    # Keeping (distance, index) pairs together guarantees the two returned
    # lists stay aligned even when several candidates share a distance.
    distances, neighbor_indices = _closest_k(candidates, k)
    if len(neighbor_indices) < k and len(neighbor_indices) != 0:
        print("INTERPOLATING WITH " + str(len(neighbor_indices)) + " POINTS INSTEAD OF " + str(k) + " POINTS")
    if len(neighbor_indices) >= 2:
        return (distances, neighbor_indices)

    # Fewer than two neighbours nearby: fall back to scanning every row.
    candidates = []
    all_rows = zip(df.index, df["LATITUDE"], df["LONGITUDE"])
    _collect_candidates(df, naVar, point_na, all_rows, candidates)
    return _closest_k(candidates, k)
    

def interpolate(df, distances, neighbors, naVar):
    """Inverse-distance-weighted average of `naVar` over the given rows.

    Parameters:
        df: frame holding the `naVar` column.
        distances: distance to each neighbour; must be non-zero.
        neighbors: index labels in `df`, parallel to `distances`.
        naVar: name of the column to average.

    Returns:
        The sum of each neighbour's value times its normalised 1/distance
        weight, or 0 when `distances` is empty.
    """
    total_inverse = sum([1 / d for d in distances])
    estimate = 0
    for position, d in enumerate(distances):
        weight = (1 / d) / total_inverse
        estimate += weight * df.loc[neighbors[position], naVar]
    return estimate

In [6]:
# Re-predict every observed TP value from its 2 nearest neighbours, searching
# only rows of the same season inside a [year-1, year, year+1] window.
# Results go to df["PREDICTED_TP"], which starts out as a copy of TP.
years = [[centre - 1, centre, centre + 1] for centre in range(1995, 2021)]
seasons = ['SPRING','SUMMER','FALL','WINTER']
result_tp = pd.DataFrame()
df_tp = df[df['TP'].notnull()].copy()
df["PREDICTED_TP"] = df['TP']
for setOfYears in years:
    print("Year: {}".format(setOfYears[1]))
    threeYearFrame = df_tp[df_tp['YEAR'].isin(setOfYears)]
    for season in seasons:
        seasonalFrame = threeYearFrame[threeYearFrame['SEASON'] == season]
        # A neighbour search needs at least two rows in the window.
        if seasonalFrame.shape[0] <= 1:
            continue
        seasonalHash = construct_hashtable(seasonalFrame)
        # Only rows of the centre year are predicted; the adjacent years
        # merely supply additional neighbours.
        df_year = seasonalFrame[seasonalFrame["YEAR"] == setOfYears[1]]
        print("For {} we will interpolate {} points.".format("TP", len(df_year)))
        for index in df_year.index:
            distances, neighbors = k_nearest_neighbors(seasonalFrame, index, "TP", seasonalHash, 2)
            df.loc[index, "PREDICTED_TP"] = interpolate(df, distances, neighbors, "TP")
        print("{} interpolation success".format("TP"))

Year: 1995
For TP we will interpolate 0 points.
TP interpolation success
For TP we will interpolate 0 points.
TP interpolation success
Year: 1996
For TP we will interpolate 0 points.
TP interpolation success
For TP we will interpolate 66 points.
TP interpolation success
For TP we will interpolate 0 points.
TP interpolation success
For TP we will interpolate 10 points.
TP interpolation success
Year: 1997
For TP we will interpolate 171 points.
TP interpolation success
For TP we will interpolate 210 points.
TP interpolation success
For TP we will interpolate 190 points.
TP interpolation success
For TP we will interpolate 164 points.
TP interpolation success
Year: 1998
For TP we will interpolate 186 points.
TP interpolation success
For TP we will interpolate 210 points.
TP interpolation success
For TP we will interpolate 191 points.
TP interpolation success
For TP we will interpolate 183 points.
TP interpolation success
Year: 1999
For TP we will interpolate 197 points.
TP interpolation suc

In [7]:
# Re-predict every observed TN value from its 2 nearest neighbours, searching
# only rows of the same season inside a [year-1, year, year+1] window.
# Results go to df["PREDICTED_TN"], which starts out as a copy of TN.
years = [[centre - 1, centre, centre + 1] for centre in range(1995, 2021)]
seasons = ['SPRING','SUMMER','FALL','WINTER']
result_tn = pd.DataFrame()
df_tn = df[df['TN'].notnull()].copy()
df["PREDICTED_TN"] = df['TN']
for setOfYears in years:
    print("Year: {}".format(setOfYears[1]))
    threeYearFrame = df_tn[df_tn['YEAR'].isin(setOfYears)]
    for season in seasons:
        seasonalFrame = threeYearFrame[threeYearFrame['SEASON'] == season]
        # A neighbour search needs at least two rows in the window.
        if seasonalFrame.shape[0] <= 1:
            continue
        seasonalHash = construct_hashtable(seasonalFrame)
        # Only rows of the centre year are predicted; the adjacent years
        # merely supply additional neighbours.
        df_year = seasonalFrame[seasonalFrame["YEAR"] == setOfYears[1]]
        print("For {} we will interpolate {} points.".format("TN", len(df_year)))
        for index in df_year.index:
            distances, neighbors = k_nearest_neighbors(seasonalFrame, index, "TN", seasonalHash, 2)
            df.loc[index, "PREDICTED_TN"] = interpolate(df, distances, neighbors, "TN")
        print("{} interpolation success".format("TN"))

Year: 1995
For TN we will interpolate 0 points.
TN interpolation success
For TN we will interpolate 0 points.
TN interpolation success
For TN we will interpolate 0 points.
TN interpolation success
Year: 1996
For TN we will interpolate 0 points.
TN interpolation success
For TN we will interpolate 101 points.
TN interpolation success
For TN we will interpolate 145 points.
TN interpolation success
For TN we will interpolate 10 points.
TN interpolation success
Year: 1997
For TN we will interpolate 171 points.
TN interpolation success
For TN we will interpolate 210 points.
TN interpolation success
For TN we will interpolate 190 points.
TN interpolation success
For TN we will interpolate 164 points.
TN interpolation success
Year: 1998
For TN we will interpolate 186 points.
TN interpolation success
For TN we will interpolate 210 points.
TN interpolation success
For TN we will interpolate 191 points.
TN interpolation success
For TN we will interpolate 183 points.
TN interpolation success
Year:

In [25]:
# Summarise observed vs. predicted TP: descriptive statistics for both
# columns plus MSE / RMSE / MAE of the predictions (error rows are left
# empty in the "Predicted" column).
actual_tp = df[df['TP'].notnull()]
tp_mse = sklearn.metrics.mean_squared_error(actual_tp['TP'], actual_tp['PREDICTED_TP'])
tp_mae = sklearn.metrics.mean_absolute_error(actual_tp['TP'], actual_tp['PREDICTED_TP'])
tp_rmse = math.sqrt(tp_mse)

tp_stat_keys = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
tp_row_labels = ['Count', 'Mean', 'STD', 'Min', '25%', '50%', '75%', 'Max', 'MSE', 'RMSE', 'MAE']
tp_observed_summary = actual_tp['TP'].describe()
tp_predicted_summary = actual_tp['PREDICTED_TP'].describe()

results_tp = pd.DataFrame()
results_tp['Actual'] = [tp_observed_summary[key] for key in tp_stat_keys] + [tp_mse, tp_rmse, tp_mae]
results_tp['Predicted'] = [tp_predicted_summary[key] for key in tp_stat_keys] + [None, None, None]
# Replace the positional row numbers 0..10 with readable labels.
results_tp = results_tp.rename(index=dict(enumerate(tp_row_labels)))
display(results_tp)

Unnamed: 0,Actual,Predicted
Count,8540.0,8540.0
Mean,0.252823,0.245217
STD,0.211248,0.161933
Min,0.002,0.030878
25%,0.151,0.163711
50%,0.199,0.204975
75%,0.282,0.270516
Max,4.148,2.184316
MSE,0.049217,
RMSE,0.221849,


In [26]:
# Write the TP summary table to disk (absolute, machine-specific path).
tp_summary_csv = r"C:\Users\killiad\Documents\Senior\REU\Interpolation\tp_analysis_one_by_one.csv"
results_tp.to_csv(tp_summary_csv)

In [23]:
# Summarise observed vs. predicted TN: descriptive statistics for both
# columns plus MSE / RMSE / MAE of the predictions (error rows are left
# empty in the "Predicted" column).
actual_tn = df[df['TN'].notnull()]
tn_mse = sklearn.metrics.mean_squared_error(actual_tn['TN'], actual_tn['PREDICTED_TN'])
tn_mae = sklearn.metrics.mean_absolute_error(actual_tn['TN'], actual_tn['PREDICTED_TN'])
tn_rmse = math.sqrt(tn_mse)

tn_stat_keys = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
tn_row_labels = ['Count', 'Mean', 'STD', 'Min', '25%', '50%', '75%', 'Max', 'MSE', 'RMSE', 'MAE']
tn_observed_summary = actual_tn['TN'].describe()
tn_predicted_summary = actual_tn['PREDICTED_TN'].describe()

results_tn = pd.DataFrame()
results_tn['Actual'] = [tn_observed_summary[key] for key in tn_stat_keys] + [tn_mse, tn_rmse, tn_mae]
results_tn['Predicted'] = [tn_predicted_summary[key] for key in tn_stat_keys] + [None, None, None]
# Replace the positional row numbers 0..10 with readable labels.
results_tn = results_tn.rename(index=dict(enumerate(tn_row_labels)))
display(results_tn)

Unnamed: 0,Actual,Predicted
Count,8717.0,8717.0
Mean,3.121904,3.210036
STD,4.336037,2.589735
Min,0.056,0.578821
25%,1.792,2.0585
50%,2.807,3.034937
75%,4.049,4.15278
Max,245.992,116.156329
MSE,23.653644,
RMSE,4.863501,


In [24]:
# Write the TN summary table to disk (absolute, machine-specific path).
tn_summary_csv = r"C:\Users\killiad\Documents\Senior\REU\Interpolation\tn_analysis_one_by_one.csv"
results_tn.to_csv(tn_summary_csv)

In [13]:
# Number of rows that have a (non-null) PREDICTED_TP value.
df['PREDICTED_TP'].notna().sum()

8540

In [20]:
# Re-predict every observed VEL value from its 2 nearest neighbours, searching
# only rows of the same season inside a [year-1, year, year+1] window.
# Results go to df["PREDICTED_VEL"], which starts out as a copy of VEL.
years = [[centre - 1, centre, centre + 1] for centre in range(1995, 2021)]
seasons = ['SPRING','SUMMER','FALL','WINTER']
result_vel = pd.DataFrame()
df_vel = df[df['VEL'].notnull()].copy()
df["PREDICTED_VEL"] = df['VEL']
for setOfYears in years:
    print("Year: {}".format(setOfYears[1]))
    threeYearFrame = df_vel[df_vel['YEAR'].isin(setOfYears)]
    for season in seasons:
        seasonalFrame = threeYearFrame[threeYearFrame['SEASON'] == season]
        # A neighbour search needs at least two rows in the window.
        if seasonalFrame.shape[0] <= 1:
            continue
        seasonalHash = construct_hashtable(seasonalFrame)
        # Only rows of the centre year are predicted; the adjacent years
        # merely supply additional neighbours.
        df_year = seasonalFrame[seasonalFrame["YEAR"] == setOfYears[1]]
        print("For {} we will interpolate {} points.".format("VEL", len(df_year)))
        for index in df_year.index:
            distances, neighbors = k_nearest_neighbors(seasonalFrame, index, "VEL", seasonalHash, 2)
            df.loc[index, "PREDICTED_VEL"] = interpolate(df, distances, neighbors, "VEL")
        print("{} interpolation success".format("VEL"))

Year: 1995
For VEL we will interpolate 0 points.
VEL interpolation success
For VEL we will interpolate 1 points.
VEL interpolation success
For VEL we will interpolate 0 points.
VEL interpolation success
Year: 1996
For VEL we will interpolate 0 points.
VEL interpolation success
For VEL we will interpolate 70 points.
VEL interpolation success
For VEL we will interpolate 82 points.
VEL interpolation success
For VEL we will interpolate 4 points.
VEL interpolation success
Year: 1997
For VEL we will interpolate 133 points.
VEL interpolation success
For VEL we will interpolate 148 points.
VEL interpolation success
For VEL we will interpolate 144 points.
VEL interpolation success
For VEL we will interpolate 110 points.
VEL interpolation success
Year: 1998
For VEL we will interpolate 138 points.
VEL interpolation success
For VEL we will interpolate 149 points.
VEL interpolation success
For VEL we will interpolate 142 points.
VEL interpolation success
For VEL we will interpolate 107 points.
VEL 

In [27]:
# Summarise observed vs. predicted VEL: descriptive statistics for both
# columns plus MSE / RMSE / MAE of the predictions (error rows are left
# empty in the "Predicted" column).
actual_vel = df[df['VEL'].notnull()]
vel_mse = sklearn.metrics.mean_squared_error(actual_vel['VEL'], actual_vel['PREDICTED_VEL'])
vel_mae = sklearn.metrics.mean_absolute_error(actual_vel['VEL'], actual_vel['PREDICTED_VEL'])
vel_rmse = math.sqrt(vel_mse)

vel_stat_keys = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
vel_row_labels = ['Count', 'Mean', 'STD', 'Min', '25%', '50%', '75%', 'Max', 'MSE', 'RMSE', 'MAE']
vel_observed_summary = actual_vel['VEL'].describe()
vel_predicted_summary = actual_vel['PREDICTED_VEL'].describe()

results_vel = pd.DataFrame()
results_vel['Actual'] = [vel_observed_summary[key] for key in vel_stat_keys] + [vel_mse, vel_rmse, vel_mae]
results_vel['Predicted'] = [vel_predicted_summary[key] for key in vel_stat_keys] + [None, None, None]
# Replace the positional row numbers 0..10 with readable labels.
results_vel = results_vel.rename(index=dict(enumerate(vel_row_labels)))
display(results_vel)

Unnamed: 0,Actual,Predicted
Count,9472.0,9472.0
Mean,0.220858,0.220609
STD,0.334544,0.320456
Min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.36,0.386076
Max,1.7,1.506706
MSE,0.045001,
RMSE,0.212134,


In [28]:
# Write the VEL summary table to disk (absolute, machine-specific path).
vel_summary_csv = r"C:\Users\killiad\Documents\Senior\REU\Interpolation\vel_analysis_one_by_one.csv"
results_vel.to_csv(vel_summary_csv)