# Inverse Distance Weighted (IDW) Interpolation
## Using 2 neighbors within the same year and season
### Written and compiled by Casey McKean

### Imports

In [5]:
# We use pandas to work with our datasets
import pandas as pd
# We use geopy to calculate the distance between latitude and longitude coordinates within the dataset
from geopy import distance
import sklearn.metrics
import time

### Functions and Classes

In [6]:
class Sample:
    """This class stores various information about a sample.
    
    The attributes of this class are:
    - latitude
    - longitude
    - hasv: a boolean indicating if this sample has a recorded value of the desired variable we are interpolating
    - value: the recorded value of the variable we are interpolating
    - SHEET: the unique SHEETBAR code of this sample
    """
    
    def __init__(self,latitude,longitude,hasv,value,SHEET):
        self.SHEET = SHEET
        self.latitude = latitude
        self.longitude = longitude
        self.hasv = hasv
        self.value = value
        
    def __str__(self):
        return str(self.SHEET)
    
def getdist(S1,S2):
    """Calculates the distance betwen 2 samples in kilometers
    
    Parameters
    ----------
    S1: Sample object
    S2: Sample object
    
    RETURN: float representing distance in kilometers
    """
    coords_1 = (S1.latitude, S1.longitude)
    coords_2 = (S2.latitude, S2.longitude)
    dist = distance.distance(coords_1, coords_2).km
    return dist

def DistanceMatrix(dataframe,variable):
    """Returns a pandas dataframe representing a distance matrix in kilometers between all samples
    
    The indices and columns of the distance matrix are sample objects. This is done to make it easier
    to find the nearest samples that also have recorded values for the variable we are interpolating.
    
    Parameters
    ----------
    dataframe: the pandas dataframe of the dataset we are interpolating
        PRE: all samples within this datframe have a unique SHEETBAR code
        PRE: dataframe has 'SHEETBAR' 'LATITUDE' 'LONGITUDE' column names
    variable: the variable we are interpolating as a string
    
    RETURN: pandas dataframe
    """
    
    # Checking that all samples are unique within this dataframe
    numunique = len(dataframe["SHEETBAR"].unique())
    numlocations = dataframe.shape[0]
    try:
        assert(numunique == numlocations), f"{numunique} unique samples but {numlocations} number of samples"
    
    except AssertionError as msg:
        print(dataframe[dataframe["SHEETBAR"].duplicated(keep=False)])
        print(msg)
        
    # the list of sample objects
    samples = []
    for index,row in dataframe.iterrows():
        # make a sample object on this row
        if pd.isnull(row[variable]):
            hasv = False
        else:
            hasv = True
        # Make a sample for each row in the dataframe
        samples.append(Sample(row["LATITUDE"],row["LONGITUDE"],hasv,row[variable],row["SHEETBAR"]))
    
    # Initialize the distance matrix to all 0's, with our samples as indices and columns
    matrix = pd.DataFrame(0,index=samples,columns=samples)
    
    # Fill the distance matrix
    for ci,column in enumerate(samples):
        for ri,row in enumerate(samples):
            # If row index is larger than the column index
            if ri>ci:
                # compute distance between column and row
                dist = getdist(row,column)
            # If column index is larger than row index
            elif ci>ri:
                # We already computed this distance, use symmetry
                dist = matrix.iloc[ci,ri]
            else:
                # The distance between the same sample is 0
                continue
            # Assign the distance to the current row and column
            matrix.iloc[ri,ci] = dist
    return matrix

def changeVar(DM,dataframe,variable):
    """This changes the variable information of a distance matrix
    
    This method saves computation time so new distance matrices aren't needed
    when switching from interoplating one variable to interpolating another variable.
    
    Parameters
    ----------
    DM: pandas datframe of distance matrix
    dataframe: the pandas dataframe we are interpolating
    variable: the variable we are updating the distance matrix with
    
    RETURN: nothing
    """
    samples = DM.index
    # Loop through each sample
    for i,sample in enumerate(samples):
        SHEET = sample.SHEET
        row = dataframe.loc[dataframe["SHEETBAR"]==SHEET]
        
        # Asserting there is only 1 row with this sheetbar
        try:
            assert(row.shape[0]==1), "Multiple rows with same SHEETBAR"
        except AssertionError as msg:
            print(dataframe[dataframe["SHEETBAR"].duplicated(keep=False)])
            print(msg)
            
        # Pull value of desired variable
        val = row.loc[row.index[0],variable]
        if pd.isnull(val):
            samples[i].hasv = False
            samples[i].value = None
        else:
            samples[i].hasv = True
            samples[i].value = val
            
    DM.index = samples
    DM.columns = samples
    
def getclosest(numclosest,DM,sample):
    """Returns a given number of closest samples with non-null values to a sample
    
    The samples returned are the closest samples, and they have recorded values
    for the current variable we are interpolating.
    
    Parameters
    ----------
    numclosest: an integer telling how many samples to return
    DM: a pandas dataframe of a distance matrix to get the samples from
    sample: the sample object we want to find neighbors for
    
    RETURN: pandas series
    """
    # Pull out the column of of the distance matrix for this sample
    column = DM.loc[:,sample].copy()
    
    # The list of samples that don't have the desired variable
    doesnthavev = []
    for i in range(len(column)):
        if not column.index[i].hasv:
            doesnthavev.append(column.index[i])
            
    # Get rid of locations that dont have the desired variable
    column.drop(doesnthavev,inplace = True)
    # Get rid of the location we are predicting for if it exists
    column.drop(sample,inplace = True,errors="ignore")
    # Sort the column based on distances
    column.sort_values(inplace = True)
    
    # Return the first numclosest samples
    return column.iloc[0:numclosest]

def makeDict(DM,numclosest,testing):
    """
    Returns a dictonary with keys as sheetbar codes that need predicting
    and values as a list of tuples containing prediction information. The structure
    of the tuples are (SHEETBAR,distance,value)
    
    Parameters
    ----------
    DM: a pandas dataframe as a distance matrix
    numclosest: an integer telling the number of locations used to predict
    testing: a boolean indicating whether we should predict for samples that already have recorded values
    
    RETURN: python dictionary
    """
    # Initialize the dictionary
    closestDict = {}
    # Loop through each sample in the distance matrix
    for sample in DM.columns:
        if not sample.hasv or testing:
            # Get the closest samples to sample THAT ISN'T SAMPLE
            closest = getclosest(numclosest,DM,sample)
            # The list of tuples that contain sample sheetbar, the distance, and the value for variable
            tuples = []
            for i,dist in enumerate(closest):
                SHEET = closest.index[i].SHEET
                val = closest.index[i].value
                tuples.append((SHEET,dist,val))
            closestDict[sample.SHEET] = tuples
    return closestDict

def predict(tuples,numclosest = 2):
    """Returns the prediction for a sample
    
    Currently only implemented for using 2 samples to make a prediction
    
    Parameters
    ----------
    tuples: the list (SHEETBAR,distance,value) tuples needed for prediciton
    numclosest: an integer telling how many samples are used to predict
    
    RETURN: float representing the predicted value
    """
    loc2 = tuples[0]
    loc3 = tuples[1]
    d12 = loc2[1]
    val2 = loc2[2]
    d13 = loc3[1]
    val3 = loc3[2]
    
    if d12 == d13:
        return 0.5*val2+0.5*val3
    elif d12 == 0:
        return val2
    elif d13 == 0:
        return val3
    
    else:
        c2 = d12/(d12+d13)
        c3 = d13/(d12+d13)
        predicted = c2*val2+c3*val3
    
        return predicted
    

def IDW_interpolate(data,missing_vars,numlocations = 2,testing = False,verbosity = 0):
    '''This method does all the heavy lifting for appending interpolated columns to the dataframe
    
    Parameters
    ----------
    data: the pandas dataframe that is ready to interpolate missing values
        PRE: MUST HAVE "LATITUDE", "LONGITUDE","YEAR", "SEASON", "SHEETBAR", "FLDNUM" columns
    missing_vars: the list of column names (as strings) of the dataframe that we should interpolate
    numlocations: the number of locations used to predict the new value, default = 2 (currently the only option implemented)
    testing: boolean telling whether to predict for values that are already recorded, defalut = False
    verbosity: an integer telling how much output to be printed during interpolation process, default = 0
    
    
    RETURN - a pandas dataframe with extra columns saying the predicted values of the missing_vars
    '''
    
    print("Building a new dataframe with predicted values")
    start_time = time.time()

    years = data["YEAR"].unique()
    seasons = data["SEASON"].unique()
    pools = data["FLDNUM"].unique()
    
    # Initialize new dataframe which will contain interpolated columns
    data_prediction = pd.DataFrame()
    
    # Predictions are made within pools, within years, and within seasons
    for pool in pools:
        for year in years:
            for season in seasons:
                if verbosity > 0:
                    print(f"Appending predicted data for {year}  {season}  FLDNUM {pool}")
                # curset is the current set of rows we are predicting for
                curset = data[(data["YEAR"]==year) & (data["SEASON"]==season) & (data["FLDNUM"]==pool)].copy()
                
                if verbosity > 1:
                    print("Size of this year and season:", curset.shape)
                
                # Boolean to indicate if variable in Distance matrix needs updating
                first = True
                for var in missing_vars:
                    newcolumn = "PREDICTED_"+var
                    curset[newcolumn] = 0
                
                    #check to see if there are enough valid locations
                    # that can be used to predict
                    if not testing:
                        bad = bool((curset[var].notnull().sum()<numlocations))
                    else:
                        bad = bool((curset[var].notnull().sum()<numlocations+1))
                    
                    
                    if(bad):
                        if verbosity > 2:
                            print("Less than "+str(numlocations)+" locations have "+var+" in this set, dropping rows without "+var)
                        curset = curset[curset[var].notnull()]
                        curset[newcolumn] = curset[var]
                        if verbosity > 2:
                            print("Current set is now ",curset.shape)
                    else:
                        if first:
                            if verbosity > 2:
                                print("Creating DM with ",var)
                            DM = DistanceMatrix(curset,var)
                            first = False
                        else:
                            if verbosity > 2:
                                print("Changing to ",var)
                            changeVar(DM,curset,var)
                            
                        # Returns a dictionary mapping each location code to a tuple with prediction information
                        Dict = makeDict(DM,numlocations,testing)
                        
                        # Put predictions into curset
                        for index,row in curset.iterrows():
                            if pd.isnull(row[var]) or testing:
                                try:
                                    prediction = predict(Dict[row["SHEETBAR"]])
                                    curset.loc[index,newcolumn] = prediction
                                except ZeroDivisionError:
                                    print("Couldn't predict for ", str(row["SHEETBAR"]))
                                    print(Dict[row["SHEETBAR"]])
                                    curset.loc[index,newcolumn] = None
                            else:
                                curset.loc[index,newcolumn] = row[var]
                                
                # Append the predicted subset to the new dataframe
                data_prediction = data_prediction.append(curset,ignore_index=True)  
    
    print("Final data set size is ",data_prediction.shape)
    print(f"Interpolating took {(time.time()-start_time)/60} minutes")
    return data_prediction

print("Functions Loaded")

Functions Loaded


### Load and Prepare Data

The data should already have been cleaned and appended latitude and longitude columns to it.

In [7]:
filepath = r"..\LTRM data\water_data_qfneg.csv"
water_data = pd.read_csv(filepath, low_memory = False)
water_data.head()

Unnamed: 0,SHEETBAR,DATE,LATITUDE,LONGITUDE,FLDNUM,STRATUM,LOCATCD,TN,TP,TEMP,DO,TURB,COND,VEL,SS,WDP,CHLcal,SECCHI
0,41000065,07/26/1993,44.571864,-92.51097,1,1,9312103,,,23.0,6.6,28.0,550.0,,42.3,2.2,9.44875,40.0
1,41000066,07/26/1993,44.575497,-92.518497,1,1,9312002,4.876,0.229,23.0,6.6,28.0,554.0,,37.6,8.2,8.2423,42.0
2,41000067,07/26/1993,44.573718,-92.523549,1,1,9312102,,,22.9,6.3,24.0,564.0,,34.1,4.3,8.72488,43.0
3,41000068,07/26/1993,44.566588,-92.541238,1,1,9312003,4.257,0.212,22.9,6.4,28.0,563.0,,33.4,9.1,8.48359,38.0
4,41000069,07/26/1993,44.568419,-92.54878,1,1,9312104,,,23.0,6.6,33.0,556.0,,48.0,6.7,9.52918,45.0


We need to create year and season columns on the dataframe for this interpolation method.

In [8]:
water_data["MONTH"] = pd.DatetimeIndex(water_data["DATE"]).month
water_data["YEAR"] = pd.DatetimeIndex(water_data["DATE"]).year
water_data["SEASON"] = water_data["MONTH"]

# Dictionary mapping month to season
seasons = {3 : 'SPRING',
           4 : 'SPRING',
           5 : 'SPRING',
           6 : 'SUMMER',
           7 : 'SUMMER',
           8 : 'SUMMER',
           9 : 'FALL',
           10 : 'FALL',
           11: 'FALL',
           12: 'WINTER',
           1: 'WINTER',
           2: 'WINTER'}

water_data = water_data.replace({"SEASON" : seasons})
water_data.head()

Unnamed: 0,SHEETBAR,DATE,LATITUDE,LONGITUDE,FLDNUM,STRATUM,LOCATCD,TN,TP,TEMP,...,TURB,COND,VEL,SS,WDP,CHLcal,SECCHI,MONTH,YEAR,SEASON
0,41000065,07/26/1993,44.571864,-92.51097,1,1,9312103,,,23.0,...,28.0,550.0,,42.3,2.2,9.44875,40.0,7,1993,SUMMER
1,41000066,07/26/1993,44.575497,-92.518497,1,1,9312002,4.876,0.229,23.0,...,28.0,554.0,,37.6,8.2,8.2423,42.0,7,1993,SUMMER
2,41000067,07/26/1993,44.573718,-92.523549,1,1,9312102,,,22.9,...,24.0,564.0,,34.1,4.3,8.72488,43.0,7,1993,SUMMER
3,41000068,07/26/1993,44.566588,-92.541238,1,1,9312003,4.257,0.212,22.9,...,28.0,563.0,,33.4,9.1,8.48359,38.0,7,1993,SUMMER
4,41000069,07/26/1993,44.568419,-92.54878,1,1,9312104,,,23.0,...,33.0,556.0,,48.0,6.7,9.52918,45.0,7,1993,SUMMER


There were extreme outliers in TN that caused high errors in model performance, and it was decided to remove these values. Killian's IDW using 3 years did not do this, and that is why his TN MAE and RMSE are much higher than this IDW. Random Forrest, Multivariate Polynomial Regression did the same. I believe but I am not sure that Regression Trees did this as well.

In [9]:
water_data["TN"].sort_values(ascending=False)

46795    187.429
46545    165.177
46727    149.712
29136     46.989
55650     32.965
          ...   
82475        NaN
82477        NaN
82478        NaN
82479        NaN
82480        NaN
Name: TN, Length: 82481, dtype: float64

In [10]:
water_data.loc[46795,"TN"] = None
water_data.loc[46545,"TN"] = None
water_data.loc[46727,"TN"] = None
water_data["TN"].sort_values(ascending=False)

29136    46.989
55650    32.965
19178    22.939
55611    22.677
59504    22.576
          ...  
82475       NaN
82477       NaN
82478       NaN
82479       NaN
82480       NaN
Name: TN, Length: 82481, dtype: float64

### Testing Method on TP, TN, and VEL

In [None]:
variables = ["TN","TP","VEL"]
print("\n\nTesting by year, by season IDW interpolation")
for var in variables:
    print("\n-----------------------------------")
    print("Testing ",var)
    # Filter by locations that we already have for this variable
    water_test = water_data[water_data[var].notna()]
    
    # Increase verbosity for more output, make verbosity 0 to report only the errors
    water_test_interpolated = IDW_interpolate(water_test,[var],testing=True,verbosity=0)
    
    # Save the interpolated dataset
    path = "Interpolation_analysis_datasets\\"+var
    #pickle.dump(water_test_interpolated,open(path+"_interpolated.p","wb"))
    
    
    # Get name of predicted column
    newcol = "PREDICTED_"+var
    
    MAE = sklearn.metrics.mean_absolute_error(water_test_interpolated[var],water_test_interpolated[newcol])
    RMSE = sklearn.metrics.mean_squared_error(water_test_interpolated[var],water_test_interpolated[newcol],squared=False)
    print(f"The MAE for {var} is {MAE:8f}")
    print(f"The RMSE for {var} is {RMSE:8f}")
    
    # Make error column names
    error_col = var+" error"
    squared_error_col = var+" squared error"
    water_test_interpolated[error_col] = round(abs(water_test_interpolated[var] - water_test_interpolated[newcol]),6)
    water_test_interpolated[squared_error_col] = round((water_test_interpolated[var] - water_test_interpolated[newcol])**2,6)
    print(water_test_interpolated[error_col].describe())
    print(water_test_interpolated[squared_error_col].describe())



Testing by year, by season IDW interpolation

-----------------------------------
Testing  TN
Building a new dataframe with predicted values
Final data set size is  (32185, 22)
Interpolating took 8.268683723608653 minutes
The MAE for TN is 0.404327
The RMSE for TN is 0.862958
count    32185.000000
mean         0.404327
std          0.762388
min          0.000000
25%          0.072140
50%          0.183478
75%          0.457552
max         41.981454
Name: TN error, dtype: float64
count    32185.000000
mean         0.744697
std         12.720244
min          0.000000
25%          0.005204
50%          0.033664
75%          0.209354
max       1762.442480
Name: TN squared error, dtype: float64

-----------------------------------
Testing  TP
Building a new dataframe with predicted values
Final data set size is  (31450, 22)
Interpolating took 8.833173775672913 minutes
The MAE for TP is 0.046904
The RMSE for TP is 0.146266
count    31450.000000
mean         0.046904
std          0.138544
m