## This notebook makes a dummy storage curve as pretend validation data and calculates an r squared value compared to the depth to surface area made from the wofs/depth gauge method

In [1]:
import os
import geopandas as gpd
import pandas as pd
import numpy as np
import six
import matplotlib
import matplotlib.pyplot as plt

  shapely_geos_version, geos_capi_version_string


## Loop over all files and make fake validation data

In [None]:
directory = 'Storage_curves'

#make a list of the file names so we can call them with pandas
file_list = []

for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_list.append(os.path.join(directory, filename))

#Read the gauge files twice, once to get ID and second to get the data. Append them together in a dictionary
#May as well make a list of IDs here because we will use it later       
validation_df_list = []
#let's use tqdm again to make a progress bar. The bar is so cool I love this module
for i in file_list:
    df = pd.read_csv(i)
    # Get gauge ID
    ID = df.loc[0, 'ID']
    type_test = isinstance(ID, str)
    if type_test is False:
        ID = ID.astype(str)

    # Get first and last values
    length = len(df)
    last_n = length -1
    last = df.iloc[last_n]
    first = df.iloc[0]

    last_v = last['Surface Area']
    first_v = first['Surface Area']

    # Change first and last values a little bit
    first_v = first_v + (first_v * 0.05)
    last_v = last_v - (last_v * 0.05)

    # Change all values to 0 
    df['Surface Area'] = 0

    # Except for the first and last value
    df.loc[0, 'Surface Area'] = first_v
    df.loc[last_n, 'Surface Area'] = last_v

    # Interpolate between the changed first and last values
    df.drop(df[df['Surface Area'] == 0].index, inplace=True) #drop zero values
    df2 = df.set_index("Depth")
    min_ = df2.index.min()
    max_ = df2.index.max()
    df3 = df2.reindex(range(min_, max_+1))
    df3['Surface Area'] = df3['Surface Area'].interpolate()
    df3['ID'] = ID
    #df3.to_csv('../dea-notebooks/Supplementary_data/Reservoir_validation_data/'+ID+'_storage_curve.csv' )
    validation_df_list.append(df3)

## Use just one storage curve and make fake validation data

In [19]:
remotely_sensed = pd.read_csv('Storage_curves/419041.csv')

In [20]:
# Get gauge ID
ID = remotely_sensed.loc[0, 'ID']
type_test = isinstance(ID, str)
if type_test is False:
    ID = ID.astype(str)

In [17]:
# Get first and last values
length = len(df)
last_n = length -1
last = remotely_sensed.iloc[last_n]
first = remotely_sensed.iloc[0]

last_v = last['Surface Area']
first_v = first['Surface Area']

# Change first and last values a little bit
first_v = first_v + (first_v * 0.05)
last_v = last_v - (last_v * 0.05)

In [21]:
df = remotely_sensed.copy()


In [22]:
# Change all values to 0 
df['Surface Area'] = 0
# Except for the first and last value
df.loc[0, 'Surface Area'] = first_v
df.loc[last_n, 'Surface Area'] = last_v

# Interpolate between the changed first and last values
df.drop(df[df['Surface Area'] == 0].index, inplace=True) #drop zero values
df2 = df.set_index("Depth")
min_ = df2.index.min()
max_ = df2.index.max()
df3 = df2.reindex(range(min_, max_+1))
df3['Surface Area'] = df3['Surface Area'].interpolate()
df3['ID'] = ID

TypeError: 'float' object cannot be interpreted as an integer

In [12]:
fake = df3
original

Unnamed: 0,ID,Depth,Surface Area
0,419041,306,1861125.0
22,419041,328,39026000.0


In [11]:
#Calculate r squared
real_list = original['Surface Area'].to_list()
fake_list = fake['Surface Area'].to_list()

corr_matrix = np.corrcoef(real_list, fake_list)
corr = corr_matrix[0,1]
R2 = corr**2
R2_str = R2.astype(str)
title = 'The R squared value is ' + R2_str


# plot
original['Depth'] = original.index
fake['Depth'] = fake.index

ax = fake.plot.scatter(x='Depth',
                    y='Surface Area',
                    c='Red')
original.plot.scatter(x='Depth',
                    y='Surface Area',
                    c='DarkBlue',
                    ax=ax, title = title)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 2 and the array at index 1 has size 23

## Calculate Chi squared (mean difference)
We need to measure the difference between each point in the remotely sensed data and the 'validation' data, and then report the average difference.

In [None]:
contingency = pd.DataFrame(index=original.index)
contingency['Validation_Surface_Area'] = fake['Surface Area']
contingency['Remotley_Sensed_Surface_Area'] = original['Surface Area']
contingency['difference'] = contingency['Remotley_Sensed_Surface_Area'] - contingency['Validation_Surface_Area']
average_difference = sum(contingency['difference']/len(contingency.index))

print('The average difference between the remotley sensed data and the validation data is:')
print(average_difference)