# Data manipulation for establishing meander wavelength to discharge relationship 

In [2]:
#import packages 
import data_methods as dm
import vis_methods as vm 
import os 
import pandas as pd

In [3]:
#file paths 
ra_path = r'D:\CDS River Discharge\Pickles\river_analytics.pickle' #40 year annual GFAS mean/max/min 
compressed_discharge_path = r'D:\CDS River Discharge\Pickles\compressed_ra.pickle' #compressed version of above file 

#Frasson data directory 
frasson_dir = r'C:\Users\bydd1\OneDrive\Documents\Research\MS Sinuosity Data'

frasson_path = os.path.join(frasson_dir, 'MS_segments_recovered.xlsx') #Frasson file 
#frasson_path = os.path.join(frasson_dir, 'Amazon Segments\Seg1.xlsx')
#segmented_frasson_path = r'C:\Users\bydd1\OneDrive\Documents\Research\MS Sinuosity Data\Amazon Segments\Seg1_mean_dis.xlsx'
segmented_frasson_path = r'C:\Users\bydd1\OneDrive\Documents\Research\MS Sinuosity Data\MS_segments_averaged_by_seg.xlsx' #Frasson, averaged by segment

## Data inputs
This program relies on two data products
- river_analytics.pickle : a pickle file containing 40 year annual values of mean/max/min discharge from GFAS generated by the copernicus_data_extraction.py script
- MS_segments.xlsx : data products from the Frasson et al paper

## Generate compressed version of river_analytics.pickle
river_analytics.pickle is computationally difficult to handle due to its large size (5GB, thanks 0.1degx0.1deg res), so we average it temporally across all years to generate compressed_ra.pickle 
This will print how long it takes (84 seconds on Brynn's laptop)

In [3]:
dm.compress_ra(ra_path, compressed_discharge_path) 

--- 145.28918361663818 seconds ---


## Add mean_dis, max_dis, and min_dis (and natural logged values) to Frasson file
This method reads the Frasson file, and then iterates through every lat/lon pair, finds the nearest neighbor within the compressed GFAS dataset generated above, and adds a column with mean/max/min data. 
Also adds columns for the natural log of meander wavelength, sinuosity, QWBM (Frasson's calculated discharge), and GFAS' mean/max/min. 
All columns with nan-values are also dropped here 

This will print a progress statement (once every 1000 values are processed, 55000 values in original dataset)

In [4]:
dm.assign_cop_to_latlon(segmented_frasson_path, compressed_discharge_path)

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\CDS River Discharge\\Pickles\\compressed_ra.pickle'

## Create segmented Excel file 
Average the original Frasson file by unique SegmentID 


In [None]:
dm.segment_frasson(frasson_path, segmented_frasson_path)

## Perform basic analytics on segmented Frasson file 

In [None]:
df = pd.read_excel(segmented_frasson_path)
vm.plot_lat_lon_data(df['lat'], df['lon'], df['log_mean_dis'], 'Mean Discharge, Averaged by Segment', r'ln(discharge) $[m^3/s]$')

In [None]:
x_vals_corr = ['log_mean_dis', 'log_max_dis', 'log_min_dis', 'log_QWBM']
y_vals_corr = ['log_sinuosity', 'log_mw','Slope','Width']
dm.perform_correlations(x_vals_corr, y_vals_corr, df)

In [None]:
x_vals_corr = ['log_mean_dis', 'log_max_dis', 'log_min_dis', 'log_QWBM', 'Slope','Width']
y_vals_corr = ['log_sinuosity', 'log_mw']
dm.perform_correlations(x_vals_corr, y_vals_corr, df)