# **Compile pre-processing outputs into one file**

By Bridget Bittmann

Date created: 04/04/2022

Date modified: 06/01/2022

In [1]:
## --------------- ##
## IMPORT PACKAGES ## 
## --------------- ##

import pandas as pd # to work with dataframe
import os # for file paths
import glob # read in a folder of csv
import numpy as np # basic statistics


In [2]:
# Mount Google Drive inside the Colab runtime so the project folders can be read.
# force_remount=True re-mounts even if the drive is already attached.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
## NAVIGATE TO YOUR DIRECTORY ##
# The path below is relative, so it is resolved against the current working
# directory (the drive is mounted at /content/gdrive). Every file path used in
# the later cells is relative to this datasets folder.
# NOTE(review): presumably this cell should be run only once per session,
# since a second relative %cd would start from the new directory - confirm.
%cd gdrive/MyDrive/spatial_colab/datasets/
# List the folder contents as a quick check that the expected folders exist.
%ls

/content/gdrive/MyDrive/spatial_colab/datasets
[0m[01;34mclimate_stats[0m/         [01;34mirrigation_companies[0m/  [01;34mlcmap_files[0m/   [01;34msubset_test_shp[0m/
[01;34mdiversion_timeseries[0m/  [01;34mirrig_lbrb[0m/            [01;34mmasked[0m/
[01;34mextra_figures[0m/         [01;34mIrrMapper[0m/             [01;34moutput_files[0m/
[01;34mhydromet_data[0m/         [01;34mLBRB_shp[0m/              [01;34mPOUs[0m/


In [4]:
## ------------------- ##
## IMPORT ALL DATASETS ##
## ------------------- ##

# Single-file inputs: hydromet table and the two name lookup tables
# (one for the flow records, one for the spatial records).
common_name_flow = pd.read_csv('diversion_timeseries/relates/name_dictionary_flow.csv')
common_name_spatial = pd.read_csv('diversion_timeseries/relates/name_dictionary_spatial.csv')
hydromet = pd.read_csv('hydromet_data/mode_input_hydromet.csv')

# Folder inputs: collect the csv paths only (they are read in a later cell).
# Sorting keeps the file order the same from run to run.
div_files = glob.glob('diversion_timeseries/final_stats/*.csv')
div_files.sort()

land_files = glob.glob('lcmap_files/final_metrics/*.csv')
land_files.sort()

climate_files = glob.glob('climate_stats/final/*.csv')
climate_files.sort()

In [35]:
## --------------------------------- ##
## CREATE COMMON NAME TO MERGE FILES ##
## --------------------------------- ##

# Import discharge data

div_data = []
for i in div_files:
  discharge = pd.read_csv(i)
  # Remove any parenthetical text from the diversion names.
  # regex=True must be explicit: since pandas 2.0 the default is regex=False,
  # which treats the pattern as a literal string and leaves the names unchanged.
  discharge['DiversionName_x'] = discharge['DiversionName_x'].str.replace(r"\(.*\)", "", regex=True)
  div_data.append(discharge)

# NOTE(review): rows are de-duplicated on the saved 'Unnamed: 0' column, which
# assumes that value identifies a row across all of the input files - confirm.
div = pd.concat(div_data).drop_duplicates('Unnamed: 0', keep='first')

# Discharge data dictionary: original diversion name -> common name

div_dict = dict(zip(common_name_flow['DiversionName'], common_name_flow['NewName']))
# Names that are missing from the dictionary become NaN in 'Name'.
div['Name'] = div['DiversionName_x'].map(div_dict)
div = div.drop(['Unnamed: 0', 'DiversionName_x', 'CFS'], axis=1)
display(div)

# Import land use change data

land_data = []
for i in land_files:
  land_data.append(pd.read_csv(i))

land = pd.concat(land_data)

# Land use dictionary: water right name -> common name

shape_dict = dict(zip(common_name_spatial['WaterRight'], common_name_spatial['NewName']))
land['Name'] = land['DivName'].map(shape_dict)
# dropna() removes every row that has a missing value in any column,
# including rows whose name was not found in the dictionary.
land = land.dropna().drop(['Unnamed: 0', 'DivName'], axis=1)

# Import climate zonal stats

clim_data = []
for i in climate_files:
  clim_data.append(pd.read_csv(i))
clim = pd.concat(clim_data)

# Use shapefile dictionary on climate data

clim['Name'] = clim['DIV_NAME'].map(shape_dict)
# Same row filter as for the land use data: any missing value drops the row.
clim = clim.dropna().drop(['Unnamed: 0', 'DIV_NAME'], axis=1)


  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Year,Diversion (cfs),Acre_feet,Month,DayofYear,StartDate,StartDayofYear,EndDate,EndDayofYear,Range,Name
0,1988,1617.5,3208.271828,2384.0,67161.0,1988-04-21,112,1988-10-20,294,182,Andrews
1,1989,1792.0,3554.388325,2382.0,66795.0,1989-04-20,110,1989-10-10,283,173,Andrews
2,1990,1836.5,3642.652990,2382.0,66795.0,1990-04-19,109,1990-10-14,287,178,Andrews
3,1991,3021.0,5992.079872,2382.0,66795.0,1991-04-10,100,1991-10-15,288,188,Andrews
4,1992,1951.0,3869.760950,2384.0,67161.0,1992-04-08,99,1992-10-30,304,205,Andrews
...,...,...,...,...,...,...,...,...,...,...,...
1844,1990,1003.0,1989.426055,2382.0,66795.0,1990-04-04 00:00:00,94,1990-10-14 00:00:00,287,193,Warm Springs Canal
1845,1989,1092.0,2165.955386,2382.0,66795.0,1989-04-11 00:00:00,101,1989-10-08 00:00:00,281,180,Warm Springs Canal
1846,1988,1157.5,2295.873039,2384.0,67161.0,1988-04-01 00:00:00,92,1988-10-02 00:00:00,276,184,Warm Springs Canal
1847,1996,647.1,1283.507079,2384.0,67161.0,1996-04-30 00:00:00,121,1996-10-15 00:00:00,289,168,Warm Springs Canal


In [39]:
## -------------------------------------- ## 
## MERGE DIVERSION DATA TO MATCH THE POUS ## 
## -------------------------------------- ## 


# Create function to do this

def merge_flows(data, name):
  '''
  This function will merge two different flow datasets into one for completely overlapping POUs.

  All records in `data` whose 'Name' equals `name` are collapsed into one
  record per year: flows are summed and the irrigation season spans from the
  earliest start to the latest end reported in that year.

  Variables:
  data : The full diversion dataset. Needs the columns 'Name', 'Year',
         'Diversion (cfs)', 'Acre_feet', 'StartDate', 'StartDayofYear',
         'EndDate' and 'EndDayofYear'.
  name : A string of the new name for each POU.

  Returns:
  A dataframe with one row per year, sorted by year, with the columns
  'Year', 'Name', 'Diversion (cfs)', 'Acre_feet', 'StartDate',
  'StartDayofYear', 'EndDate', 'EndDayofYear' and 'Range'. It is empty when
  no record carries `name`.
  '''
  old_df = data[data['Name'] == name].reset_index(drop=True)

  # groupby sorts by year, so the list of years is taken from the grouped
  # result itself. This keeps every total on the row of its own year even when
  # the input records are not in ascending year order.
  totals = old_df.groupby('Year')[['Diversion (cfs)', 'Acre_feet']].sum().reset_index()

  new_df = totals[['Year']].copy()
  # Assign the name as a scalar so every year is labelled, however many years
  # of record there are.
  new_df['Name'] = name
  new_df['Diversion (cfs)'] = totals['Diversion (cfs)']
  new_df['Acre_feet'] = totals['Acre_feet']

  start_dates = []
  start_days = []
  end_dates = []
  end_days = []

  for year in new_df['Year']:
    yearly = old_df[old_df['Year'] == year]
    # Season bounds: earliest start and latest end among the merged records.
    first_day = yearly['StartDayofYear'].min()
    last_day = yearly['EndDayofYear'].max()
    # Take the date from the first record that matches the extreme day.
    start_dates.append(yearly.loc[yearly['StartDayofYear'] == first_day, 'StartDate'].iloc[0])
    end_dates.append(yearly.loc[yearly['EndDayofYear'] == last_day, 'EndDate'].iloc[0])
    start_days.append(first_day)
    end_days.append(last_day)

  new_df['StartDate'] = start_dates
  new_df['StartDayofYear'] = start_days
  new_df['EndDate'] = end_dates
  new_df['EndDayofYear'] = end_days
  # Length of the season in days.
  new_df['Range'] = new_df['EndDayofYear'] - new_df['StartDayofYear']

  return new_df

In [40]:
## ------------------------------------- ##
## MERGE DIVERSION DATASET WITH NEW DATA ##
## ------------------------------------- ##

# Names whose diversions serve exactly the same POU and are therefore
# collapsed into one record per year.
merge_names = ['Shipley and Wagner Pumps', 'Rossi Mill and Meeves Canal', 'Boise City Parks']

merged = []
for pou_name in merge_names:
  combined = merge_flows(div, pou_name)
  merged.append(combined)
  # Swap the original records of this name for the combined ones.
  remaining = div[div['Name'] != pou_name]
  div = pd.concat([remaining, combined])

# Restore a clean ordering and a fresh 0..n-1 index after the concatenations.
div = div.sort_values(by=['Name', 'Year']).reset_index(drop=True)

In [41]:
## ------------------------------- ##
## MERGE THREE FILES INTO ONE FILE ##
## ------------------------------- ##

# Left joins keep every diversion-year row even when no matching land use,
# climate or hydromet row exists for it.
# The land use table stores the year in its 'dates' column.
land_div = div.merge(land, left_on=['Year', 'Name'], right_on=['dates','Name'], how='left')
full_df = land_div.merge(clim, on=['Year', 'Name'], how='left').sort_values(by=['Name', 'Year'])
# Hydromet values are joined on year only, so they repeat for every name.
full_df = full_df.merge(hydromet, on='Year', how='left').drop(['Unnamed: 0', 'dates'], axis=1)

## --------------------------------------- ##
## Export the full csv file for model in R ##
## --------------------------------------- ##

# Full dataframe export
out_path = 'output_files/merged/model_input.csv'
# Create the target folder if it is missing so the export cannot fail on it.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
full_df.to_csv(out_path)

# Individual dataframe export

# Rows without a common name (NaN) cannot be turned into a file name, so they
# are skipped here; they are still part of the full export above.
names = full_df['Name'].dropna().unique()
for i in names:
  df = full_df[full_df['Name'] == i]
  out_path = os.path.join('output_files', i + '.csv')
  df.to_csv(out_path)