# **Compile pre-processing outputs into one file**

By Bridget Bittmann

Date created: 04/04/2022

Date modified: 06/01/2022

In [1]:
## --------------- ##
## IMPORT PACKAGES ## 
## --------------- ##

import pandas as pd # to work with dataframe
import os # for file paths
import glob # read in a folder of csv
import numpy as np # basic statistics


In [2]:
from google.colab import drive 
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
## NAVIGATE TO YOUR DIRECTORY ##
%cd gdrive/MyDrive/spatial_colab/datasets/
%ls

/content/gdrive/MyDrive/spatial_colab/datasets
[0m[01;34mclimate_stats[0m/         [01;34mirrigation_companies[0m/  [01;34mlcmap_files[0m/   [01;34msubset_test_shp[0m/
[01;34mdiversion_timeseries[0m/  [01;34mirrig_lbrb[0m/            [01;34mmasked[0m/
[01;34mextra_figures[0m/         [01;34mIrrMapper[0m/             [01;34moutput_files[0m/
[01;34mhydromet_data[0m/         [01;34mLBRB_shp[0m/              [01;34mPOUs[0m/


In [12]:
## -------------------- ##
## IMOPORT ALL DATASETS ##
## -------------------- ##

div_files = pd.read_csv('diversion_timeseries/final_stats/model_inputs_102622.csv')
land_files = sorted(glob.glob('lcmap_files/final_metrics/*.csv'))
land_bpbc = sorted(glob.glob('lcmap_files/final_metrics/bpbc/*.csv'))
climate_files = sorted(glob.glob('climate_stats/final/*.csv'))
climate_bpbc = sorted(glob.glob('climate_stats/bpbc_final/*.csv'))
hydromet = pd.read_csv('hydromet_data/mode_input_hydromet.csv')
common_name_flow = pd.read_csv('diversion_timeseries/relates/name_dictionary_flow.csv')
common_name_spatial = pd.read_csv('diversion_timeseries/relates/name_dictionary_spatial.csv')
POUSize = pd.read_csv('diversion_timeseries/relates/POUSize.csv')
storage = pd.read_csv('diversion_timeseries/storage_100322/WRA_BoiseBasin.csv')
quantiles = pd.read_csv('diversion_timeseries/final_stats/accounting/quantiles.csv')

In [13]:
## --------------------------------- ##
## CREATE COMMON NAME TO MERGE FILES ## 
## --------------------------------- ##

# Import discharge data 

# div_data = []
# for i in div_files:
#   discharge = pd.read_csv(i)
#   discharge = discharge.rename({'Name':'DivName'}, axis=1)
#   discharge['DivName'] = discharge['DivName'].str.replace(r"\(.*\)", "")
#   div_data.append(discharge)

# bpbc = pd.read_csv('diversion_timeseries/bpbc/bpbc_totals.csv')
# bpbc = bpbc.rename({'Acre-feet':'Acre_feet'},axis=1)
div = div_files
# div_bpbc = pd.concat([div,bpbc])
# div_bpbc = div_bpbc.iloc[:,[1,2,3]]
# Dicharge data dictionary

div_dict = dict(zip(common_name_flow['DiversionName'], common_name_flow['NewName']))
div['NewName'] = div['Name'].map(div_dict)
div = div.drop(['Unnamed: 0', 'Name'], axis=1).rename({'NewName' : 'Name'}, axis = 1)
# quantiles['NewName'] = quantiles['Name'].map(div_dict)
# quantiles = quantiles.drop('Name', axis=1).rename({'NewName' : 'Name'}, axis = 1)
# quantiles = quantiles[quantiles['Quantiles'] != 0]
# div = quantiles.merge(div, left_on = ['Name', 'Year'], right_on = ['Name', 'Year'])

# Import land use change data

land_data = []
ld_bpbc = []
for i in land_files:
  land_data.append(pd.read_csv(i))
for i in land_bpbc:
  ld_bpbc.append(pd.read_csv(i))

land_bpbc = pd.concat(ld_bpbc)
land = pd.concat(land_data)

# Land use dictionary

shape_dict = dict(zip(common_name_spatial['WaterRight'], common_name_spatial['NewName']))
land['Name'] = land['DivName'].map(shape_dict)
land = land.drop(['Unnamed: 0', 'DivName'], axis=1)
POUSize['Name'] = POUSize['WaterRight'].map(shape_dict)
POUSize = POUSize.drop(['WaterRight'], axis=1)
# Import climate zonal stats

clim_data = []
clim_bpbc = []
for i in climate_files:
  clim_data.append(pd.read_csv(i))
# for i in climate_bpbc:
#   clim_bpbc.append(pd.read_csv(i))
clim = pd.concat(clim_data)
# climate_bpbc = pd.concat(clim_bpbc)

# Use shapefile dictionary on climate data

clim['Name'] = clim['DIV_NAME'].map(shape_dict)
clim = clim.drop(['Unnamed: 0', 'DIV_NAME'], axis=1)

In [14]:
## -------------------------------------- ## 
## MERGE DIVERSION DATA TO MATCH THE POUS ## 
## -------------------------------------- ## 


# Create function to do this

def merge_flows(data, name):
  '''
  This function will merge two different flow datasets into one for completely overlapping POUs.

  Variables:
  data : The full diversion dataset
  name : A string of the new name for each POU.
  '''
  old_df = data[data['Name']== name].reset_index().drop('index', axis=1)
  new_df = pd.DataFrame()
  new_df['Year'] = old_df['Year'].unique()
  new_df['Name'] = old_df['Name'][0:34]
  sums = old_df.groupby('Year').sum().reset_index()
  new_df['Diversion (cfs)'] = sums['Diversion (cfs)']
  new_df['Acre_feet'] = sums['Acre_feet']
  # quants = old_df['Quantiles'].groupby('Year').max().reset_index()

  startday = []
  start_date = []
  endday = []
  range = []
  end_date = []

  for i in new_df['Year']:
    yearly = old_df[old_df['Year'] == i]
    start = np.min(yearly['StartDayofYear'].values)
    startdate = yearly['StartDate'][yearly['StartDayofYear']==start].values
    end = np.max(yearly['EndDayofYear'].values)
    enddate = yearly['EndDate'][yearly['EndDayofYear']==end].values
    startday.append(start)
    endday.append(end)
    range.append(end-start)
    start_date.append(startdate[0])
    end_date.append(enddate[0])

  new_df['StartDate'] = start_date
  new_df['StartDayofYear'] = startday
  new_df['EndDate'] = end_date
  new_df['EndDayofYear'] = endday
  new_df['Range'] = range

  return new_df

In [15]:
## ------------------------------------- ##
## MERGE DIVERSION DATASET WITH NEW DATA ##
## ------------------------------------- ##

# Create a list of names that have completely shared POUs
merge_names = ['Shipley and Wagner Pumps', 'Rossi Mill and Meeves Canals', 'Boise City Parks']

merged = []
for i in merge_names:
  new = merge_flows(div, i)
  div = div[div['Name'] != i] #Remove old dataframes from full dataset
  div = pd.concat([div, new])
  merged.append(new)

div = div.sort_values(by=['Name', 'Year']).reset_index().drop('index',axis=1)

In [None]:
## BPBC Merging ##

relates = pd.read_csv('diversion_timeseries/bpbc/bpbc_relate.csv')

# Dicharge data dict
key_list = list(bpbc['DiversionNa'])
dict_lookup = dict(zip(relates['Discharge'], relates['NewName']))
bpbc['Name'] = [dict_lookup[item] for item in key_list]

# Land use change dict
key_list2 = list(land_bpbc['DivName'])
dict_lookup2 = dict(zip(relates['Shape'], relates['NewName']))
land_bpbc['Name'] = [dict_lookup2[item] for item in key_list2]

key_list3 = list(climate_bpbc['DIV_NAME'])
dict_lookup3 = dict(zip(relates['Shape'], relates['NewName']))
climate_bpbc['Name'] = [dict_lookup2[item] for item in key_list2]

## Flow data
bpbc = bpbc.drop('DiversionNa', axis=1)
div_bpbc = pd.concat([div, bpbc])
all_div = pd.DataFrame(div_bpbc[['Year', 'Name', 'Acre_feet']])
all_div = all_div.sort_values(by=['Name', 'Year'])

## Land use data
land_bpbc = land_bpbc.drop(['Unnamed: 0', 'DivName'], axis=1)
all_land = pd.concat([land_bpbc,land])

## Climate data
climate_bpbc = climate_bpbc.drop(['Unnamed: 0','DIV_NAME'], axis=1)
all_clim = pd.concat([climate_bpbc, clim])

NameError: ignored

In [18]:
## ------------------------------- ## 
## MERGE THREE FILES INTO ONE FILE ##
## ------------------------------- ## 

land_div = div.merge(land, left_on=['Year', 'Name'], right_on=['dates','Name'], how='left')
full_df = land_div.merge(clim, left_on=['Year','Name'], right_on=['Year', 'Name'], how='left').sort_values(by=['Name', 'Year'])
full_df = full_df.merge(hydromet, left_on='Year', right_on='Year', how='left').drop(['Unnamed: 0', 'dates'], axis=1)
full_df = full_df.merge(POUSize, left_on = 'Name', right_on = 'Name', how = 'left')
full_df = full_df.merge(storage, left_on = ['Year', 'Name'], right_on = ['Year', 'Name'])
# full_df = quantiles.merge(full_df, left_on = ['Year', 'Name'], right_on = ['Year', 'Name'])
## --------------------------------------- ##
## Export the full csv file for model in R ##
## --------------------------------------- ## 

# Full dataframe export
out_path = 'output_files/merged/model_input_102622.csv'
full_df.to_csv(out_path)

# Individual dataframe export

names = full_df['Name'].unique()
for i in names:
  df = full_df[full_df['Name'] == i]
  out_path = os.path.join('output_files/'+i+'.csv')
  df.to_csv(out_path)

In [None]:
## ------------------------------------ ## 
## MERGE THREE FILES INTO ONE FILE BPBC ##
## ------------------------------------ ## 

land_div = all_div.merge(all_land, left_on=['Year', 'Name'], right_on=['dates','Name'], how='left')
full_df = land_div.merge(all_clim, left_on=['Year','Name'], right_on=['Year', 'Name'], how='left').sort_values(by=['Name', 'Year'])
full_df = full_df.merge(hydromet, left_on='Year', right_on='Year', how='left').drop(['Unnamed: 0', 'dates'], axis=1)

# Get rid of New York data because using BPBC data
full_df = full_df[full_df['Name'] != 'New York Canal']
print(full_df['Name'].unique())
display(full_df)
## --------------------------------------- ##
## Export the full csv file for model in R ##
## --------------------------------------- ## 

# Full dataframe export
out_path = 'output_files/merged/bpbc_model_input.csv'
full_df.to_csv(out_path)