# **Compile pre-processing outputs into one file**

By Bridget Bittmann

Date created: 04/04/2022

Date modified: 04/04/2022

In [1]:
import pandas as pd
import os 
import glob
import numpy as np


In [2]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# force_remount=True is passed so the mount is redone when this cell is re-run.
from google.colab import drive 
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
## NAVIGATE TO YOUR DIRECTORY ##
%cd gdrive/MyDrive/spatial_colab/datasets/
%ls

/content/gdrive/MyDrive/spatial_colab/datasets
 [0m[01;34mclimate_stats[0m/                         [01;34mmasked[0m/
 [01;34mdiversion_timeseries[0m/                 'New York Canal_land_change.png'
'Farmers Union Canal_land_change.png'   ny_change.png
 fu_change.png                          [01;34moutput_files[0m/
 [01;34mhydromet_data[0m/                         [01;34mPOUs[0m/
 [01;34mirrigation_companies[0m/                  seb_change.png
 [01;34mirrig_lbrb[0m/                           'Sebree Canal_land_change.png'
 [01;34mIrrMapper[0m/                             set_change.png
 [01;34mLBRB_shp[0m/                             'Settlers Canal_land_change.png'
 [01;34mlcmap_files[0m/                           [01;34msubset_test_shp[0m/


In [4]:
## ------------------- ##
## IMPORT ALL DATASETS ##
## ------------------- ##

# Per-diversion CSV paths, each list put into sorted filename order.
div_files = glob.glob('diversion_timeseries/final_stats/*.csv')
div_files.sort()

land_files = glob.glob('lcmap_files/proportions/longform_proportions/*.csv')
land_files.sort()

climate_files = glob.glob('climate_stats/final/*.csv')
climate_files.sort()

# Single-table inputs: the name lookup table (its DiversionName / WaterRight /
# NewName columns are used below to harmonise names) and the hydromet table.
common_name = pd.read_csv('diversion_timeseries/relates/name_dictionary.csv')
hydromet = pd.read_csv('hydromet_data/mode_input_hydromet.csv')

## Clean the dataframes to match by renaming to create a common name among all three datasets

div_data = []
for div_path in div_files:
  div_frame = pd.read_csv(div_path)

  # Strip any parenthesised suffix from the raw name. regex=True is explicit
  # because pandas >= 2.0 treats the pattern as a literal string by default,
  # which would silently leave the names untouched.
  div_frame['DiversionName_x'] = div_frame['DiversionName_x'].str.replace(r"\(.*\)", "", regex=True)

  # Rows of the lookup table whose DiversionName contains this file's (first) name.
  # NOTE(review): the name is used as a regex pattern by str.contains; names with
  # regex metacharacters would need regex=False or escaping - confirm against data.
  matches = common_name['DiversionName'].str.contains(div_frame['DiversionName_x'][0])
  if not matches.any():
    # No entry in the name dictionary: this file is left out of the combined dataset.
    continue

  # Take the first matching NewName as a plain scalar (the land and climate loops
  # do the same), instead of a to_string() rendering that becomes multi-line when
  # several rows match and is padded in some pandas versions.
  div_frame['DiversionName_x'] = common_name.loc[matches, 'NewName'].iloc[0]
  div_frame = div_frame.drop(labels='Unnamed: 0', axis=1)
  div_frame = div_frame.rename(columns={'DiversionName_x':'DiversionName'})
  div_data.append(div_frame)

# NOTE(review): hard-coded cap of 65 frames kept from the original; any matched
# files beyond the first 65 are dropped here - confirm this is intended.
div_data = div_data[0:65]
div = pd.concat(div_data)

land_data = []
for land_path in land_files:
  land = pd.read_csv(land_path)

  # Find the common name whose WaterRight entry contains this file's first DivName.
  right_mask = common_name['WaterRight'].str.contains(land['DivName'][0])
  new_name = common_name['NewName'][right_mask].reset_index(drop=True)[0]
  land['DivName'] = new_name

  # Drop the saved index column and the first two rows, renumber the rows,
  # then rename the name column to match the other datasets.
  land = (
      land.drop(columns='Unnamed: 0')
          .drop(index=[0, 1])
          .reset_index()
          .drop(columns=['index'])
          .rename(columns={'DivName': 'DiversionName'})
  )
  land_data.append(land)


land = pd.concat(land_data)


climate_data = []
for clim_path in climate_files:
  clim = pd.read_csv(clim_path)

  # Find the common name whose WaterRight entry contains this file's first DIV_NAME.
  right_mask = common_name['WaterRight'].str.contains(clim['DIV_NAME'][0])
  new_name = common_name['NewName'][right_mask].reset_index(drop=True)[0]
  clim['DIV_NAME'] = new_name

  # Drop the saved index column, renumber the rows, and rename the name
  # column to match the other datasets.
  clim = (
      clim.drop(columns=['Unnamed: 0'])
          .reset_index()
          .drop(columns=['index'])
          .rename(columns={'DIV_NAME': 'DiversionName'})
  )
  climate_data.append(clim)

clim = pd.concat(climate_data)

# Print the size of the name dictionary and the number of frames collected for
# each dataset so the counts can be compared by eye.
print(*(len(group) for group in (common_name, div_data, land_data, climate_data)))


  app.launch_new_instance()


65 65 64 64


In [5]:
## -------------------------------------- ##
## MERGE DIVERSION DATA TO MATCH THE POUS ##
## -------------------------------------- ##

# Several diversion records carry this common name; they are collapsed into a
# single record per year (summed flows, earliest start, latest end).
SHIP_WAG_NAME = 'Shipley and Wagner Pumps'
ship_wag = div[div['DiversionName'] == SHIP_WAG_NAME]

# Sum only the flow columns per year. Building the Year column from the same
# grouped result keeps each year aligned with its own totals (unique() returns
# years in order of appearance, whereas groupby sorts them).
sums = ship_wag.groupby('Year')[['CFS', 'Acre_feet']].sum().reset_index()

new_ship_wag = pd.DataFrame()
new_ship_wag['Year'] = sums['Year']
# Assign the name as a scalar so every row gets it regardless of how many years
# there are (the previous [0:34] slice depended on index alignment).
new_ship_wag['DiversionName'] = SHIP_WAG_NAME
new_ship_wag['CFS'] = sums['CFS']
new_ship_wag['Acre_feet'] = sums['Acre_feet']

startday = []
start_date = []
endday = []
day_range = []  # renamed from `range` so the builtin is not shadowed in the kernel
end_date = []

for year in new_ship_wag['Year']:
  yearly = ship_wag[ship_wag['Year'] == year]
  # Earliest start day and latest end day among this year's records, with the
  # date strings taken from the rows that hold those extremes.
  start = np.min(yearly['StartDayofYear'].values)
  startdate = yearly['StartDate'][yearly['StartDayofYear']==start].values
  end = np.max(yearly['EndDayofYear'].values)
  enddate = yearly['EndDate'][yearly['EndDayofYear']==end].values
  startday.append(start)
  endday.append(end)
  day_range.append(end-start)
  start_date.append(startdate[0])
  end_date.append(enddate[0])

new_ship_wag['StartDate'] = start_date
new_ship_wag['StartDayofYear'] = startday
new_ship_wag['EndDate'] = end_date
new_ship_wag['EndDayofYear'] = endday
new_ship_wag['Range'] = day_range

#Remove old dataframes from full dataset
# Compare against the plain name: the previous str(<array>) expression produced
# "['...' '...']", which matched no row, so the un-merged records were never removed.
div = div[div['DiversionName'] != SHIP_WAG_NAME]

In [6]:
## ------------------------------------- ##
## MERGE DIVERSION DATASET WITH NEW DATA ##
## ------------------------------------- ##

# Append the aggregated rows, order by diversion name, and renumber the rows.
div = (
    pd.concat([div, new_ship_wag])
      .sort_values(by='DiversionName')
      .reset_index()
      .drop(columns='index')
)

In [7]:
## ------------------------------- ##
## MERGE THREE FILES INTO ONE FILE ##
## ------------------------------- ##

# Left-join land cover onto the diversion records (land uses 'dates' for the year).
land_div = div.merge(
    land,
    how='left',
    left_on=['Year', 'DiversionName'],
    right_on=['dates', 'DiversionName'],
)

# Left-join climate on year + diversion name, then order by diversion and year.
full_df = land_div.merge(
    clim,
    how='left',
    left_on=['Year', 'DiversionName'],
    right_on=['Year', 'DiversionName'],
)
full_df = full_df.sort_values(by=['DiversionName', 'Year'])

# Left-join the hydromet table on year and drop the leftover helper columns.
full_df = full_df.merge(hydromet, how='left', left_on='Year', right_on='Year')
full_df = full_df.drop(columns=['Unnamed: 0', 'dates'])

## --------------------------------------- ##
## Export the full csv file for model in R ##
## --------------------------------------- ##

# Full dataframe export
# Create the target folder first so to_csv does not fail on a fresh checkout.
merged_dir = os.path.join('output_files', 'merged')
os.makedirs(merged_dir, exist_ok=True)
out_path = os.path.join(merged_dir, 'model_input.csv')
full_df.to_csv(out_path)

# Individual dataframe export: one CSV per diversion, written to output_files/.
names = full_df['DiversionName'].unique()
for name in names:
  df = full_df[full_df['DiversionName'] == name]
  # Join folder and file name as separate components (the previous single-argument
  # os.path.join call was a no-op around manual string concatenation).
  out_path = os.path.join('output_files', name + '.csv')
  df.to_csv(out_path)