### Merge Upstream Downstream with FAO names 

* Purpose of script: Create a shapefile and csv file with both the upstream / downstream relation and the FAO basin names
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20170829

In [1]:
import datetime
import sys
import time

# Format one UTC snapshot so the date and time strings agree with each other
# and the "UTC" label is true regardless of the machine's local timezone
# (time.strftime without a time tuple formats *local* time).
utcNow = time.gmtime()
dateString = time.strftime("Y%YM%mD%d", utcNow)
timeString = time.strftime("UTC %H:%M", utcNow)
# Wall-clock start of the run; used by the final cell to report elapsed time.
start = datetime.datetime.now()
print(dateString,timeString)
# Record the interpreter version alongside the run timestamp.
sys.version

Y2017M11D16 UTC 11:56


'3.5.4 |Continuum Analytics, Inc.| (default, Aug 14 2017, 13:26:58) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [2]:
# Identity of this processing step; used to derive the working and output paths.
SCRIPT_NAME = "Y2017M08D29_RH_Merge_FAONames_Upstream_V01"
OUTPUT_VERSION = 3

# S3 folders holding the outputs of the earlier steps this notebook consumes.
S3_INPUT_PATH_FAO = "s3://wri-projects/Aqueduct30/processData/Y2017M08D25_RH_spatial_join_FAONames_V01/output/"
S3_INPUT_PATH_DOWNSTREAM = "s3://wri-projects/Aqueduct30/processData/Y2017M08D23_RH_Downstream_V01/output/"
S3_INPUT_PATH_HYBAS = "s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V01/output/"

# File names expected inside those folders.
INPUT_FILE_NAME_FAO = "hybas_lev06_v1c_merged_fiona_withFAO_V01.csv"
INPUT_FILE_NAME_DOWNSTREAM = "hybas_lev06_v1c_merged_fiona_upstream_downstream_V01.csv"
INPUT_FILE_NAME_HYBAS = "hybas_lev06_v1c_merged_fiona_V01.shp"

# Local working directories on the instance, one pair per script.
EC2_INPUT_PATH = "/volumes/data/{}/input/".format(SCRIPT_NAME)
EC2_OUTPUT_PATH = "/volumes/data/{}/output/".format(SCRIPT_NAME)

# Output base name (extension is added on export); version is zero-padded to two digits.
OUTPUT_FILE_NAME = "hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V{:02d}".format(OUTPUT_VERSION)

# S3 destination for everything written to EC2_OUTPUT_PATH.
S3_OUTPUT_PATH = "s3://wri-projects/Aqueduct30/processData/{}/output/".format(SCRIPT_NAME)

In [3]:
!rm -r {EC2_INPUT_PATH}
!rm -r {EC2_OUTPUT_PATH}

!mkdir -p {EC2_INPUT_PATH}
!mkdir -p {EC2_OUTPUT_PATH}

In [4]:
!aws s3 cp {S3_INPUT_PATH_FAO} {EC2_INPUT_PATH} --recursive 

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D25_RH_spatial_join_FAONames_V01/output/hybas_lev06_v1c_merged_fiona_withFAO_V01.csv to ../../../../data/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/input/hybas_lev06_v1c_merged_fiona_withFAO_V01.csv


In [5]:
!aws s3 cp {S3_INPUT_PATH_DOWNSTREAM} {EC2_INPUT_PATH} --recursive 

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D23_RH_Downstream_V01/output/hybas_lev06_v1c_merged_fiona_upstream_downstream_V01.csv to ../../../../data/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/input/hybas_lev06_v1c_merged_fiona_upstream_downstream_V01.csv


In [6]:
!aws s3 cp {S3_INPUT_PATH_HYBAS} {EC2_INPUT_PATH} --recursive --exclude *.tif

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V01/output/hybas_lev00_v1c_merged_fiona_V01.cpg to ../../../../data/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/input/hybas_lev00_v1c_merged_fiona_V01.cpg
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V01/output/hybas_lev06_v1c_merged_fiona_V01.cpg to ../../../../data/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/input/hybas_lev06_v1c_merged_fiona_V01.cpg
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V01/output/hybas_lev00_v1c_merged_fiona_V01.prj to ../../../../data/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/input/hybas_lev00_v1c_merged_fiona_V01.prj
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D02_RH_Merge_HydroBasins_V01/output/hybas_lev06_v1c_merged_fiona_V01.prj to ../../../../data/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/input/hybas_lev06_v1c_merged_fiona_V01.prj
download: s3://wri-projects/Aqueduct30/processDa

In [7]:
import os
# Provide a default GDAL_DATA location when the environment does not define one.
# This is done before osgeo is imported below.
if 'GDAL_DATA' not in os.environ:
    os.environ['GDAL_DATA'] = r'/usr/share/gdal/2.1'
from osgeo import gdal,ogr,osr
'GDAL_DATA' in os.environ
# If false, the GDAL_DATA variable is set incorrectly. You need this variable to obtain the spatial reference
# NOTE(review): this check is not the last expression of the cell, so its result is
# presumably never displayed; after the guard above it can only be True anyway.
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
# NOTE(review): time is already imported in the first cell; this repeat is harmless.
import time
%matplotlib notebook

In [8]:
# Load the FAO names table and index it by PFAF_ID, keeping PFAF_ID as a
# regular column too (drop=False).
faoCsvPath = os.path.join(EC2_INPUT_PATH, INPUT_FILE_NAME_FAO)
dfFAO = pd.read_csv(faoCsvPath).set_index("PFAF_ID", drop=False)

In [9]:
# Preview the first rows of the FAO names table.
dfFAO.head(n=5)

Unnamed: 0_level_0,PFAF_ID,SUB_NAME,MAJ_NAME,FAOid_copy
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
111011,111011,['Wadi El Naqa'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0190312']
111012,111012,['Egyptian east coast'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0190313']
111013,111013,['Egyptian east coast'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0190313']
111014,111014,['Egyptian east coast'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0190313']
111015,111015,['Egyptian east coast'],"['Africa, Red Sea - Gulf of Aden Coast']",['MAJ_BAS_7019_SUB_BASE_0190313']


In [10]:
# Load the upstream/downstream table and index it by PFAF_ID, keeping PFAF_ID
# as a regular column too (drop=False).
downstreamCsvPath = os.path.join(EC2_INPUT_PATH, INPUT_FILE_NAME_DOWNSTREAM)
dfDownstream = pd.read_csv(downstreamCsvPath).set_index("PFAF_ID", drop=False)

In [11]:
# Preview the first rows of the upstream/downstream table.
dfDownstream.head(n=5)

Unnamed: 0_level_0,HYBAS_ID2,Unnamed: 0,HYBAS_ID,NEXT_DOWN,NEXT_SINK,MAIN_BAS,DIST_SINK,DIST_MAIN,SUB_AREA,UP_AREA,...,COAST,ORDER,SORT,Upstream_HYBAS_IDs,Upstream_PFAF_IDs,Downstream_HYBAS_IDs,Downstream_PFAF_IDs,NEXT_SINK_PFAF,Basin_HYBAS_IDs,Basin_PFAF_IDs
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
611001,6060000010,0,6060000010,0,6060000010,6060000010,0.0,0.0,4317.4,4317.4,...,1,0,1,[],[],[],[],611001.0,[6060000010],[611001]
611002,6060000200,1,6060000200,0,6060000200,6060000200,0.0,0.0,35995.5,35996.7,...,0,1,2,[],[],[],[],611002.0,[6060000200],[611002]
611003,6060000210,2,6060000210,0,6060000210,6060000210,0.0,0.0,443.9,443.9,...,1,0,3,[],[],[],[],611003.0,[6060000210],[611003]
611004,6060000240,3,6060000240,0,6060000240,6060000240,0.0,0.0,2186.3,2186.3,...,0,1,4,[],[],[],[],611004.0,[6060000240],[611004]
611005,6060000250,4,6060000250,0,6060000250,6060000250,0.0,0.0,6533.8,6533.8,...,1,0,5,[],[],[],[],611005.0,[6060000250],[611005]


In [12]:
# "Unnamed: 0" only holds a running row number (0, 1, 2, ... in the preview of
# dfDownstream), so remove it before merging.
# - axis is passed by keyword: the positional form drop(label, 1) is rejected by pandas >= 2.0.
# - the result is rebound instead of mutated in place and errors="ignore" is set, so
#   re-running this cell is a no-op rather than a KeyError.
dfDownstream = dfDownstream.drop("Unnamed: 0", axis=1, errors="ignore")

In [13]:
# Read the HydroBASINS shapefile and index it by PFAF_ID, keeping PFAF_ID as a
# regular column too (drop=False).
hybasShpPath = os.path.join(EC2_INPUT_PATH, INPUT_FILE_NAME_HYBAS)
gdfHybas = gpd.read_file(hybasShpPath).set_index("PFAF_ID", drop=False)

In [14]:
# Keep only the PFAF_ID attribute (index preserved) as a plain DataFrame;
# the geometry is re-attached in a later cell.
dfHybas = pd.DataFrame({"PFAF_ID": gdfHybas["PFAF_ID"]})

Merge the downstream and FAO datasets, add the HydroBASINS geometry, and export the results as a shapefile, a CSV file and a pickle.

In [15]:
# Outer-join the upstream/downstream table with the FAO names on PFAF_ID so that
# basins present in only one of the two inputs are kept.
# Both frames carry PFAF_ID as index name *and* as column (set_index(..., drop=False)).
# Current pandas refuses to merge on such a label because it is ambiguous, so the
# index is reset first and the key is named explicitly instead of relying on the
# implicit "all shared columns" default.
# NOTE(review): assumes PFAF_ID is the only column the two tables share (true for the
# columns visible in the previews) — confirm against the full column list.
dfOut = dfDownstream.reset_index(drop=True).merge(
    dfFAO.reset_index(drop=True),
    on="PFAF_ID",
    how="outer",
)

In [16]:
# Index the merged table by PFAF_ID while keeping PFAF_ID as a column.
dfOut = dfOut.set_index(keys="PFAF_ID", drop=False)

In [17]:
# Inspect the column types of the layer read from the HydroBASINS shapefile.
gdfHybas.dtypes

HYBAS_ID       int64
NEXT_DOWN      int64
NEXT_SINK      int64
MAIN_BAS       int64
DIST_SINK    float64
DIST_MAIN    float64
SUB_AREA     float64
UP_AREA      float64
PFAF_ID        int64
ENDO           int64
COAST          int64
ORDER          int64
SORT           int64
geometry      object
dtype: object

In [None]:
# Build a minimal layer: PFAF_ID plus the original geometry (aligned on the shared
# PFAF_ID index). The CRS is passed explicitly because dfHybas is a plain DataFrame
# and older geopandas releases do not inherit the CRS from the geometry series;
# without it the exported shapefile would be written without projection information.
gdfHybasSimple = gpd.GeoDataFrame(
    dfHybas,
    geometry=gdfHybas.geometry,
    crs=gdfHybas.crs,
)

In [None]:
# Write the simplified layer next to the other outputs as <OUTPUT_FILE_NAME>.shp.
outputShpPath = os.path.join(EC2_OUTPUT_PATH, OUTPUT_FILE_NAME + ".shp")
gdfHybasSimple.to_file(outputShpPath)

In [None]:
# Write the merged attribute table as <OUTPUT_FILE_NAME>.csv.
outputCsvPath = os.path.join(EC2_OUTPUT_PATH, OUTPUT_FILE_NAME + ".csv")
dfOut.to_csv(outputCsvPath)

In [None]:
# Also store the merged table as <OUTPUT_FILE_NAME>.pkl.
outputPklPath = os.path.join(EC2_OUTPUT_PATH, OUTPUT_FILE_NAME + ".pkl")
dfOut.to_pickle(outputPklPath)

In [None]:
!aws s3 cp {EC2_OUTPUT_PATH} {S3_OUTPUT_PATH} --recursive

In [None]:
# Report how long the notebook took, measured from `start` set in the first cell.
end = datetime.datetime.now()
elapsed = end - start
print(str(elapsed))