# Spanish Inventory of Dams and Reservoirs
***

**_Author:_** Chus Casado Rodríguez<br>
**_Date:_** 10-04-2024<br>

**Introduction:**<br>
This notebook downloads the reports from the Spanish Inventory of Dams and Reservoirs (_Inventario de Presas y Embalses de España_, IPEE), then loads the reports, processes them and exports them as CSV files. The process is repeated for reservoirs and dams, as they have different reports in the inventory.

In [1]:
import os
os.environ['USE_PYGEOS'] = '0'
from pathlib import Path
import sys
sys.path.append('../../src/')
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import requests
import re

from lisfloodreservoirs.utils.SNCZI import reservoir_attributes, dam_attributes
from lisfloodreservoirs.utils.names import correct_names

In [522]:
# def remove_accents(string: str) -> str:
    
#     string = re.sub(r'[ÁáÀà]', 'A', string)
#     string = re.sub(r'[Éé]', 'E', string)
#     string = re.sub(r'[Íí]', 'I', string)
#     string = re.sub(r'[Óó]', 'O', string)
#     string = re.sub(r'[Úú]', 'U', string)

#     return string

# def swap_words(string: str, split_pattern: str = '. ') -> str:
    
#     words = string.split(split_pattern)
#     if len(words) == 2:
#         return ' '.join([word.strip() for word in words[::-1]])
#     else:
#         return string

# def arabic_to_roman(match):
#     arabic = int(match.group(0))
#     roman_numerals = {
#         1: 'I', 4: 'IV', 5: 'V', 9: 'IX', 10: 'X', 40: 'XL',
#         50: 'L', 90: 'XC', 100: 'C', 400: 'CD', 500: 'D', 900: 'CM', 1000: 'M'
#     }
#     result = ''
#     for value, numeral in sorted(roman_numerals.items(), key=lambda x: -x[0]):
#         while arabic >= value:
#             result += numeral
#             arabic -= value
#     return result

# def correct_names(df: pd.DataFrame, col_pattern: str = 'nombre', split_pattern: str = ', ') -> pd.DataFrame:
    
#     col_names = [col for col in df if col_pattern in col.lower()]
#     for col in col_names:
#         # replace missing values
#         df[col] = df[col].replace(np.nan, '')
#         # remove accents
#         df[col] = df[col].apply(remove_accents)
#         # swap articles
#         df[col] = df[col].apply(swap_words, split_pattern=split_pattern)
#         # convert arabic numbers to roman numbers
#         df[col] = df[col].apply(lambda x: re.sub(r'\b\d+\b', arabic_to_roman, x))
        
#     return df

## Configuration

In [2]:
# path where the data is stored
# the default location can be overridden with the environment variable 'PATH_DATASETS',
# so that the notebook also runs on machines where the drive 'Z:' is not mounted
PATH_DATASETS = Path(os.environ.get('PATH_DATASETS', 'Z:/nahaUsers/casadje/datasets/'))
# folder from which the shapefiles of reservoirs and dams are read
PATH_CEDEX = PATH_DATASETS / 'CEDEX' / 'processed' / 'reservoirs' / 'attributes' / 'GIS'

## Reservoir attributes

In [3]:
# shapefile of reservoirs in the Spanish inventory
reservoirs_SNCZI = gpd.read_file(PATH_CEDEX / 'egis_embalse_geoetrs89.shp')

# the reservoir ID is cast to integer before it is used as the (sorted) index
reservoirs_SNCZI['ID_EMBALSE'] = reservoirs_SNCZI['ID_EMBALSE'].astype(int)
reservoirs_SNCZI = (
    reservoirs_SNCZI
    .set_index('ID_EMBALSE', drop=True)
    .sort_index(axis=0)
    # reproject the layer to EPSG:25830
    .to_crs(epsg=25830)
)

print('SNCZI contains {0} reservoirs and {1} attributes'.format(*reservoirs_SNCZI.shape))
# reservoirs_SNCZI.head()

SNCZI contains 3170 reservoirs and 18 attributes


### Download reports

In [4]:
# folder where the XML report of each reservoir is stored
path_reports_res = PATH_DATASETS / 'SNCZI' / 'reports' / 'reservoirs'
path_reports_res.mkdir(parents=True, exist_ok=True)

for ID in tqdm(reservoirs_SNCZI.index):

    # output XML file; reports that are already on disk are not downloaded again
    filename = path_reports_res / f'{ID:04}.xml'
    if filename.is_file():
        continue

    # extract data from URL
    url = f'https://sig.mapama.gob.es/WebServices/clientews/snczi/Default.aspx?nombre=EGISPE_EMBALSE&claves=ID_EMBALSE&valores={ID}&op=ExpMultiple'
    try:
        # the timeout prevents the loop from hanging forever on an unresponsive server
        with requests.get(url, timeout=30) as response:
            # an HTTP error page must not be saved as if it were a report,
            # because the existing file would never be downloaded again
            response.raise_for_status()
            lines = [line.decode('utf-8') for line in response.iter_lines()]
    except requests.RequestException as error:
        # nothing is written, so this report is requested again in the next run
        print(f'Report of reservoir {ID} could not be downloaded: {error}')
        continue

    # export XML file
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in lines)

  0%|          | 0/3170 [00:00<?, ?it/s]

### Read reports

In [26]:
# load the individual XML report of each reservoir
# the reports are collected in a list and concatenated only once, instead of
# growing the DataFrame (and copying it) in every iteration
reports_res = []
for file in tqdm(sorted(path_reports_res.glob('*.xml'))):
    ID = int(file.stem)
    try:
        attributes = reservoir_attributes(str(file), name=ID)
        # duplicated attribute names cannot be aligned with the other reports,
        # so the file is rejected here rather than breaking the concatenation
        if not attributes.index.is_unique:
            raise ValueError('the report contains duplicated attribute names')
    except Exception as error:
        # report the reason of the failure and carry on with the remaining files
        print(f'File {file} could not be read: {error!r}')
        continue
    reports_res.append(attributes)
if not reports_res:
    raise FileNotFoundError(f'No readable XML report was found in {path_reports_res}')

# one row per reservoir, one column per attribute
cedex_res = pd.concat(reports_res, axis=1).transpose()
cedex_res.index.name = 'res_ID'

# rename field 'inf_ID' (infrastructure ID)
# this field is the connection between the reservoir and dam attributes
# each reservoir references its main dam using the 'inf_ID' field
cedex_res = cedex_res.rename(columns={'Código de infraestructura': 'inf_ID'})
# nullable integer type, so that missing values in the field are accepted
cedex_res['inf_ID'] = cedex_res['inf_ID'].astype('Int64')
cedex_res = cedex_res.drop(columns='Código del embalse')

# correct names
cedex_res = correct_names(cedex_res, col_pattern='nombre', split_pattern='. ')

  0%|          | 0/3170 [00:00<?, ?it/s]

### Export

In [27]:
# reorder columns: 'inf_ID' goes first, the rest keep their relative order
cols = list(cedex_res.columns)
cols.insert(0, cols.pop(cols.index('inf_ID')))
cedex_res = cedex_res[cols]

# export the table in the folder of the reservoir reports
cedex_res.to_csv(path_reports_res.joinpath('attributes_reservoirs.csv'))

## Dam attributes

In [16]:
# shapefile of dams in the Spanish inventory
dams_SNCZI = gpd.read_file(PATH_CEDEX / 'egis_presa_geoetrs89.shp')

# fields that are cast to integer: the infrastructure ID (used as index) and the dam code
cols_int = ['ID_INFRAES', 'CODIGO']
dams_SNCZI = (
    dams_SNCZI
    .astype(dict.fromkeys(cols_int, int))
    .set_index('ID_INFRAES', drop=True)
    .sort_index()
    # reproject the layer to EPSG:25830
    .to_crs(epsg=25830)
)

print('SNCZI contains {0} dams and {1} attributes'.format(*dams_SNCZI.shape))
# dams_SNCZI.head()

SNCZI contains 3208 dams and 23 attributes


### Download reports

In [17]:
# folder where the XML report of each dam is stored
path_reports_dam = PATH_DATASETS / 'SNCZI' / 'reports' / 'dams'
path_reports_dam.mkdir(parents=True, exist_ok=True)

for id_infr in tqdm(dams_SNCZI.index):

    # output XML file, named after the field 'CODIGO' of the dam
    # reports that are already on disk are not downloaded again
    code = dams_SNCZI.loc[id_infr, 'CODIGO']
    filename = path_reports_dam / f'{code:07}.xml'
    if filename.is_file():
        continue

    # extract data from URL (the web service is queried by infrastructure ID)
    url = f'https://sig.mapama.gob.es/WebServices/clientews/snczi/Default.aspx?nombre=EGISPE_PRESA&claves=ID_INFRAESTRUCTURA&valores={id_infr}&op=Exportar'
    try:
        # the timeout prevents the loop from hanging forever on an unresponsive server
        with requests.get(url, timeout=30) as response:
            # an HTTP error page must not be saved as if it were a report,
            # because the existing file would never be downloaded again
            response.raise_for_status()
            lines = [line.decode('utf-8') for line in response.iter_lines()]
    except requests.RequestException as error:
        # nothing is written, so this report is requested again in the next run
        print(f'Report of dam {id_infr} could not be downloaded: {error}')
        continue

    # export XML file
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in lines)

  0%|          | 0/3208 [00:00<?, ?it/s]

### Read reports

In [23]:
# load the individual XML report of each dam
# the reports are collected in a list and concatenated only once, instead of
# growing the DataFrame (and copying it) in every iteration
reports_dam = []
for file in tqdm(sorted(path_reports_dam.glob('*.xml'))):
    inf_ID = int(file.stem)
    try:
        attributes = dam_attributes(str(file), name=inf_ID)
        # duplicated attribute names cannot be aligned with the other reports,
        # so the file is rejected here rather than breaking the concatenation
        if not attributes.index.is_unique:
            raise ValueError('the report contains duplicated attribute names')
    except Exception as error:
        # report the reason of the failure and carry on with the remaining files
        print(f'File {file} could not be read: {error!r}')
        continue
    reports_dam.append(attributes)
if not reports_dam:
    raise FileNotFoundError(f'No readable XML report was found in {path_reports_dam}')

# one row per dam, one column per attribute
cedex_dam = pd.concat(reports_dam, axis=1).transpose()
cedex_dam.index.name = 'inf_ID'

# correct names
cedex_dam = correct_names(cedex_dam, col_pattern='nombre', split_pattern='. ')

  0%|          | 0/3229 [00:00<?, ?it/s]

File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\1330033.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\1390018.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\2090035.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\2240051.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\2340021.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\2340022.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\2420011.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\2490027.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\4450005.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\5140029.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\5230010.xml could not be read
File Z:\nahaUsers\casadje\datasets\SNCZI\reports\dams\8460057.xml could not 

Some dams have more than one type of spillway, so the fields related to the spillway contain several values separated by semicolons. Here I identify these dams, split the values and replace them with a single value representative of all the spillway types: the total number of spillways, the total capacity, and whether any of the spillways is regulated.

In [24]:
# snippet used to list the unique values of the field 'Regulación',
# which are the keys of `map_regulation`:
# regulation = []
# for value in cedex_dam.Regulación.astype(str):
#     regulation += [x.strip() for x in value.split(';')]
# set(regulation)

# numeric encoding of the regulation: 1 for the gate types ('Compuerta...'),
# 0 for 'No. labio fijo' and NaN for an empty value
map_regulation = {'': np.nan,
                  'Compuerta Taintor': 1,
                  'Compuerta de sector': 1,
                  'Compuerta de segmento': 1,
                  'Compuerta vertical': 1,
                  'Compuertas': 1,
                  'Compuertas clapetas': 1,
                  'No. labio fijo': 0,
                 }

# fields that describe the spillways: number, regulation and capacity
cols = ['Número total de aliviaderos en la presa', 'Regulación', 'Capacidad a NAE (m3/s)']
col_number, col_regulation, col_capacity = cols

# correct values in dams with more than one type of spillway
# in these dams the fields contain one value per type, separated by semicolons
idx = cedex_dam[col_number].astype(str).str.contains(';')
for ID, row in cedex_dam.loc[idx, cols].iterrows():

    # split spillway attributes ('---' is handled as an empty value)
    row = row.str.replace('---', '')
    row = row.str.split(';')

    # correct number and capacity of the spillways
    number = np.array([int(n) for n in row[col_number]])
    capacity = np.array([np.nan if n.strip() == '' else float(n) for n in row[col_capacity]])
    cedex_dam.loc[ID, col_number] = np.nansum(number)
    # total capacity: number of spillways of each type times the capacity of the type
    # `np.nansum` returns 0 when every capacity is missing, which would be exported
    # as a real capacity of 0 m3/s, so the total is kept as missing in that case
    if np.isnan(capacity).all():
        cedex_dam.loc[ID, col_capacity] = np.nan
    else:
        cedex_dam.loc[ID, col_capacity] = np.nansum(number * capacity)

    # correct regulation: 1 if any of the spillway types maps to 1
    cedex_dam.loc[ID, col_regulation] = np.nanmax([map_regulation[x.strip()] for x in row[col_regulation]])

# correct regulation in the remaining dams
# the mapping is applied only to the field 'Regulación'; applied to the whole table
# it would also recode any other field that happens to contain one of its keys
cedex_dam[col_regulation] = cedex_dam[col_regulation].replace(map_regulation)

### Export

In [25]:
# export the table of dam attributes in the folder of the dam reports
cedex_dam.to_csv(path_reports_dam.joinpath('attributes_dams.csv'))