# Download data
***

***Author:** Chus Casado Rodríguez*<br>
***Date:** 10-10-2024*<br>

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import urllib.parse
from tqdm.auto import tqdm
from pathlib import Path

### Configuration

In [2]:
# root folder where the raw downloads are written
PATH = Path('Z:/nahaUsers/casadje/datasets/reservoirs/ResOpsBR/raw/')

# label of the reservoir system; used below to name the output sub-folders
SYSTEM = 'Nordeste'

# web page that is queried for the reservoir list and the measurements
URL = 'https://www.ana.gov.br/sar0/Medicao'

# period to request: from 1 January 1980 until the day the notebook is run
START = datetime(year=1980, month=1, day=1)
END = datetime.now().date()

### List of reservoirs

In [3]:
# request the page that holds the drop-down list of reservoirs;
# the timeout prevents the notebook from hanging if the server never answers
response = requests.get(URL, timeout=60)

# stop here with the HTTP error; otherwise `reservoirs` would stay undefined
# and the conversion to DataFrame below would fail with an unrelated NameError
response.raise_for_status()

# parse HTML response
soup = BeautifulSoup(response.text, 'html.parser')

# find list of reservoirs
reservoir_list = soup.find('select', {'name': 'dropDownListReservatorios'})
if reservoir_list is None:
    raise RuntimeError(f'No reservoir drop-down list found in {URL}')

# map reservoir ID (the option value) to reservoir name (the option text)
reservoirs = {}
for option in reservoir_list.find_all('option'):
    try:
        reservoirs[int(option['value'])] = option.text.strip()
    except (KeyError, ValueError):
        # skip options whose value is missing or is not an integer
        # (presumably the empty placeholder entry of the drop-down list)
        continue
print('{0} reservoirs'.format(len(reservoirs)))

# convert to DataFrame: one row per reservoir, indexed by ID
reservoirs = pd.DataFrame(pd.Series(reservoirs))
reservoirs.index.name = 'ID'
reservoirs.columns = ['name']

542 reservoirs


### Time series

In [None]:
# Download the time series of every reservoir in `reservoirs` and save each one
# as `<ID>.csv`; the static attributes found in the tables (state, capacity)
# are added as new columns of `reservoirs`.
from io import StringIO

PATH_TS = PATH / 'timeseries' / SYSTEM
PATH_TS.mkdir(parents=True, exist_ok=True)

# translation of the column names of the downloaded tables
rename_cols = {
    'Estado': 'state',
    'Reservatório': 'name',
    'Capacidade (hm³)': 'capacity_mcm',
    'Cota (m)': 'level_m',
    'Volume (hm³)': 'volume_mcm',
    'Volume (%)': 'volume_pct',
    'Data da Medição': 'date'
}

# dates are sent as dd/mm/yyyy, percent-encoded so that '/' is valid in the query string
start_url = urllib.parse.quote(START.strftime('%d/%m/%Y'), safe='')
end_url = urllib.parse.quote(END.strftime('%d/%m/%Y'), safe='')

n_reservoirs = reservoirs.shape[0]
for ID in tqdm(reservoirs.index, total=n_reservoirs):

    # request info from the URL
    url_reservoir = f'{URL}?dropDownListEstados=&dropDownListReservatorios={ID}&dataInicial={start_url}&dataFinal={end_url}&button=Buscar'
    try:
        # the timeout keeps a single unresponsive request from blocking the loop
        response = requests.get(url_reservoir, timeout=120)
    except requests.RequestException as error:
        # a network failure for one reservoir must not abort the whole download
        print(f'Data for reservoir {ID} could not be retrieved: {url_reservoir} ({error})')
        continue
    if not response.ok:
        print(f'Data for reservoir {ID} could not be retrieved: {url_reservoir}')
        continue

    # parse HTML response
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        print(f'No table found for reservoir {ID}: {url_reservoir}')
        continue

    # convert to pandas.DataFrame
    # (wrapped in StringIO because passing literal HTML to read_html is deprecated)
    data = pd.read_html(StringIO(str(table)))[0]

    # translate columns
    data.rename(columns=rename_cols, inplace=True)

    if data.empty:
        print(f'No records found for reservoir {ID}: {url_reservoir}')
        continue

    # assign attributes to its table
    for col in ['state', 'capacity_mcm']:
        values = data[col].dropna().unique()
        if len(values) > 1:
            print(f'Reservoir {ID} has {len(values)} different values of "{col}"; the first one is kept')
        if len(values) > 0:
            # store a scalar: a whole array cannot be assigned to a single cell
            reservoirs.loc[ID, col] = values[0]

    # remove redundant info
    data.drop(['name', 'state', 'capacity_mcm'], axis=1, inplace=True)

    # set a date index
    data['date'] = pd.to_datetime(data['date'], format='%d/%m/%Y')
    data.set_index('date', inplace=True)

    # export
    data.to_csv(PATH_TS / f'{ID}.csv')

  0%|          | 0/542 [00:00<?, ?it/s]

In [None]:
# save the reservoir table (one row per reservoir ID) as a CSV file
# in the attributes folder of the selected system
PATH_ATTRS = PATH.joinpath('attributes', SYSTEM)
PATH_ATTRS.mkdir(exist_ok=True, parents=True)
attributes_file = PATH_ATTRS / 'reservoirs.csv'
reservoirs.to_csv(attributes_file)