# Update PVPC parsing engine

The old version of the XML file parser was implemented with `BeautifulSoup`, which is far more powerful and complex than the task at hand requires.

Also, because of this dependency, it can be seen as a _web scraper_, and web scrapers are now discouraged from being included as new integrations in **HomeAssistant Core**.


### Objective

Make a simpler parser for the XML files downloaded from `https://api.esios.ree.es/archives/80/download?date=`, removing the `bs4` dependency and using **`xmltodict`**, which is already in use in some HA integrations.


#### Dependencies for this notebook

- `requests`
- `dateutil`
- `pytz`
- `xmltodict` (new xml parser)
- `bs4` and `html5lib` (for the old parser)

In [1]:
from datetime import date
from typing import List, Tuple

from dateutil.parser import parse
from pytz import timezone

import requests
import xmltodict

# Timezone used to convert the UTC start of the file horizon into a local date
TZ = timezone("Europe/Madrid")
# URL template of the daily archive; `day` is formatted as YYYY-MM-DD
_RESOURCE = "https://api.esios.ree.es/archives/80/download?date={day:%Y-%m-%d}"

# Get a file sample with today prices.
# A timeout avoids hanging the notebook on a stalled connection, and
# raise_for_status() stops an HTTP error page from being handed to the parsers.
_response = requests.get(_RESOURCE.format(day=date.today()), timeout=10)
_response.raise_for_status()
xml_data = _response.text
print(xml_data[:600])

<PVPCDesgloseHorario xmlns="http://sujetos.esios.ree.es/schemas/2014/04/01/PVPCDesgloseHorario-esios-MP/">
<IdentificacionMensaje v="pvpcdesglosehorario_20200221"/>
<VersionMensaje v="1"/>
<TipoMensaje v="Z55"/>
<TipoProceso v="A01"/>
<TipoClasificacion v="A01"/>
<IdentificacionRemitente codificacion="A01" v="10XES-REE------E"/>
<FuncionRemitente v="A04"/>
<IdentificacionDestinatario codificacion="A01" v="10XES-REE------E"/>
<FuncionDestinatario v="A04"/>
<FechaHoraMensaje v="2020-02-20T19:50:32Z"/>                        
<Horizonte v="2020-02-20T23:00Z/2020-02-21T23:00Z"/>
<SeriesTemporales>


In [2]:
def extract_prices_for_tariff(
    xml_data: str, tz: timezone = TZ, tariff: int = 2
) -> Tuple[date, List[float]]:
    """
    PVPC xml data extractor.

    Extract hourly prices for the selected tariff from the xml daily file download
    of the official _Spain Electric Network_ (Red Eléctrica Española, REE)
    for the _Voluntary Price for Small Consumers_
    (Precio Voluntario para el Pequeño Consumidor, PVPC).

    Args:
        xml_data: Raw text of the daily XML file.
        tz: Timezone used to convert the start of the 'Horizonte' interval
            into the local date the prices belong to.
        tariff: Tariff number; the series is selected by the code "Z0<tariff>".

    Returns:
        A `(day, prices)` tuple, where `prices` holds the values of the matching
        series rounded to 5 decimals. `prices` is an empty list when no series
        matches, mirroring the behaviour of the old bs4-based parser.
    """

    def _attr_v(node, key):
        """Return the 'v' attribute of child element `key`, or None if absent."""
        child = node.get(key) if isinstance(node, dict) else None
        return child.get("@v") if isinstance(child, dict) else None

    data = xmltodict.parse(xml_data)['PVPCDesgloseHorario']

    # 'Horizonte' is a "start/end" pair; the start, converted to the requested
    # timezone (not the module-level default), gives the day of the prices.
    str_horiz = data['Horizonte']['@v']
    day: date = parse(str_horiz.split("/")[0]).astimezone(tz).date()

    # xmltodict yields a dict (not a list) when an element appears only once,
    # so normalise to a list before iterating.
    series = data.get('SeriesTemporales') or []
    if isinstance(series, dict):
        series = [series]

    tariff_id = f"Z0{tariff}"
    prices = next(
        (
            serie
            for serie in series
            if _attr_v(serie, 'TerminoCosteHorario') == "FEU"
            and _attr_v(serie, 'TipoPrecio') == tariff_id
        ),
        None,  # avoid leaking StopIteration when nothing matches
    )
    if prices is None:
        return day, []

    intervals = prices['Periodo']["Intervalo"]
    if isinstance(intervals, dict):
        intervals = [intervals]
    price_values = [round(float(pair["Ctd"]["@v"]), 5) for pair in intervals]
    return day, price_values


# Run the new xmltodict-based extractor on the downloaded sample (tariff 2)
# and display the resulting (day, prices) tuple
day_prices = extract_prices_for_tariff(xml_data, TZ, tariff=2)
day_prices

(datetime.date(2020, 2, 21),
 [0.0466,
  0.04465,
  0.04352,
  0.04384,
  0.0439,
  0.04544,
  0.04956,
  0.05336,
  0.05259,
  0.05307,
  0.05224,
  0.05208,
  0.11715,
  0.11673,
  0.1163,
  0.11633,
  0.11652,
  0.11908,
  0.12304,
  0.12859,
  0.13041,
  0.12941,
  0.05938,
  0.05647])

In [3]:
# OLD version
from bs4 import BeautifulSoup as Soup


def scrap_xml_official_pvpc_daily_prices(
    html_text: str, tz: timezone = TZ, tariff: int = 2
) -> Tuple[date, List[float]]:
    """
    Scrape XML file content to extract hourly prices for the selected tariff

    Using `bs4` with 'html5lib' parser; tags are looked up by their
    lowercased names.

    Args:
        html_text: Raw text of the daily XML file.
        tz: Timezone used to convert the start of the 'horizonte' interval
            into a local date.
        tariff: Tariff number; the series is selected by the code "Z0<tariff>".

    Returns:
        A `(day, prices)` tuple with the values rounded to 5 decimals;
        `prices` is an empty list when no series matches.
    """
    ident_tarifa = f"Z0{tariff}"
    ident_precio = "FEU"

    soup_pvpc = Soup(html_text, "html5lib")
    str_horiz = soup_pvpc.find_all("horizonte")[0]["v"]
    ts_st = parse(str_horiz.split("/")[0]).astimezone(tz).date()
    for serie in soup_pvpc.find_all("identificacionseriestemporales"):
        termino = serie.find_next("terminocostehorario")
        tipo = serie.find("tipoprecio")
        # Check that both tags exist *before* subscripting them; a missing tag
        # is returned as None and None["v"] would raise TypeError.
        if termino is None or tipo is None:
            continue
        if termino["v"] == ident_precio and tipo["v"] == ident_tarifa:
            values = [round(float(v["v"]), 5) for v in serie.find_all("ctd")]
            return ts_st, values
    return ts_st, []


# Run the old bs4-based parser on the same sample (tariff 2)
# and display the resulting (day, prices) tuple
day_prices_old = scrap_xml_official_pvpc_daily_prices(xml_data, TZ, tariff=2)
day_prices_old

(datetime.date(2020, 2, 21),
 [0.0466,
  0.04465,
  0.04352,
  0.04384,
  0.0439,
  0.04544,
  0.04956,
  0.05336,
  0.05259,
  0.05307,
  0.05224,
  0.05208,
  0.11715,
  0.11673,
  0.1163,
  0.11633,
  0.11652,
  0.11908,
  0.12304,
  0.12859,
  0.13041,
  0.12941,
  0.05938,
  0.05647])

In [4]:
# Comparison: both parsers must return the same (day, prices) tuple for the
# sample file; then time each parser on that same XML text
assert day_prices == day_prices_old
%timeit extract_prices_for_tariff(xml_data)
%timeit scrap_xml_official_pvpc_daily_prices(xml_data)

5.87 ms ± 52.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
68.7 ms ± 593 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
