Skip to content

Commit

Permalink
Added read_xml method, moved validate xml to its proper method. Added…
Browse files Browse the repository at this point in the history
… submission type compatibility and exception on error message.

Signed-off-by: javier.hernandez <javier.hernandez@meaningfuldata.eu>
  • Loading branch information
javihern98 committed May 17, 2024
1 parent a1ea23a commit 0c83742
Show file tree
Hide file tree
Showing 13 changed files with 533 additions and 6 deletions.
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@ python = "^3.9"
httpx = {version = "0.*", optional = true}
msgspec = "0.*"
lxml = {version = "5.*", optional = true}
xmltodict = {version = "0.*", optional = true}

[tool.poetry.extras]
fmr = ["httpx"]
xml = ["lxml"]
xml = ["lxml", "xmltodict"]

[tool.poetry.group.dev.dependencies]
darglint = "^1.8.1"
Expand All @@ -47,6 +48,8 @@ pytest-cov = "^4.0.0"
respx = "^0.20.2"
pyroma = "^4.2"
lxml-stubs = "^0.5.1"
types-xmltodict = "^0.13.0.3"


[tool.poetry.group.docs.dependencies]
sphinx = "^7.2.6"
Expand Down
3 changes: 0 additions & 3 deletions src/pysdmx/io/input_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,13 @@
from pathlib import Path
from typing import Tuple, Union

from pysdmx.io.xml.sdmx21.doc_validation import validate_doc


def __remove_bom(input_string: str) -> str:
    """Return ``input_string`` with every U+FEFF character removed.

    The byte order mark is dropped wherever it occurs in the text, not
    only when it is the first character.

    Args:
        input_string: Text that may contain U+FEFF characters.

    Returns:
        The same text without any U+FEFF character.
    """
    # Mapping an ordinal to None makes str.translate delete that character.
    bom_free = input_string.translate({0xFEFF: None})
    return bom_free


def __check_xml(infile: str) -> bool:
    """Tell whether ``infile`` starts with an XML declaration.

    When the first five characters are ``<?xml`` the text is handed to
    ``validate_doc`` before ``True`` is returned; any exception raised by
    that call propagates to the caller.

    Args:
        infile: Text content to inspect.

    Returns:
        True if the text starts with ``<?xml``, False otherwise.
    """
    # Guard clause: anything not opening with the XML declaration is
    # reported as "not XML" without being validated.
    if infile[:5] != "<?xml":
        return False

    validate_doc(infile)
    return True
Expand Down
16 changes: 16 additions & 0 deletions src/pysdmx/io/xml/enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Enumeration for the XML message types."""

from enum import Enum


class MessageType(Enum):
    """Kinds of SDMX-ML message.

    Each member identifies one type of message; the integer values are
    plain identifiers for the members.
    """

    # Data messages
    GenericDataSet = 1
    StructureSpecificDataSet = 2
    # Structural metadata message
    Structure = 3
    # Error message
    Error = 4
    # Submission message
    Submission = 5
203 changes: 203 additions & 0 deletions src/pysdmx/io/xml/sdmx21/__parsing_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
"""Parsing configuration for SDMX-ML 2.1 messages."""

# Base URI shared by the SDMX-ML 2.1 schema namespaces listed below.
SCHEMA_ROOT = "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/"

# Namespace URI -> prefix mapping handed to xmltodict through XML_OPTIONS.
# Per the xmltodict documentation, mapping a namespace to None drops it from
# the resulting keys, while mapping it to a string collapses it to that short
# prefix. Only the XMLSchema-instance namespace keeps a prefix here, so its
# names appear as "xsi:<name>" in the parsed dictionary.
NAMESPACES_21 = {
    SCHEMA_ROOT + "message": None,
    SCHEMA_ROOT + "common": None,
    SCHEMA_ROOT + "structure": None,
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    "http://www.w3.org/XML/1998/namespace": None,
    SCHEMA_ROOT + "data/structurespecific": None,
    SCHEMA_ROOT + "data/generic": None,
    SCHEMA_ROOT + "registry": None,
    "http://schemas.xmlsoap.org/soap/envelope/": None,
}

# Keyword arguments for xmltodict.parse.
XML_OPTIONS = {
    # Expand namespaces and rewrite them using the "namespaces" mapping.
    "process_namespaces": True,
    "namespaces": NAMESPACES_21,
    # Build plain dict objects for the parsed elements.
    "dict_constructor": dict,
    # Store XML attributes under their bare name instead of xmltodict's
    # default "@" prefix.
    "attr_prefix": "",
}

# String constants used as keys when reading the dictionary produced from an
# SDMX-ML 2.1 message.
# NOTE(review): the snake_case values (e.g. the *_LOW constants) do not look
# like XML names; presumably they are field names on the model side — confirm
# against the code that consumes them.

# Common
HEADER = "Header"
DATASET = "DataSet"
SERIES = "Series"
OBS = "Obs"
AGENCY_ID = "agencyID"
ID = "id"
VERSION = "version"

# Data messages (StructureSpecificData and GenericData)
STRSPE = "StructureSpecificData"
GENERIC = "GenericData"
SERIESKEY = "SeriesKey"
ATTRIBUTES = "Attributes"
VALUE = "Value"
OBS_DIM = "ObsDimension"
OBSVALUE = "ObsValue"
OBSKEY = "ObsKey"
# Header
STRREF = "structureRef"
STRUCTURE = "Structure"
STR_USAGE = "StructureUsage"
STRID = "structureID"
STRTYPE = "structure_type"
DIM_OBS = "dimensionAtObservation"
ALL_DIM = "AllDimensions"
REF = "Ref"
DATASET_ID = "DataSetID"

# SDMX Error handling
ERROR = "Error"
ERROR_MESSAGE = "ErrorMessage"
ERROR_CODE = "code"
ERROR_TEXT = "Text"

# SDMX Registry Interface handling
REG_INTERFACE = "RegistryInterface"
SUBMIT_STRUCTURE_RESPONSE = "SubmitStructureResponse"
SUBMISSION_RESULT = "SubmissionResult"
SUBMITTED_STRUCTURE = "SubmittedStructure"
MAINTAINABLE_OBJECT = "MaintainableObject"
ACTION = "action"
STATUS_MSG = "StatusMessage"
STATUS = "status"

# SOAP API handling
FAULT = "Fault"
FAULTCODE = "faultcode"
FAULTSTRING = "faultstring"


# Structures
# Common
NAME = "Name"
DESC = "Description"
LANG = "lang"
# Key used for the text content of a parsed element.
XML_TEXT = "#text"
# Both the "URL" and the "Url" spellings are declared for these two names.
STR_URL = "structureURL"
STR_URL_LOW = "structureUrl"
SER_URL = "serviceURL"
SER_URL_LOW = "serviceUrl"
# General
ANNOTATIONS = "Annotations"
STRUCTURES = "Structures"
ORGS = "OrganisationSchemes"
AGENCIES = "AgencyScheme"
CODELISTS = "Codelists"
CONCEPTS = "Concepts"
DSDS = "DataStructures"
DATAFLOWS = "Dataflows"
CONSTRAINTS = "Constraints"

# Individual
AGENCY = "Agency"
CL = "Codelist"
CODE = "Code"
CS = "ConceptScheme"
CS_LOW = "concept_scheme"
CON = "Concept"
DSD = "DataStructure"

# DSD components
DSD_COMPS = "DataStructureComponents"
CON_ID = "ConceptIdentity"
CON_ID_LOW = "concept_identity"
CON_ROLE = "ConceptRole"
CON_ROLE_LOW = "concept_role"
# Dimension
DIM_LIST = "DimensionList"
DIM_LIST_LOW = "dimension_list"
DIM = "Dimension"
TIME_DIM = "TimeDimension"
# Attribute
ATT_LIST = "AttributeList"
ATT_LIST_LOW = "attribute_list"
ATT = "Attribute"
ATT_REL = "AttributeRelationship"
AS_STATUS = "assignmentStatus"
# Measure
ME_LIST = "MeasureList"
ME_LIST_LOW = "measure_list"
MEASURE = "Measure"
PRIM_MEASURE = "PrimaryMeasure"
# Group Dimension
GROUP = "Group"
GROUP_DIM_LOW = "group_dimension_descriptor"
GROUP_DIM = "GroupDimension"
DIM_REF = "DimensionReference"

# Dataflows
DF = "Dataflow"

# Constraints
CON_CONS = "ContentConstraint"
CONS_ATT = "ConstraintAttachment"
CUBE_REGION = "CubeRegion"
CONTENT_REGION = "dataContentRegion"
KEY = "Key"
KEY_VALUE = "KeyValue"
DATA_KEY_SET = "DataKeySet"
DATA_KEY_SET_LOW = "dataKeySet"
INCLUDED = "isIncluded"
INCLUDE = "include"

# Annotation
ANNOTATION = "Annotation"
ANNOTATION_TITLE = "AnnotationTitle"
ANNOTATION_TYPE = "AnnotationType"
ANNOTATION_TEXT = "AnnotationText"
ANNOTATION_URL = "AnnotationURL"

# NOTE(review): lower-case names, presumably the targets the Annotation*
# elements above are mapped to — confirm against the annotation reader.
TITLE = "title"
TYPE_ = "type_"
TYPE = "type"
TEXT = "text"
URL = "url"
URN = "URN"

# Representation
CORE_REP = "CoreRepresentation"
CORE_REP_LOW = "core_representation"
LOCAL_REP = "LocalRepresentation"
LOCAL_REP_LOW = "local_representation"
ENUM = "Enumeration"
ENUM_FORMAT = "EnumerationFormat"
TEXT_FORMAT = "TextFormat"

# Facets
FACETS = "facets"
TEXT_TYPE = "textType"
TEXT_TYPE_LOW = "text_type"

# Contact
CONTACT = "Contact"
DEPARTMENT = "Department"
ROLE = "Role"
URI = "URI"
EMAIL = "Email"
X400 = "X400"
TELEPHONE = "Telephone"
FAX = "Fax"

# Extras
MAINTAINER = "maintainer"
XMLNS = "xmlns"
COMPS = "components"
PARENT = "Parent"
PAR_ID = "maintainableParentID"
PAR_VER = "maintainableParentVersion"
REL_TO = "relatedTo"
NO_REL = "NoSpecifiedRelationship"

# To exclude from attached_attributes
# (the "action" literal has the same value as the ACTION constant above)
exc_attributes = [STRREF, "action", "dataScope", "xsi:type", SERIES, OBS]

# Content types
# These repeat the strings already bound to DSDS, DATAFLOWS, CODELISTS,
# CONCEPTS and ORGS above.
DATASTRUCTURES_CM = "DataStructures"
DATAFLOWS_CM = "Dataflows"
CODELISTS_CM = "Codelists"
CONCEPTS_CM = "Concepts"
ORGANISATIONSCHEMES_CM = "OrganisationSchemes"
98 changes: 98 additions & 0 deletions src/pysdmx/io/xml/sdmx21/reader/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,99 @@
"""SDMX 2.1 reader package."""

from typing import Any, Dict, Optional

import xmltodict

from pysdmx.errors import ClientError
from pysdmx.io.xml.enums import MessageType
from pysdmx.io.xml.sdmx21.__parsing_config import (
ERROR,
ERROR_CODE,
ERROR_MESSAGE,
ERROR_TEXT,
GENERIC,
REG_INTERFACE,
STRSPE,
STRUCTURE,
XML_OPTIONS,
)
from pysdmx.io.xml.sdmx21.doc_validation import validate_doc
from pysdmx.io.xml.sdmx21.reader.submission_reader import (
handle_registry_interface,
)

# Top-level key expected in the parsed document for each message type,
# keyed by the integer value of the MessageType member.
MODES = {
    message_type.value: root_element
    for message_type, root_element in (
        (MessageType.GenericDataSet, GENERIC),
        (MessageType.StructureSpecificDataSet, STRSPE),
        (MessageType.Structure, STRUCTURE),
        (MessageType.Submission, REG_INTERFACE),
        (MessageType.Error, ERROR),
    )
}


def read_xml(
    infile: str,
    validate: bool = True,
    mode: Optional[MessageType] = None,
    use_dataset_id: bool = False,
) -> Dict[str, Any]:
    """Parse an SDMX-ML 2.1 document and build the objects it describes.

    Args:
        infile: SDMX-ML document; it is passed unchanged to
            ``validate_doc`` (when requested) and to ``xmltodict.parse``.
        validate: If True, ``validate_doc`` is called on ``infile`` before
            parsing.
        mode: Message type the document is expected to be. When given, the
            matching entry of ``MODES`` must be a top-level key of the
            parsed document.
        use_dataset_id: Forwarded to the object generation step.

    Returns:
        dict: Dictionary with the parsed data.

    Raises:
        ValueError: If ``mode`` is given and the document is not of that
            type, or if the parsed document cannot be converted.
    """
    if validate:
        validate_doc(infile)

    parsed = xmltodict.parse(
        infile, **XML_OPTIONS  # type: ignore[arg-type]
    )

    # Drop the local reference to the raw document once it has been parsed.
    del infile

    if mode is not None:
        expected_root = MODES[mode.value]
        if expected_root not in parsed:
            raise ValueError(
                f"Unable to parse sdmx file as {expected_root} file"
            )

    return __generate_sdmx_objects_from_xml(parsed, use_dataset_id)


def __generate_sdmx_objects_from_xml(
    dict_info: Dict[str, Any], use_dataset_id: bool = False
) -> Dict[str, Any]:
    """Turn a parsed SDMX-ML dictionary (xmltodict) into SDMX objects.

    Args:
        dict_info: XML dictionary (xmltodict).
        use_dataset_id: Use the dataset ID as the key in the resulting
            dictionary. Not read by the branches currently implemented.

    Returns:
        dict: The result of ``handle_registry_interface`` for a
        RegistryInterface message.

    Raises:
        ClientError: If the document is an Error message; built from the
            code and text of its ErrorMessage entry.
        ValueError: If the document is neither an Error nor a
            RegistryInterface message.
    """
    if ERROR in dict_info:
        error_message = dict_info[ERROR][ERROR_MESSAGE]
        # Both fields are looked up before the code is converted to int.
        code, text = error_message[ERROR_CODE], error_message[ERROR_TEXT]
        raise ClientError(int(code), text)

    # Structure messages are not handled yet; reserved for the metadata
    # read (#39).

    if REG_INTERFACE not in dict_info:
        raise ValueError("Cannot parse this sdmx data")

    return handle_registry_interface(dict_info)
39 changes: 39 additions & 0 deletions src/pysdmx/io/xml/sdmx21/reader/submission_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Read SDMX-ML submission messages."""

from typing import Any, Dict

from pysdmx.io.xml.sdmx21.__parsing_config import (
ACTION,
MAINTAINABLE_OBJECT,
REG_INTERFACE,
STATUS,
STATUS_MSG,
SUBMISSION_RESULT,
SUBMIT_STRUCTURE_RESPONSE,
SUBMITTED_STRUCTURE,
URN,
)
from pysdmx.model.submission import SubmissionResult
from pysdmx.util import parse_urn


def handle_registry_interface(dict_info: Dict[str, Any]) -> Dict[str, Any]:
    """Handle the Registry Interface message.

    Reads every SubmissionResult of the SubmitStructureResponse and builds
    one ``SubmissionResult`` object per submitted structure.

    Args:
        dict_info: Dictionary with the parsed data; it must contain the
            RegistryInterface entry.

    Returns:
        dict: ``SubmissionResult`` objects keyed by the full id parsed
        from the URN of each submitted structure.
    """
    response = dict_info[REG_INTERFACE][SUBMIT_STRUCTURE_RESPONSE]

    submission_results = response[SUBMISSION_RESULT]
    # xmltodict only builds a list when an element is repeated; a response
    # with a single SubmissionResult comes back as one dict, so wrap it to
    # avoid iterating over that dict's keys.
    if not isinstance(submission_results, list):
        submission_results = [submission_results]

    result: Dict[str, Any] = {}
    for submission_result in submission_results:
        structure = submission_result[SUBMITTED_STRUCTURE]
        action = structure[ACTION]
        urn = structure[MAINTAINABLE_OBJECT][URN]
        full_id = parse_urn(urn).full_id
        status = submission_result[STATUS_MSG][STATUS]
        result[full_id] = SubmissionResult(action, full_id, status)
    return result
Loading

0 comments on commit 0c83742

Please sign in to comment.