From 6c677c2c1a90beb399940bb130e7b802a8a22a08 Mon Sep 17 00:00:00 2001 From: "javier.hernandez" Date: Wed, 22 May 2024 16:58:42 +0200 Subject: [PATCH 1/3] Added metadata read parsers (pending Representation, DSD and DataFlow) Signed-off-by: javier.hernandez --- src/pysdmx/io/xml/sdmx21/reader/__init__.py | 1 + src/pysdmx/io/xml/sdmx21/reader/__utils.py | 135 ++++++++ .../io/xml/sdmx21/reader/metadata_read.py | 324 ++++++++++++++++++ 3 files changed, 460 insertions(+) create mode 100644 src/pysdmx/io/xml/sdmx21/reader/__init__.py create mode 100644 src/pysdmx/io/xml/sdmx21/reader/__utils.py create mode 100644 src/pysdmx/io/xml/sdmx21/reader/metadata_read.py diff --git a/src/pysdmx/io/xml/sdmx21/reader/__init__.py b/src/pysdmx/io/xml/sdmx21/reader/__init__.py new file mode 100644 index 0000000..712b116 --- /dev/null +++ b/src/pysdmx/io/xml/sdmx21/reader/__init__.py @@ -0,0 +1 @@ +"""SDMX 2.1 XML reader module.""" diff --git a/src/pysdmx/io/xml/sdmx21/reader/__utils.py b/src/pysdmx/io/xml/sdmx21/reader/__utils.py new file mode 100644 index 0000000..a592e16 --- /dev/null +++ b/src/pysdmx/io/xml/sdmx21/reader/__utils.py @@ -0,0 +1,135 @@ +"""Utility functions and constants for the parsers module.""" + +from typing import Any, Dict, List + +# Common +ID = "ID" +AGENCY_ID = "agencyID" +XMLNS = "xmlns" +VERSION = "version" + +# Structure Specific +VALUE = "Value" + +# Header +REF = "Ref" + +# Structures +# Common +NAME = "Name" +DESC = "Description" +STR_URL = "structureURL" +STR_URL_LOW = "structureUrl" +SER_URL = "serviceURL" +SER_URL_LOW = "serviceUrl" +# General +ANNOTATIONS = "Annotations" + +# Individual +CL = "Codelist" +CON = "Concept" + +# Dimension +DIM = "Dimension" + +# Measure +PRIM_MEASURE = "PrimaryMeasure" + +# Group Dimension +GROUP = "Group" +DIM_REF = "DimensionReference" + +# Constraints +KEY_VALUE = "KeyValue" + +# Annotation +ANNOTATION = "Annotation" +ANNOTATION_TITLE = "AnnotationTitle" +ANNOTATION_TYPE = "AnnotationType" +ANNOTATION_TEXT = "AnnotationText" +ANNOTATION_URL = "AnnotationURL" + +TITLE = "title" +TEXT = "text" +TYPE = "type" +URL = "url" + +# Facets +FACETS = "facets" +TEXT_TYPE = "textType" +TEXT_TYPE_LOW = "text_type" + +# Contact +CONTACT = "Contact" +DEPARTMENT = "Department" +ROLE = "Role" +URIS = "URIs" +EMAILS = "Emails" +TELEPHONES = "Telephones" +FAXES = "Faxes" +URI = "URI" +EMAIL = "Email" +X400 = "X400" +TELEPHONE = "Telephone" +FAX = "Fax" + +# Extras +AGENCY = "agency" +PAR_ID = "maintainableParentID" +PAR_VER = "maintainableParentVersion" + +# Errors +missing_rep: Dict[str, List[Any]] = {"CON": [], "CS": [], "CL": []} +dsd_id: str = "" + +# Structure types +AGENCIES = "AgencyScheme" +ORGS = "OrganisationSchemes" +CLS = "Codelists" +CONCEPTS = "ConceptSchemes" +CS = "ConceptScheme" +CODE = "Code" + +FacetType = [ + "min_length", + "max_length", + "min_value", + "max_value", + "start_value", + "end_value", + "interval", + "time_interval", + "decimals", + "pattern", + "start_time", + "end_time", + "is_sequence", +] + + +def unique_id(agencyID: str, id_: str, version: str) -> str: + """Create a unique ID for an object. + + Args: + agencyID: The agency ID + id_: The ID of the object + version: The version of the object + + Returns: + A string with the unique ID + """ + return f"{agencyID}:{id_}({version})" + + +def add_list(element: Any) -> List[Any]: + """Make sure an element is a list and convert it if it is not. + + Args: + element: The element to be converted + + Returns: + A list with the element + """ + if not isinstance(element, list): + element = [element] + return element diff --git a/src/pysdmx/io/xml/sdmx21/reader/metadata_read.py b/src/pysdmx/io/xml/sdmx21/reader/metadata_read.py new file mode 100644 index 0000000..2589a1a --- /dev/null +++ b/src/pysdmx/io/xml/sdmx21/reader/metadata_read.py @@ -0,0 +1,324 @@ +"""Parsers for reading metadata.""" + +from typing import Any, Dict + +from msgspec import Struct + +from pysdmx.io.xml.sdmx21.reader.__utils import ( + add_list, + AGENCIES, + AGENCY, + AGENCY_ID, + ANNOTATION, + ANNOTATION_TEXT, + ANNOTATION_TITLE, + ANNOTATION_TYPE, + ANNOTATION_URL, + ANNOTATIONS, + CL, + CLS, + CODE, + CON, + CONTACT, + CS, + DEPARTMENT, + DESC, + EMAIL, + EMAILS, + FACETS, + FacetType, + FAX, + FAXES, + ID, + missing_rep, + NAME, + PAR_ID, + PAR_VER, + ROLE, + SER_URL, + SER_URL_LOW, + STR_URL, + STR_URL_LOW, + TELEPHONE, + TELEPHONES, + TEXT, + TEXT_TYPE, + TEXT_TYPE_LOW, + TITLE, + TYPE, + unique_id, + URI, + URIS, + URL, + VERSION, + XMLNS, +) +from pysdmx.model import Code, Codelist, Concept, ConceptScheme, Facets +from pysdmx.model.__base import Agency, Annotation, Contact, Item, ItemScheme +from pysdmx.model.message import CONCEPTS, ORGS + +SCHEMES_CLASSES = {CL: Codelist, AGENCIES: ItemScheme, CS: ConceptScheme} +ITEMS_CLASSES = {AGENCY: Agency, CODE: Code, CON: Concept} + + +class StructureParser(Struct): + """StructureParser class for SDMX-ML 2.1.""" + + agencies: Dict[str, Any] + codelists: Dict[str, Any] + concepts: Dict[str, Any] + datastructures: Dict[str, Any] + dataflows: Dict[str, Any] + + @staticmethod + def __format_contact(json_contact: Dict[str, Any]) -> Contact: + """Creates a Contact object from a json_contact. + + Args: + json_contact: The element to create the Contact object from + + Returns: + Contact object created from the json_contact + """ + xml_node_to_attribute = { + NAME: NAME.lower(), + DEPARTMENT: DEPARTMENT.lower(), + ROLE: ROLE.lower(), + URI: URIS, + EMAIL: EMAILS, + TELEPHONE: TELEPHONES, + FAX: FAXES, + } + + for k, v in xml_node_to_attribute.items(): + if k in json_contact: + json_contact[v] = add_list(json_contact.pop(k)) + + return Contact(**json_contact) + + @staticmethod + def __format_annotations(item_elem: Any) -> Dict[str, Any]: + """Formats the annotations in the item_elem. + + Args: + item_elem: The element to be formatted + + Returns: + annotations formatted + """ + if ANNOTATIONS not in item_elem: + return item_elem + annotations = [] + + ann = item_elem[ANNOTATIONS] + if ANNOTATION not in ann: + item_elem[ANNOTATIONS.lower()] = [] + del item_elem[ANNOTATIONS] + return item_elem + + ann[ANNOTATION] = add_list(ann[ANNOTATION]) + for e in ann[ANNOTATION]: + if ANNOTATION_TITLE in e: + e[TITLE] = e.pop(ANNOTATION_TITLE) + if ANNOTATION_TYPE in e: + e[TYPE] = e.pop(ANNOTATION_TYPE) + if ANNOTATION_TEXT in e: + e[TEXT] = e[ANNOTATION_TEXT] + del e[ANNOTATION_TEXT] + if ANNOTATION_URL in e: + e[URL] = e.pop(ANNOTATION_URL) + + annotations.append(Annotation(**e)) + + item_elem[ANNOTATIONS.lower()] = annotations + del item_elem[ANNOTATIONS] + + return item_elem + + @staticmethod + def __format_name_description(element: Any) -> Dict[str, Any]: + node = [NAME, DESC] + for e in node: + if e in element: + element[e.lower()] = element[e] + del element[e] + return element + + @staticmethod + def __format_facets(json_fac: Dict[str, Any]) -> Dict[str, Any]: + """Formats the facets in the json_fac. + + Args: + json_fac: The element with the facets to be formatted + + Returns: + facets formatted + """ + fac: Dict[str, Any] = {FACETS: []} + if json_fac is None: + return fac + if TEXT_TYPE in json_fac: + fac[TEXT_TYPE_LOW] = json_fac.pop(TEXT_TYPE) + for key, _value in json_fac.items(): + if key in FacetType: + facet_kwargs = { + k: v for k, v in json_fac.items() if k in FacetType + } + fac[FACETS].append(Facets(**facet_kwargs)) + + return fac + + @staticmethod + def __format_urls(json_elem: Dict[str, Any]) -> Dict[str, Any]: + """Formats the STR_URL and SER_URL keys in the element. + + Args: + json_elem: The element to be formatted + + Returns: + The json_elem with STR_URL and SER_URL keys formatted. + """ + if STR_URL in json_elem: + json_elem[STR_URL_LOW] = json_elem.pop(STR_URL) + if SER_URL in json_elem: + json_elem[SER_URL_LOW] = json_elem.pop(SER_URL) + return json_elem + + def __format_agency(self, element: Dict[str, Any]) -> Dict[str, Any]: + """Formats the AGENCY_ID key in the element to the maintainer. + + Args: + element: The element with the Agency ID to be formatted + + Returns: + element with the Agency ID formatted + """ + element[AGENCY] = self.agencies.get(element[AGENCY_ID]) + del element[AGENCY_ID] + return element + + def __format_con_id(self, json_ref: Dict[str, Any]) -> Dict[str, Any]: + """Formats the Concept key on element to have a trailing underscore. + + Args: + json_ref: The element to be formatted + + Returns: + json_ref with Concept key formatted + """ + rep = {} + full_cs_id = unique_id( + json_ref[AGENCY_ID], json_ref[PAR_ID], json_ref[PAR_VER] + ) + if full_cs_id in self.concepts: + if json_ref[ID] in self.concepts[full_cs_id]["items"]: + rep[CON] = self.concepts[full_cs_id]["items"][json_ref[ID]] + core_rep = self.concepts[full_cs_id]["items"][json_ref[ID]][ + "core_representation" + ] + if core_rep is not None: + cl = core_rep["codelist"] + if cl is not None: + rep[CL.lower()] = cl + elif json_ref[ID] not in missing_rep["CON"]: + missing_rep["CON"].append(json_ref[ID]) + + elif full_cs_id not in missing_rep["CS"]: + missing_rep["CS"].append(full_cs_id) + + return rep + + def __format_orgs(self, json_orgs: Dict[str, Any]) -> Dict[str, Any]: + orgs: Dict[str, Any] = {} + if AGENCIES in json_orgs: + if len(json_orgs) == 1 and isinstance(json_orgs[AGENCIES], dict): + ag_sch = self.__format_scheme(json_orgs, AGENCIES, AGENCY) + return ag_sch + for e in json_orgs[AGENCIES]: + ag_sch = self.__format_scheme(e, AGENCIES, AGENCY) + orgs = {**orgs, **ag_sch} + return orgs + + def __format_item( + self, item_json_info: Dict[str, Any], item_name_class: str + ) -> Item: + if XMLNS in item_json_info: + del item_json_info[XMLNS] + + item_json_info = self.__format_annotations(item_json_info) + item_json_info = self.__format_name_description(item_json_info) + + if CONTACT in item_json_info and item_name_class == AGENCY: + item_json_info[CONTACT] = add_list(item_json_info[CONTACT]) + contacts = [] + for e in item_json_info[CONTACT]: + contacts.append(self.__format_contact(e)) + item_json_info[CONTACT.lower() + "s"] = contacts + del item_json_info[CONTACT] + + # if CORE_REP in item_json_info and item_name_class == CON: + # item_json_info[CORE_REP_LOW] = format_representation( + # item_json_info[CORE_REP]) + # del item_json_info[CORE_REP] + + return ITEMS_CLASSES[item_name_class](**item_json_info) + + def __format_scheme( + self, json_elem: Dict[str, Any], scheme: str, item: str + ) -> Dict[str, Any]: + elements: Dict[str, Any] = {} + if scheme not in json_elem: + return elements + + json_elem[scheme] = add_list(json_elem[scheme]) + for element in json_elem[scheme]: + + if XMLNS in element: + del element[XMLNS] + + element = self.__format_annotations(element) + element = self.__format_name_description(element) + full_id = unique_id( + element[AGENCY_ID], element[ID], element[VERSION] + ) + element = self.__format_urls(element) + element = self.__format_agency(element) + if item in element: + element[item] = add_list(element[item]) + items = [] + for item_elem in element[item]: + # Dynamic + items.append(self.__format_item(item_elem, item)) + del element[item] + element["items"] = items + if scheme == AGENCIES: + self.agencies.update({e.id: e for e in items}) + else: + element["items"] = [] + # Dynamic creation with specific class + elements[full_id] = SCHEMES_CLASSES[scheme](**element) + + return elements + + def format_structures(self, json_meta: Dict[str, Any]) -> Dict[str, Any]: + """Formats the structures in json format. + + Args: + json_meta: The structures in json format + + Returns: + A dictionary with the structures formatted + """ + # Reset dict to store metadata + structures = {} + + if ORGS in json_meta: + structures[ORGS] = self.__format_orgs(json_meta[ORGS]) + if CLS in json_meta: + structures[CLS] = self.__format_scheme(json_meta[CLS], CL, CODE) + if CONCEPTS in json_meta: + structures[CONCEPTS] = self.__format_scheme( + json_meta[CONCEPTS], CS, CON + ) + # Reset global variables + return structures From 668300fe7ddd4c6fb8c568d0c68304e41a82438b Mon Sep 17 00:00:00 2001 From: "javier.hernandez" Date: Mon, 24 Jun 2024 18:57:18 +0200 Subject: [PATCH 2/3] Added metadata readers for Item and ItemScheme. Signed-off-by: javier.hernandez --- src/pysdmx/io/xml/sdmx21/reader/__init__.py | 9 +- src/pysdmx/io/xml/sdmx21/reader/__utils.py | 52 +- .../io/xml/sdmx21/reader/metadata_read.py | 223 ++++--- src/pysdmx/model/message.py | 2 +- src/pysdmx/util/__init__.py | 14 +- .../io/xml/sdmx21/reader/samples/agencies.xml | 24 + .../xml/sdmx21/reader/samples/codelists.xml | 340 ++++++++++ .../xml/sdmx21/reader/samples/item_scheme.xml | 626 ++++++++++++++++++ tests/io/xml/sdmx21/reader/test_reader.py | 85 +++ tests/model/test_message.py | 8 +- 10 files changed, 1245 insertions(+), 138 deletions(-) create mode 100644 tests/io/xml/sdmx21/reader/samples/agencies.xml create mode 100644 tests/io/xml/sdmx21/reader/samples/codelists.xml create mode 100644 tests/io/xml/sdmx21/reader/samples/item_scheme.xml diff --git a/src/pysdmx/io/xml/sdmx21/reader/__init__.py b/src/pysdmx/io/xml/sdmx21/reader/__init__.py index 4006971..3cf44f0 100644 --- a/src/pysdmx/io/xml/sdmx21/reader/__init__.py +++ b/src/pysdmx/io/xml/sdmx21/reader/__init__.py @@ -15,9 +15,11 @@ REG_INTERFACE, STRSPE, STRUCTURE, + STRUCTURES, XML_OPTIONS, ) from pysdmx.io.xml.sdmx21.doc_validation import validate_doc +from pysdmx.io.xml.sdmx21.reader.metadata_read import StructureParser from pysdmx.io.xml.sdmx21.reader.submission_reader import ( handle_registry_interface, ) @@ -91,9 +93,10 @@ def __generate_sdmx_objects_from_xml( code = dict_info[ERROR][ERROR_MESSAGE][ERROR_CODE] text = dict_info[ERROR][ERROR_MESSAGE][ERROR_TEXT] raise ClientError(int(code), text) - # Leaving this commented for metadata read (#39) - # if STRUCTURE in dict_info: - # return create_structures(dict_info[STRUCTURE][STRUCTURES]) + if STRUCTURE in dict_info: + return StructureParser().format_structures( + dict_info[STRUCTURE][STRUCTURES] + ) if REG_INTERFACE in dict_info: return handle_registry_interface(dict_info) raise ValueError("Cannot parse this sdmx data") diff --git a/src/pysdmx/io/xml/sdmx21/reader/__utils.py b/src/pysdmx/io/xml/sdmx21/reader/__utils.py index a592e16..663516d 100644 --- a/src/pysdmx/io/xml/sdmx21/reader/__utils.py +++ b/src/pysdmx/io/xml/sdmx21/reader/__utils.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List # Common -ID = "ID" +ID = "id" AGENCY_ID = "agencyID" XMLNS = "xmlns" VERSION = "version" @@ -19,9 +19,15 @@ NAME = "Name" DESC = "Description" STR_URL = "structureURL" -STR_URL_LOW = "structureUrl" +STR_URL_LOW = "structure_url" SER_URL = "serviceURL" -SER_URL_LOW = "serviceUrl" +SER_URL_LOW = "service_url" +IS_EXTERNAL_REF = "isExternalReference" +IS_EXTERNAL_REF_LOW = "is_external_reference" +IS_FINAL = "isFinal" +IS_FINAL_LOW = "is_final" +IS_PARTIAL = "isPartial" +IS_PARTIAL_LOW = "is_partial" # General ANNOTATIONS = "Annotations" @@ -63,10 +69,10 @@ CONTACT = "Contact" DEPARTMENT = "Department" ROLE = "Role" -URIS = "URIs" -EMAILS = "Emails" -TELEPHONES = "Telephones" -FAXES = "Faxes" +URIS = "uris" +EMAILS = "emails" +TELEPHONES = "telephones" +FAXES = "faxes" URI = "URI" EMAIL = "Email" X400 = "X400" @@ -74,7 +80,7 @@ FAX = "Fax" # Extras -AGENCY = "agency" +AGENCY = "Agency" PAR_ID = "maintainableParentID" PAR_VER = "maintainableParentVersion" @@ -90,21 +96,21 @@ CS = "ConceptScheme" CODE = "Code" -FacetType = [ - "min_length", - "max_length", - "min_value", - "max_value", - "start_value", - "end_value", - "interval", - "time_interval", - "decimals", - "pattern", - "start_time", - "end_time", - "is_sequence", -] +FacetType = { + "minLength": "min_length", + "maxLength": "max_length", + "minValue": "min_value", + "maxValue": "max_value", + "startValue": "start_value", + "endValue": "end_value", + "interval": "interval", + "timeInterval": "time_interval", + "decimals": "decimals", + "pattern": "pattern", + "startTime": "start_time", + "endTime": "end_time", + "isSequence": "is_sequence", +} def unique_id(agencyID: str, id_: str, version: str) -> str: diff --git a/src/pysdmx/io/xml/sdmx21/reader/metadata_read.py b/src/pysdmx/io/xml/sdmx21/reader/metadata_read.py index 2589a1a..76d2f04 100644 --- a/src/pysdmx/io/xml/sdmx21/reader/metadata_read.py +++ b/src/pysdmx/io/xml/sdmx21/reader/metadata_read.py @@ -4,6 +4,7 @@ from msgspec import Struct +from pysdmx.io.xml.sdmx21.__parsing_config import CORE_REP, URN from pysdmx.io.xml.sdmx21.reader.__utils import ( add_list, AGENCIES, @@ -30,10 +31,13 @@ FAX, FAXES, ID, - missing_rep, + IS_EXTERNAL_REF, + IS_EXTERNAL_REF_LOW, + IS_FINAL, + IS_FINAL_LOW, + IS_PARTIAL, + IS_PARTIAL_LOW, NAME, - PAR_ID, - PAR_VER, ROLE, SER_URL, SER_URL_LOW, @@ -43,7 +47,6 @@ TELEPHONES, TEXT, TEXT_TYPE, - TEXT_TYPE_LOW, TITLE, TYPE, unique_id, @@ -51,11 +54,18 @@ URIS, URL, VERSION, - XMLNS, ) -from pysdmx.model import Code, Codelist, Concept, ConceptScheme, Facets +from pysdmx.model import ( + Code, + Codelist, + Concept, + ConceptScheme, + DataType, + Facets, +) from pysdmx.model.__base import Agency, Annotation, Contact, Item, ItemScheme from pysdmx.model.message import CONCEPTS, ORGS +from pysdmx.util import find_by_urn SCHEMES_CLASSES = {CL: Codelist, AGENCIES: ItemScheme, CS: ConceptScheme} ITEMS_CLASSES = {AGENCY: Agency, CODE: Code, CON: Concept} @@ -64,14 +74,26 @@ class StructureParser(Struct): """StructureParser class for SDMX-ML 2.1.""" - agencies: Dict[str, Any] - codelists: Dict[str, Any] - concepts: Dict[str, Any] - datastructures: Dict[str, Any] - dataflows: Dict[str, Any] + agencies: Dict[str, Any] = {} + codelists: Dict[str, Any] = {} + concepts: Dict[str, Any] = {} + datastructures: Dict[str, Any] = {} + dataflows: Dict[str, Any] = {} - @staticmethod - def __format_contact(json_contact: Dict[str, Any]) -> Contact: + def __extract_text(self, element: Any) -> str: + """Extracts the text from the element. + + Args: + element: The element to extract the text from + + Returns: + The text extracted from the element + """ + if isinstance(element, dict) and "#text" in element: + element = element["#text"] + return element + + def __format_contact(self, json_contact: Dict[str, Any]) -> Contact: """Creates a Contact object from a json_contact. Args: @@ -80,6 +102,8 @@ def __format_contact(json_contact: Dict[str, Any]) -> Contact: Returns: Contact object created from the json_contact """ + self.__format_name_description(json_contact) + xml_node_to_attribute = { NAME: NAME.lower(), DEPARTMENT: DEPARTMENT.lower(), @@ -92,12 +116,17 @@ def __format_contact(json_contact: Dict[str, Any]) -> Contact: for k, v in xml_node_to_attribute.items(): if k in json_contact: - json_contact[v] = add_list(json_contact.pop(k)) + if k in [DEPARTMENT, ROLE]: + json_contact[v] = self.__extract_text(json_contact.pop(k)) + continue + field_info = add_list(json_contact.pop(k)) + for i, element in enumerate(field_info): + field_info[i] = self.__extract_text(element) + json_contact[v] = field_info return Contact(**json_contact) - @staticmethod - def __format_annotations(item_elem: Any) -> Dict[str, Any]: + def __format_annotations(self, item_elem: Any) -> Dict[str, Any]: """Formats the annotations in the item_elem. Args: @@ -111,11 +140,6 @@ def __format_annotations(item_elem: Any) -> Dict[str, Any]: annotations = [] ann = item_elem[ANNOTATIONS] - if ANNOTATION not in ann: - item_elem[ANNOTATIONS.lower()] = [] - del item_elem[ANNOTATIONS] - return item_elem - ann[ANNOTATION] = add_list(ann[ANNOTATION]) for e in ann[ANNOTATION]: if ANNOTATION_TITLE in e: @@ -123,7 +147,7 @@ def __format_annotations(item_elem: Any) -> Dict[str, Any]: if ANNOTATION_TYPE in e: e[TYPE] = e.pop(ANNOTATION_TYPE) if ANNOTATION_TEXT in e: - e[TEXT] = e[ANNOTATION_TEXT] + e[TEXT] = self.__extract_text(e[ANNOTATION_TEXT]) del e[ANNOTATION_TEXT] if ANNOTATION_URL in e: e[URL] = e.pop(ANNOTATION_URL) @@ -135,38 +159,35 @@ def __format_annotations(item_elem: Any) -> Dict[str, Any]: return item_elem - @staticmethod - def __format_name_description(element: Any) -> Dict[str, Any]: + def __format_name_description(self, element: Any) -> Dict[str, Any]: node = [NAME, DESC] - for e in node: - if e in element: - element[e.lower()] = element[e] - del element[e] + for field in node: + if field in element: + element[field.lower()] = self.__extract_text(element[field]) + del element[field] return element @staticmethod - def __format_facets(json_fac: Dict[str, Any]) -> Dict[str, Any]: - """Formats the facets in the json_fac. + def __format_facets( + json_fac: Dict[str, Any], json_obj: Dict[str, Any] + ) -> None: + """Formats the facets in the json_fac to be stored in json_obj. Args: json_fac: The element with the facets to be formatted - - Returns: - facets formatted + json_obj: The element to store the formatted facets """ - fac: Dict[str, Any] = {FACETS: []} - if json_fac is None: - return fac - if TEXT_TYPE in json_fac: - fac[TEXT_TYPE_LOW] = json_fac.pop(TEXT_TYPE) for key, _value in json_fac.items(): + if key == TEXT_TYPE and json_fac[TEXT_TYPE] in list(DataType): + json_obj["dtype"] = DataType(json_fac[TEXT_TYPE]) + if key in FacetType: facet_kwargs = { - k: v for k, v in json_fac.items() if k in FacetType + FacetType[k]: v + for k, v in json_fac.items() + if k in FacetType } - fac[FACETS].append(Facets(**facet_kwargs)) - - return fac + json_obj[FACETS] = Facets(**facet_kwargs) @staticmethod def __format_urls(json_elem: Dict[str, Any]) -> Dict[str, Any]: @@ -193,61 +214,45 @@ def __format_agency(self, element: Dict[str, Any]) -> Dict[str, Any]: Returns: element with the Agency ID formatted """ - element[AGENCY] = self.agencies.get(element[AGENCY_ID]) + element[AGENCY.lower()] = self.agencies.get( + element[AGENCY_ID], element[AGENCY_ID] + ) del element[AGENCY_ID] return element - def __format_con_id(self, json_ref: Dict[str, Any]) -> Dict[str, Any]: - """Formats the Concept key on element to have a trailing underscore. - - Args: - json_ref: The element to be formatted - - Returns: - json_ref with Concept key formatted - """ - rep = {} - full_cs_id = unique_id( - json_ref[AGENCY_ID], json_ref[PAR_ID], json_ref[PAR_VER] - ) - if full_cs_id in self.concepts: - if json_ref[ID] in self.concepts[full_cs_id]["items"]: - rep[CON] = self.concepts[full_cs_id]["items"][json_ref[ID]] - core_rep = self.concepts[full_cs_id]["items"][json_ref[ID]][ - "core_representation" - ] - if core_rep is not None: - cl = core_rep["codelist"] - if cl is not None: - rep[CL.lower()] = cl - elif json_ref[ID] not in missing_rep["CON"]: - missing_rep["CON"].append(json_ref[ID]) - - elif full_cs_id not in missing_rep["CS"]: - missing_rep["CS"].append(full_cs_id) - - return rep - def __format_orgs(self, json_orgs: Dict[str, Any]) -> Dict[str, Any]: orgs: Dict[str, Any] = {} - if AGENCIES in json_orgs: - if len(json_orgs) == 1 and isinstance(json_orgs[AGENCIES], dict): - ag_sch = self.__format_scheme(json_orgs, AGENCIES, AGENCY) - return ag_sch - for e in json_orgs[AGENCIES]: - ag_sch = self.__format_scheme(e, AGENCIES, AGENCY) - orgs = {**orgs, **ag_sch} + json_orgs = add_list(json_orgs) # type: ignore[assignment] + for e in json_orgs: + ag_sch = self.__format_scheme( + e, AGENCIES, AGENCY # type: ignore[arg-type] + ) + orgs = {**orgs, **ag_sch} return orgs + def __format_representation( + self, json_rep: Dict[str, Any], json_obj: Dict[str, Any] + ) -> None: + """Formats the representation in the json_rep.""" + if "TextFormat" in json_rep: + self.__format_facets(json_rep["TextFormat"], json_obj) + + if ( + "Enumeration" in json_rep + and URN in json_rep["Enumeration"] + and len(self.codelists) > 0 + ): + codelist = find_by_urn( + list(self.codelists.values()), + json_rep["Enumeration"][URN], + ) + json_obj["codes"] = codelist.codes + def __format_item( self, item_json_info: Dict[str, Any], item_name_class: str ) -> Item: - if XMLNS in item_json_info: - del item_json_info[XMLNS] - item_json_info = self.__format_annotations(item_json_info) item_json_info = self.__format_name_description(item_json_info) - if CONTACT in item_json_info and item_name_class == AGENCY: item_json_info[CONTACT] = add_list(item_json_info[CONTACT]) contacts = [] @@ -256,10 +261,14 @@ def __format_item( item_json_info[CONTACT.lower() + "s"] = contacts del item_json_info[CONTACT] - # if CORE_REP in item_json_info and item_name_class == CON: - # item_json_info[CORE_REP_LOW] = format_representation( - # item_json_info[CORE_REP]) - # del item_json_info[CORE_REP] + if CORE_REP in item_json_info and item_name_class == CON: + self.__format_representation( + item_json_info[CORE_REP], item_json_info + ) + del item_json_info[CORE_REP] + + if "Parent" in item_json_info: + del item_json_info["Parent"] return ITEMS_CLASSES[item_name_class](**item_json_info) @@ -267,14 +276,10 @@ def __format_scheme( self, json_elem: Dict[str, Any], scheme: str, item: str ) -> Dict[str, Any]: elements: Dict[str, Any] = {} - if scheme not in json_elem: - return elements json_elem[scheme] = add_list(json_elem[scheme]) for element in json_elem[scheme]: - - if XMLNS in element: - del element[XMLNS] + element["items"] = [] element = self.__format_annotations(element) element = self.__format_name_description(element) @@ -282,19 +287,24 @@ def __format_scheme( element[AGENCY_ID], element[ID], element[VERSION] ) element = self.__format_urls(element) + if IS_EXTERNAL_REF in element: + element[IS_EXTERNAL_REF_LOW] = element.pop(IS_EXTERNAL_REF) + if IS_FINAL in element: + element[IS_FINAL_LOW] = element.pop(IS_FINAL) + if IS_PARTIAL in element: + element[IS_PARTIAL_LOW] = element.pop(IS_PARTIAL) + element[item] = add_list(element[item]) + items = [] + for item_elem in element[item]: + # Dynamic + items.append(self.__format_item(item_elem, item)) + del element[item] + element["items"] = items + if scheme == AGENCIES: + self.agencies.update({e.id: e for e in items}) + if scheme == CS: + self.concepts.update({e.id: e for e in items}) element = self.__format_agency(element) - if item in element: - element[item] = add_list(element[item]) - items = [] - for item_elem in element[item]: - # Dynamic - items.append(self.__format_item(item_elem, item)) - del element[item] - element["items"] = items - if scheme == AGENCIES: - self.agencies.update({e.id: e for e in items}) - else: - element["items"] = [] # Dynamic creation with specific class elements[full_id] = SCHEMES_CLASSES[scheme](**element) @@ -316,6 +326,7 @@ def format_structures(self, json_meta: Dict[str, Any]) -> Dict[str, Any]: structures[ORGS] = self.__format_orgs(json_meta[ORGS]) if CLS in json_meta: structures[CLS] = self.__format_scheme(json_meta[CLS], CL, CODE) + self.codelists = structures[CLS] if CONCEPTS in json_meta: structures[CONCEPTS] = self.__format_scheme( json_meta[CONCEPTS], CS, CON diff --git a/src/pysdmx/model/message.py b/src/pysdmx/model/message.py index 296ddc7..27f7d9a 100644 --- a/src/pysdmx/model/message.py +++ b/src/pysdmx/model/message.py @@ -50,7 +50,7 @@ class Header(Struct, frozen=True, kw_only=True): ORGS = "OrganisationSchemes" CLS = "Codelists" -CONCEPTS = "ConceptSchemes" +CONCEPTS = "Concepts" MSG_CONTENT_PKG = { ORGS: ItemScheme, diff --git a/src/pysdmx/util/__init__.py b/src/pysdmx/util/__init__.py index 0f10308..6645de3 100644 --- a/src/pysdmx/util/__init__.py +++ b/src/pysdmx/util/__init__.py @@ -6,6 +6,7 @@ from msgspec import Struct from pysdmx.errors import NotFound +from pysdmx.model import Agency NF = "Not found" @@ -89,7 +90,18 @@ def find_by_urn(artefacts: Sequence[Any], urn: str) -> Any: f = [ a for a in artefacts - if (a.agency == r.agency and a.id == r.id and a.version == r.version) + if ( + ( + a.agency == r.agency + or ( + a.agency.id == r.agency + if isinstance(a.agency, Agency) + else False + ) + ) + and a.id == r.id + and a.version == r.version + ) ] if f: return f[0] diff --git a/tests/io/xml/sdmx21/reader/samples/agencies.xml b/tests/io/xml/sdmx21/reader/samples/agencies.xml new file mode 100644 index 0000000..af6718b --- /dev/null +++ b/tests/io/xml/sdmx21/reader/samples/agencies.xml @@ -0,0 +1,24 @@ + + + + SDMX_COG + false + 2010-11-13T08:00:33+08:00 + + + + + + SDMX Agency Scheme + + SDMX + + + + + \ No newline at end of file diff --git a/tests/io/xml/sdmx21/reader/samples/codelists.xml b/tests/io/xml/sdmx21/reader/samples/codelists.xml new file mode 100644 index 0000000..4149879 --- /dev/null +++ b/tests/io/xml/sdmx21/reader/samples/codelists.xml @@ -0,0 +1,340 @@ + + + + SDMX_COG + false + 2010-11-13T08:00:33+08:00 + + + + + + Code list for Decimals (DECIMALS) + It provides a list of values showing the number of decimal digits used + in the data. + + + Zero + + + One + + + Two + + + Three + + + Four + + + Five + + + Six + + + Seven + + + Eight + + + Nine + + + + Code list for Frequency (FREQ) + It provides a list of values indicating the "frequency" of the data + (e.g. monthly) and, thus, indirectly, also implying the type of "time reference" that could be used + for identifying the data with respect time. + + + + + NOTE 1 + NOTE + + https://example.com/sdmx + + It is typically used for annual data. This can also + serve cases of multi-annual data (data that appear once every two, three or, possibly, + five years). Descriptive information on the multiannual characteristics (e.g. frequency + of the series in practice and other methodological information can be provided at the + dataflow level, as long as these characteristics are applicable for the entire + dataflow). + + + + Annual + + + + + Similar to "daily", however there are no observations + for Saturday and Sunday (so, neither "missing values" nor "numeric values" should be + provided for Saturday and Sunday). This treatment ("business") is one way to deal with + such cases, but it is not the only option. Such a time series could alternatively be + considered daily ("D"), thus, with missing values in the weekend. + + + + Daily - business week + + + Daily + + + Monthly + + + + + NOTE + + + Minutely + + + Quarterly + + + Half Yearly, semester + + + Weekly + + + + code list for Confidentiality Status (CONF_STATUS) + this code list provides coded information about the sensitivity and + confidentiality status of the data. + + + + + NOTE + Confidential statistical information (primary + confidentiality) due to identifiable respondents. Measures also should be taken to + prevent not only direct access, but also indirect deduction or calculation by other + users and parties, probably by considering and treating additional observations as + "confidential" (secondary confidentiality management). + + + + Confidential statistical information + + + + + NOTE + Used by the sender of the data to flag (beyond the + confidential statistical information) one or more additional observations of the dataset + so that the receiver knows that he/she should suppress these observations in subsequent + stages of processing (especially, dissemination) in order to prevent third parties to + indirectly deduct (e.g. through accounting identities or other formulas) the + observations that are genuinely flagged with "C". + + + + Secondary confidentiality set by the sender, not for publication + + + + + NOTE + It is used for observations for observations for which + there are no special sensitivity considerations and which can be freely shared. Please + note: In some institutional environments the term "unclassified" is used in a sense that + still denotes implied restrictions in the circulation of information. If this is the + case, this organisation may probably consider that "free" (value F) is not the + appropriate corresponding tag for this kind of "unclassified" category and it may be + that "non-publishable / restricted" (value N) may be more appropriate. The focus of this + code is not to describe and anticipate all potential permissible uses (e.g. permission + to re-disseminate). Thus, the use of "F" does not allow safe assumptions with respect to + the permission to "re-disseminate" (freely or at a price) the received or accessed + information (e.g. on web or paper), especially on a massive and regular basis (legal and + copyright constraints may apply). Usually, source organisations provide information and + guidance on re-dissemination issues, either on their websites or on their paper + publications. + + + + Free + + + + + NOTE + Used to denote observations that are restricted for + internal use only within organisations. + + + + Not for publication, restricted for internal use only + + + + + NOTE + If senders do not manage the secondary confidentiality + in their data and/or there are also other countries' data involved (with the intention + to eventually compile a regional-wide aggregate that is going to be published), the + value "S" is used by the receiver to flag additional suppressed observations (within + sender's data and/or within the datasets of other senders) in subsequent stages of + processing (especially, dissemination) in order to prevent third parties to indirectly + deduct the observations that were genuinely flagged with "C" by the sender. + + + + Secondary confidentiality set and managed by the receiver, not for publication + + + + + Observation status + This code list provides coded information about the "status" of an + observation (with respect events such as the ones reflected in the codes composing the code list). + + + + + NOTE + Normal is the default value (if no value is provided) + and is used when no special coded qualification is assumed. Usually, in this case, it + can be assumed that the source agency assigns sufficient confidence to the provided + observation and/or the value is not expected to be dramatically revised. + + + + Normal + + + + + NOTE + Break observations are characterised as such when + different content exist or a different methodology has been applied to this observation + as compared with the preceding one (the one given for the previous period). + + + + Break + + + + + NOTE + Observation obtained through an estimation methodology + (e.g. to produce back-casts) or based on the use of a limited amount of data or ad hoc + sampling and through additional calculations (e.g. to produce a value at an early stage + of the production stage while not all data are available). It may also be used in case + of experimental data (e.g. in the context of a pilot ahead of a full scale production + process) or in case of data of (anticipated/assessed) low quality. If needed, additional + (uncoded) information can be provided through (free text) "comments" at the observation + level or at a higher level. + + + + Estimated value + + + Forecast value + + + + + NOTE + Observation imputed by international organisations to + replace or fill gaps in national data series, in line with the recommendations of the + Committee for the Coordination of Statistical Activities (CCSA). + + + + Imputed value (CCSA definition) + + + + + NOTE + Data can be missing due to various reasons: data do not + exist, are insignificant (or not collected because they are below a certain threshold), + are unreliable, are not relevant for the period, or other reason not elsewhere + specified. + + + + Missing value + + + + + NOTE + An observation is characterised as "provisional" when + the source agency - while it bases its calculations on its standard production + methodology - considers that the data, almost certainly, are expected to be revised. + + + + Provisional value + + + + + NOTE + A known strike that occurred in the corresponding + period that may have affected the observation or caused a missing value. + + + + Strike + + + + code list for the Unit Multiplier (UNIT_MULT) + + Units + + + Tens + + + Hundreds + + + Thousands + + + Tens of thousands + + + Millions + + + Billions + + + Trillions + + + Quadrillions + + + + + \ No newline at end of file diff --git a/tests/io/xml/sdmx21/reader/samples/item_scheme.xml b/tests/io/xml/sdmx21/reader/samples/item_scheme.xml new file mode 100644 index 0000000..99c5bc2 --- /dev/null +++ b/tests/io/xml/sdmx21/reader/samples/item_scheme.xml @@ -0,0 +1,626 @@ + + + + SDMX_COG + false + 2010-11-13T08:00:33+08:00 + + + + + + SDMX Agency Scheme + + SDMX + + + Bank for International Settlements + + + UNESCO Institute for Statistics (UIS) + The UNESCO Institute for Statistics (UIS) is the statistical office + of the United Nations Educational, Scientific and Cultural Organization (UNESCO). The Institute + produces the data and methodologies to monitor trends at national and international levels. It + delivers comparative data for countries at all stages of development to provide a global + perspective on education, science and technology, culture, and communication. The UIS is the + official source for the indicators needed to achieve SDG 4–Education 2030 and key targets in + science and innovation, culture and communication. + + + UNESCO Institute for Statistics (UIS) + UNESCO Institute for Statistics (UIS) + Single entry point for external inquiries + http://uis.unesco.org + uis.datarequests@unesco.org + + + + National Statistical Institute of Guinea-Bissau - Instituto Nacional de + Estatística da Guiné-Bissau (INE) + + + + + + + Code list for Decimals (DECIMALS) + It provides a list of values showing the number of decimal digits used + in the data. + + + Zero + + + One + + + Two + + + Three + + + Four + + + Five + + + Six + + + Seven + + + Eight + + + Nine + + + + Code list for Frequency (FREQ) + It provides a list of values indicating the "frequency" of the data + (e.g. monthly) and, thus, indirectly, also implying the type of "time reference" that could be used + for identifying the data with respect time. + + + + + NOTE 1 + NOTE + + https://example.com/sdmx + + It is typically used for annual data. This can also + serve cases of multi-annual data (data that appear once every two, three or, possibly, + five years). Descriptive information on the multiannual characteristics (e.g. frequency + of the series in practice and other methodological information can be provided at the + dataflow level, as long as these characteristics are applicable for the entire + dataflow). + + + + Annual + + + + + Similar to "daily", however there are no observations + for Saturday and Sunday (so, neither "missing values" nor "numeric values" should be + provided for Saturday and Sunday). This treatment ("business") is one way to deal with + such cases, but it is not the only option. Such a time series could alternatively be + considered daily ("D"), thus, with missing values in the weekend. + + + + Daily - business week + + + Daily + + + Monthly + + + + + NOTE + + + Minutely + + + Quarterly + + + Half Yearly, semester + + + Weekly + + + + code list for Confidentiality Status (CONF_STATUS) + this code list provides coded information about the sensitivity and + confidentiality status of the data. + + + + + NOTE + Confidential statistical information (primary + confidentiality) due to identifiable respondents. Measures also should be taken to + prevent not only direct access, but also indirect deduction or calculation by other + users and parties, probably by considering and treating additional observations as + "confidential" (secondary confidentiality management). + + + + Confidential statistical information + + + + + NOTE + Used by the sender of the data to flag (beyond the + confidential statistical information) one or more additional observations of the dataset + so that the receiver knows that he/she should suppress these observations in subsequent + stages of processing (especially, dissemination) in order to prevent third parties to + indirectly deduct (e.g. through accounting identities or other formulas) the + observations that are genuinely flagged with "C". + + + + Secondary confidentiality set by the sender, not for publication + + + + + NOTE + It is used for observations for observations for which + there are no special sensitivity considerations and which can be freely shared. Please + note: In some institutional environments the term "unclassified" is used in a sense that + still denotes implied restrictions in the circulation of information. If this is the + case, this organisation may probably consider that "free" (value F) is not the + appropriate corresponding tag for this kind of "unclassified" category and it may be + that "non-publishable / restricted" (value N) may be more appropriate. The focus of this + code is not to describe and anticipate all potential permissible uses (e.g. permission + to re-disseminate). Thus, the use of "F" does not allow safe assumptions with respect to + the permission to "re-disseminate" (freely or at a price) the received or accessed + information (e.g. on web or paper), especially on a massive and regular basis (legal and + copyright constraints may apply). Usually, source organisations provide information and + guidance on re-dissemination issues, either on their websites or on their paper + publications. + + + + Free + + + + + NOTE + Used to denote observations that are restricted for + internal use only within organisations. + + + + Not for publication, restricted for internal use only + + + + + NOTE + If senders do not manage the secondary confidentiality + in their data and/or there are also other countries' data involved (with the intention + to eventually compile a regional-wide aggregate that is going to be published), the + value "S" is used by the receiver to flag additional suppressed observations (within + sender's data and/or within the datasets of other senders) in subsequent stages of + processing (especially, dissemination) in order to prevent third parties to indirectly + deduct the observations that were genuinely flagged with "C" by the sender. + + + + Secondary confidentiality set and managed by the receiver, not for publication + + + + + Observation status + This code list provides coded information about the "status" of an + observation (with respect events such as the ones reflected in the codes composing the code list). + + + + + NOTE + Normal is the default value (if no value is provided) + and is used when no special coded qualification is assumed. Usually, in this case, it + can be assumed that the source agency assigns sufficient confidence to the provided + observation and/or the value is not expected to be dramatically revised. + + + + Normal + + + + + NOTE + Break observations are characterised as such when + different content exist or a different methodology has been applied to this observation + as compared with the preceding one (the one given for the previous period). + + + + Break + + + + + NOTE + Observation obtained through an estimation methodology + (e.g. to produce back-casts) or based on the use of a limited amount of data or ad hoc + sampling and through additional calculations (e.g. to produce a value at an early stage + of the production stage while not all data are available). It may also be used in case + of experimental data (e.g. in the context of a pilot ahead of a full scale production + process) or in case of data of (anticipated/assessed) low quality. If needed, additional + (uncoded) information can be provided through (free text) "comments" at the observation + level or at a higher level. + + + + Estimated value + + + Forecast value + + + + + NOTE + Observation imputed by international organisations to + replace or fill gaps in national data series, in line with the recommendations of the + Committee for the Coordination of Statistical Activities (CCSA). + + + + Imputed value (CCSA definition) + + + + + NOTE + Data can be missing due to various reasons: data do not + exist, are insignificant (or not collected because they are below a certain threshold), + are unreliable, are not relevant for the period, or other reason not elsewhere + specified. + + + + Missing value + + + + + NOTE + An observation is characterised as "provisional" when + the source agency - while it bases its calculations on its standard production + methodology - considers that the data, almost certainly, are expected to be revised. + + + + Provisional value + + + + + NOTE + A known strike that occurred in the corresponding + period that may have affected the observation or caused a missing value. + + + + Strike + + + + code list for the Unit Multiplier (UNIT_MULT) + + Units + + + Tens + + + Hundreds + + + Thousands + + + Tens of thousands + + + Millions + + + Billions + + + Trillions + + + Quadrillions + + + + + + SDMX Cross Domain Concept Scheme + + + + CONTEXT + There are a number of data collection methods used for + official statistics, including computer-aided personal or telephone interview CAPI/CATI, + mailed questionnaires, electronic or internet questionnaires and direct observation. The + data collection may be exclusively for statistical purposes, or primarily for + non-statistical purposes. Descriptions of data collection methods should include the + purpose for which the data were collected, the period the data refer to, the + classifications and definitions used, and any constraints related to further use of + these data. + + + + Data Collection Method + Systematic process of gathering data for official statistics. + + + + + + + + + CONTEXT + Confidentiality refers to a property of data with + respect to whether, for example, they are public or their disclosure is subject to + restrictions. For instance, data allowing the identification of a physical or legal + person, either directly or indirectly, may be characterised as confidential according to + the relevant national or international legislation. Unauthorised disclosure of data that + are restricted or confidential is not permitted and even legislative measures or other + formal provisions may be used to prevent disclosure. Often, there are procedures in + place to prevent disclosure of restricted or confidential data, including rules applying + to staff, aggregation rules when disseminating data, provision of unit records, etc. + Sensitivity (of information) is sometimes used as a synonym to confidentiality. This + concept can be broken down into: Confidentiality - policy; Confidentiality - status; + Confidentiality - data treatment. + + + + Confidentiality + A property of data indicating the extent to which their + unauthorised disclosure could be prejudicial or harmful to the interest of the source or other + relevant parties. + + + + + + + + + CONTEXT + This concept is related to data and determines the + exact status of the value. i.e. if a specific value is confidential or not. This concept + is always coded, i.e. it takes its value from the respective code list. A list of ID + broken down by attachment level is recommended: CONF_STATUS_DFL (dataflow), + CONF_STATUS_DSET (dataset), CONF_STATUS_GRP (group), CONF_STATUS_TS (series) or + CONF_STATUS_OBS (observation). + + + + Confidentiality - status + Information about the confidentiality status of the object to + which this attribute is attached. + + + + + + + urn:sdmx:org.sdmx.infomodel.codelist.Codelist=SDMX:CL_CONF_STATUS(1.0) + + + + + Currency + Monetary denomination of the object being measured. + + + + Decimals + The number of digits of an observation to the right of a decimal + point. + + + + urn:sdmx:org.sdmx.infomodel.codelist.Codelist=SDMX:CL_DECIMALS(1.0) + + + + + + + CONTEXT + If a data series has a constant time interval between + its observations, this interval determines the frequency of the series (e.g. monthly, + quarterly, yearly). "Frequency" - also called "periodicity" - may refer to several + stages in the production process, e.g. in data collection or in data dissemination. + (e.g., a time series could be available at annual frequency but the underlying data are + compiled monthly). Therefore, "Frequency" can be broken down into "Frequency - data + collection" and "Frequency - data dissemination". For data messages, the frequency is + represented through codes. Any additional detail needed (e.g. "weekly on Thursday") must + be inserted as free text within "Frequency detail". + + + + Frequency + The time interval at which observations occur over a given time + period. + + + + urn:sdmx:org.sdmx.infomodel.codelist.Codelist=SDMX:CL_FREQ(1.0) + + + + + + + CONTEXT + The "observation value" is the field which holds the + data. + + + + Observation value + The value of a particular variable at a particular period. + + + + + + + + + CONTEXT + This item is normally coded and uses codes providing + information about the status of a value, with respect to events such as "break", + "estimated value", "forecast", "missing value", or "provisional value". In some cases, + there is more than one event that may have influenced the value (e.g. a break in + methodology may be accompanied with the fact that an observation is an estimate). A + textual comment providing more detailed information on important events related to an + observation can be added via the attribute "Comment". + + + + Observation Status + Information on the quality of a value or an unusual or missing + value. + + + + urn:sdmx:org.sdmx.infomodel.codelist.Codelist=SDMX:CL_OBS_STATUS(1.0) + + + + + + + CONTEXT + The measurement represented by each observation + corresponds to a specific point in time (e.g. a single day) or a period (e.g. a month, a + fiscal year, or a calendar year). This is used as a time stamp and is of particular + importance for time series data. In cases where the actual time period of the data + differs from the target reference period, "time period" refers to the actual period. + + + + Time Period + The period of time or point in time to which the measured + observation refers. + + + + + + + + + CONTEXT + "Title" is a short name describing and identifying a + statistical object it is attached to. IN SDMX, a title can be referred, for example, to + a time series as a "time series title", or to a group as a "group title". A list of ID + broken down by attachment level is therefore needed: TITLE_TS, or TITLE_GRP. + + + + Title + Textual label used as identification of a statistical object. + + + + + + + + + CONTEXT + In some data bases, it is referred to as SCALE, + MAGNITUDE or POWER., e.g. "UM=6" means that observations are in millions. + + + + Unit Multiplier + Exponent in base 10 specified so that multiplying the observation + numeric values by 10^UNIT_MULT gives a value expressed in the UNIT. + + + + urn:sdmx:org.sdmx.infomodel.codelist.Codelist=SDMX:CL_UNIT_MULT(1.0) + + + + + + + CONTEXT + The unit of measure is a quantity or increment by which + something is counted or described, such as kg, mm, °C, °F, monetary units such as Euro + or US dollar, simple number counts or index numbers. The unit of measure has a type + (e.g. currency) and, in connection with the unit multiplier, provides the level of + detail for the value of the variable (e.g. Euro, 1000 Euro). For data messages, the + concept is always represented by codes. Any additional detail needed must be inserted as + free text within "unit of measure detail". + + + + Unit of Measure + The unit in which the data values are measured. + + + + + + \ No newline at end of file diff --git a/tests/io/xml/sdmx21/reader/test_reader.py b/tests/io/xml/sdmx21/reader/test_reader.py index 41e92e8..bcd0ff2 100644 --- a/tests/io/xml/sdmx21/reader/test_reader.py +++ b/tests/io/xml/sdmx21/reader/test_reader.py @@ -6,12 +6,26 @@ from pysdmx.io.input_processor import process_string_to_read from pysdmx.io.xml.enums import MessageType from pysdmx.io.xml.sdmx21.reader import read_xml +from pysdmx.model import Contact from pysdmx.model.message import SubmissionResult # Test parsing SDMX Registry Interface Submission Response +@pytest.fixture() +def agency_scheme_path(): + return Path(__file__).parent / "samples" / "agencies.xml" + +@pytest.fixture() +def codelist_path(): + return Path(__file__).parent / "samples" / "codelists.xml" + +@pytest.fixture() +def item_scheme_path(): + return Path(__file__).parent / "samples" / "item_scheme.xml" + + @pytest.fixture() def submission_path(): return Path(__file__).parent / "samples" / "submission_append.xml" @@ -22,6 +36,77 @@ def error_304_path(): return Path(__file__).parent / "samples" / "error_304.xml" +def test_agency_scheme_read(agency_scheme_path): + input_str, filetype = process_string_to_read(agency_scheme_path) + assert filetype == "xml" + result = read_xml(input_str, validate=True) + + assert "OrganisationSchemes" in result + agency_scheme = result["OrganisationSchemes"] + assert len(agency_scheme) == 1 + agency_sdmx = agency_scheme["SDMX:AGENCIES(1.0)"].items[0] + assert agency_sdmx.id == "SDMX" + assert agency_sdmx.name == "SDMX" + +def test_code_list_read(codelist_path): + input_str, filetype = process_string_to_read(codelist_path) + assert filetype == "xml" + result = read_xml(input_str, validate=True) + + assert "Codelists" in result + codelists = result["Codelists"] + assert len(codelists) == 5 + codelist_sdmx = codelists["SDMX:CL_UNIT_MULT(1.0)"] + assert codelist_sdmx.id == "CL_UNIT_MULT" + assert ( + codelist_sdmx.name == "code list for the Unit Multiplier (UNIT_MULT)" + ) + assert codelist_sdmx.items[0].id == "0" + assert codelist_sdmx.items[0].name == "Units" + + +def test_item_scheme_read(item_scheme_path): + input_str, filetype = process_string_to_read(item_scheme_path) + assert filetype == "xml" + result = read_xml(input_str, validate=True) + + assert "OrganisationSchemes" in result + assert "Codelists" in result + assert "Concepts" in result + + # Agency Scheme (OrganisationSchemes) assertions + agency_scheme = result["OrganisationSchemes"] + assert len(agency_scheme) == 1 + agency_sdmx = agency_scheme["SDMX:AGENCIES(1.0)"].items[0] + assert agency_sdmx.id == "SDMX" + assert agency_sdmx.name == "SDMX" + agency_uis = agency_scheme["SDMX:AGENCIES(1.0)"].items[2] + + assert agency_uis.id == "UIS" + assert isinstance(agency_uis.contacts[0], Contact) + assert agency_uis.contacts[0].emails == ["uis.datarequests@unesco.org"] + + # Codelist + codelists = result["Codelists"] + assert len(codelists) == 5 + codelist_sdmx = codelists["SDMX:CL_UNIT_MULT(1.0)"] + assert codelist_sdmx.id == "CL_UNIT_MULT" + assert ( + codelist_sdmx.name == "code list for the Unit Multiplier (UNIT_MULT)" + ) + assert codelist_sdmx.items[0].id == "0" + assert codelist_sdmx.items[0].name == "Units" + + # Concept + concepts = result["Concepts"] + assert len(concepts) == 1 + concept_scheme_sdmx = concepts["SDMX:CROSS_DOMAIN_CONCEPTS(1.0)"] + assert concept_scheme_sdmx.id == "CROSS_DOMAIN_CONCEPTS" + assert concept_scheme_sdmx.name == "SDMX Cross Domain Concept Scheme" + assert concept_scheme_sdmx.items[0].id == "COLL_METHOD" + assert concept_scheme_sdmx.items[2].codes[0].id == "C" + + def test_submission_result(submission_path): input_str, filetype = process_string_to_read(submission_path) assert filetype == "xml" diff --git a/tests/model/test_message.py b/tests/model/test_message.py index 41f690d..698e15f 100644 --- a/tests/model/test_message.py +++ b/tests/model/test_message.py @@ -47,7 +47,7 @@ def test_get_concepts(): cs1 = ConceptScheme(id="cs1", agency="cs1") message = Message( { - "ConceptSchemes": { + "Concepts": { "cs1:cs1(1.0)": cs1, } } @@ -74,7 +74,7 @@ def test_empty_get_elements(): with pytest.raises(NotFound) as exc_info: message.get_concept_schemes() - assert "No ConceptSchemes found" in str(exc_info.value.title) + assert "No Concepts found" in str(exc_info.value.title) def test_empty_get_element_by_uid(): @@ -92,7 +92,7 @@ def test_empty_get_element_by_uid(): with pytest.raises(NotFound) as exc_info: message.get_concept_scheme_by_uid("cs1:cs1(1.0)") - assert "No ConceptSchemes found" in str(exc_info.value.title) + assert "No Concepts found" in str(exc_info.value.title) def test_invalid_get_element_by_uid(): @@ -117,7 +117,7 @@ def test_invalid_initialization_content_key(): [ ("OrganisationSchemes", {"org1:orgs1(1.0)": "invalid"}), ("Codelists", {"cl1:cl1(1.0)": "invalid"}), - ("ConceptSchemes", {"cs1:cs1(1.0)": "invalid"}), + ("Concepts", {"cs1:cs1(1.0)": "invalid"}), ], ) def test_invalid_initialization_content_value(key, value): From e62ba18b0af8bde07094851ebcc2cfe239665d45 Mon Sep 17 00:00:00 2001 From: "javier.hernandez" Date: Mon, 24 Jun 2024 19:00:14 +0200 Subject: [PATCH 3/3] Flake8 changes tests. Signed-off-by: javier.hernandez --- tests/io/xml/sdmx21/reader/test_reader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/io/xml/sdmx21/reader/test_reader.py b/tests/io/xml/sdmx21/reader/test_reader.py index bcd0ff2..ee62c6c 100644 --- a/tests/io/xml/sdmx21/reader/test_reader.py +++ b/tests/io/xml/sdmx21/reader/test_reader.py @@ -17,10 +17,12 @@ def agency_scheme_path(): return Path(__file__).parent / "samples" / "agencies.xml" + @pytest.fixture() def codelist_path(): return Path(__file__).parent / "samples" / "codelists.xml" + @pytest.fixture() def item_scheme_path(): return Path(__file__).parent / "samples" / "item_scheme.xml" @@ -48,6 +50,7 @@ def test_agency_scheme_read(agency_scheme_path): assert agency_sdmx.id == "SDMX" assert agency_sdmx.name == "SDMX" + def test_code_list_read(codelist_path): input_str, filetype = process_string_to_read(codelist_path) assert filetype == "xml"