In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
import xmltodict

from pydantic import BaseModel
from typing import List, Dict

import validation 

METADATA_FOLDER = "metadata_xml/"



In [2]:
def xml_to_json(path: str) -> dict:
    """
    Load an XML file and return its contents as a nested dictionary.

    The file is parsed with ElementTree, re-serialized with ``ET.tostring``
    and the resulting bytes are handed to ``xmltodict.parse``. Attributes are
    stored under their bare names (``attr_prefix=""``) and element text is
    stored under the ``"text"`` key (``cdata_key="text"``).

    Loading is best-effort: when an ordinary error occurs (missing file,
    malformed XML, ...) the failure and its cause are printed and an empty
    dict is returned instead of raising. ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately NOT swallowed, so an interrupted kernel
    actually stops.

    Args:
        path: Location of the XML file to read.

    Returns:
        The parsed document as plain ``dict`` objects, or ``{}`` if loading
        failed.
    """
    try:
        print(f"Loading XML as JSON from {path}")
        # NOTE(review): the ElementTree round-trip appears to be what yields
        # the generated "ns0:" / "ns1:" prefixes that the rest of the notebook
        # keys on -- keep it unless those keys are updated as well.
        xml = ET.tostring(ET.parse(path).getroot())
        return xmltodict.parse(
            xml,
            attr_prefix="",
            cdata_key="text",
            # process_namespaces=True,
            dict_constructor=dict,
        )
    except Exception as err:
        # Catch `Exception`, not everything: a bare `except:` would also trap
        # KeyboardInterrupt / SystemExit. Report the cause so an empty result
        # can be diagnosed.
        print(f"Loading failed for {path}: {err!r}")
    return {}

In [3]:
# Path to the default metadata template inside METADATA_FOLDER, parsed into a
# nested dict. xml_to_json returns {} if the file cannot be loaded.
# NOTE: overwrite_default_with_dataset_elements() re-reads this same file into
# its own local variables instead of using `default_template`.
DEFAULT_XML = f"./{METADATA_FOLDER}default_pro.xml"
default_template = xml_to_json(DEFAULT_XML)

Loading XML as JSON from ./metadata_xml/default_pro.xml


In [5]:
# Parse the dataset's own metadata XML; `esri_metadata` is the input passed to
# lift_necessary_dataset_elements / overwrite_default_with_dataset_elements.
# (Plain string literal: the path has no placeholders, so no f-string needed.)
XML_FILE = "./ca_transit_routes.xml"
esri_metadata = xml_to_json(XML_FILE)

Loading XML as JSON from ./ca_transit_routes.xml


In [4]:
def lift_necessary_dataset_elements(metadata_json: dict) -> dict:
    """
    Pick the dataset-specific sections out of a parsed metadata document.

    Args:
        metadata_json: Parsed metadata whose top-level "ns0:MD_Metadata" entry
            holds the individual sections.

    Returns:
        A new dict holding only the sections listed below, in that order.
        Values are the same objects as in ``metadata_json`` (no copy is made).

    Raises:
        KeyError: If "ns0:MD_Metadata" or any of the listed sections is absent.
    """
    root = metadata_json["ns0:MD_Metadata"]

    wanted_sections = (
        "ns0:dateStamp",                  # Date Stamp
        "ns0:spatialRepresentationInfo",  # Spatial Representation Info
        "ns0:referenceSystemInfo",        # Coordinate Reference System Info
        "ns0:distributionInfo",           # Distribution Info
        "ns0:identificationInfo",         # Identification info
    )

    return {section: root[section] for section in wanted_sections}

In [11]:
# Extract the dataset-specific sections from the parsed dataset metadata; the
# bare name on the last line makes the notebook display the resulting dict.
necessary_elements = lift_necessary_dataset_elements(esri_metadata)
necessary_elements

{'ns0:dateStamp': {'ns1:Date': '2022-10-06'},
 'ns0:spatialRepresentationInfo': {'ns0:MD_VectorSpatialRepresentation': {'ns0:topologyLevel': {'ns0:MD_TopologyLevelCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_TopologyLevelCode',
     'codeListValue': 'geometryOnly',
     'codeSpace': 'ISOTC211/19115',
     'text': 'geometryOnly'}},
   'ns0:geometricObjects': {'ns0:MD_GeometricObjects': {'ns0:geometricObjectType': {'ns0:MD_GeometricObjectTypeCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_GeometricObjectTypeCode',
       'codeListValue': 'composite',
       'codeSpace': 'ISOTC211/19115',
       'text': 'composite'}}}}}},
 'ns0:referenceSystemInfo': {'ns0:MD_ReferenceSystem': {'ns0:referenceSystemIdentifier': {'ns0:RS_Identifier': {'ns0:code': {'ns1:CharacterString': '4326'},
     'ns0:codeSpace': {'ns1:CharacterString': 'EPSG'},
     'ns0:version': {'ns1:CharacterString': '6.2(3.0.1)'}}}}},
 'ns0:distrib

In [9]:
def overwrite_default_with_dataset_elements(metadata_json: dict) -> dict:
    """
    Fill a freshly loaded default template with a dataset's own sections.

    The default template XML is read from METADATA_FOLDER on every call, so
    each call returns a new dict. Only sections that already exist in the
    template's "ns0:MD_Metadata" entry are replaced; dataset sections the
    template does not define are left out.

    Args:
        metadata_json: Parsed dataset metadata, as accepted by
            lift_necessary_dataset_elements.

    Returns:
        The whole template dict, with the matching sections replaced by the
        dataset's values.
    """
    template_path = f"./{METADATA_FOLDER}default_pro.xml"
    template = xml_to_json(template_path)
    template_root = template["ns0:MD_Metadata"]

    # Sections supplied by the dataset itself.
    dataset_sections = lift_necessary_dataset_elements(metadata_json)

    # Work out the replacements first, then apply them in one step; existing
    # keys keep their position in the template.
    overrides = {
        name: dataset_sections[name]
        for name in template_root.keys()
        if name in dataset_sections
    }
    template_root.update(overrides)

    return template

In [10]:
# Merge the dataset's sections into the default template and display the result.
overwrite_default_with_dataset_elements(esri_metadata)

Loading XML as JSON from ./metadata_xml/default_pro.xml


{'ns0:MD_Metadata': {'xmlns:ns0': 'http://www.isotc211.org/2005/gmd',
  'xmlns:ns1': 'http://www.isotc211.org/2005/gco',
  'xmlns:ns2': 'http://www.opengis.net/gml/3.2',
  'ns0:language': {'ns0:LanguageCode': {'codeList': 'http://www.loc.gov/standards/iso639-2/php/code_list.php',
    'codeListValue': 'eng',
    'codeSpace': 'ISO639-2',
    'text': 'eng'}},
  'ns0:characterSet': {'ns0:MD_CharacterSetCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_CharacterSetCode',
    'codeListValue': 'utf8',
    'codeSpace': 'ISOTC211/19115',
    'text': 'utf8'}},
  'ns0:hierarchyLevel': {'ns0:MD_ScopeCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_ScopeCode',
    'codeListValue': 'dataset',
    'codeSpace': 'ISOTC211/19115',
    'text': 'dataset'}},
  'ns0:hierarchyLevelName': {'ns1:CharacterString': 'dataset'},
  'ns0:contact': {'ns0:CI_ResponsibleParty': {'ns0:individualName': {'ns1:CharacterString': 'My Name'},
    '

In [None]:
#explore = "ns0:identificationInfo"
#esri_metadata["ns0:MD_Metadata"][explore]