# Flatten the ISO metadata dict, strip the namespace prefix string, then nest it again

In [1]:
import xml.etree.ElementTree as ET
import xmltodict

In [2]:
def xml_to_json(path: str) -> dict:
    """Load the XML file at `path` and return it as a plain nested dict.

    The file is parsed with ElementTree, re-serialized to bytes, and handed to
    `xmltodict.parse` with namespace processing enabled, no attribute prefix,
    and element text stored under the key "text".

    Args:
        path: Location of the XML file to read.

    Returns:
        The parsed document as a `dict`, or an empty dict if loading or
        parsing raised an error (the failure and its reason are printed).
    """
    try:
        print(f"Loading XML as JSON from {path}")
        xml = ET.tostring(ET.parse(path).getroot())

        return xmltodict.parse(
            xml,
            # https://stackoverflow.com/questions/35452588/remove-special-characters-from-keys-of-a-parsed-xml-file-using-xmltodict
            attr_prefix="", cdata_key="text",
            process_namespaces=True,
            # This isn't stripping it
            # namespaces = {
            #    "http://www.isotc211.org/2005/gmd:": ""
            # },
            # NOTE(review): xmltodict's `namespaces` mapping appears to be keyed by
            # the bare namespace URI; the key above carries a trailing ":" which is
            # probably why it never matched - confirm before re-enabling.
            # Instead, do something similar in the dictionary like this
            # https://github.com/GeosoftInc/gxpy/blob/master/geosoft/gxpy/utility.py
            dict_constructor=dict,
        )

    # `Exception` (not a bare `except`) so Ctrl-C / SystemExit still propagate.
    except Exception as err:
        print(f"Loading failed for {path}")
        # Surface the reason instead of silently discarding it.
        print(f"  {type(err).__name__}: {err}")
    return {}

In [3]:
# Relative path of the metadata XML file this notebook works on.
XML_FILE = "./metadata_xml/ca_hq_transit_stops.xml"
# xml_to_json returns {} when loading fails, so an empty dict here means the load did not succeed.
orig_metadata = xml_to_json(XML_FILE)

Loading XML as JSON from ./metadata_xml/ca_hq_transit_stops.xml


In [5]:
#orig_metadata

In [6]:
# https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys
from collections.abc import MutableMapping
import re

def flatten(d, parent_key='', sep='_'):
    """Collapse a nested mapping into a single-level dict.

    Each leaf is stored under the path of keys leading to it, joined by
    `sep` and prefixed with `parent_key` when that is non-empty. Only values
    that are `MutableMapping` instances are descended into; every other
    value (lists included) is kept as a leaf. Empty nested mappings
    contribute no entries.
    """
    def _leaves(mapping, prefix):
        # Depth-first walk in insertion order, yielding (joined_key, leaf) pairs.
        for name, value in mapping.items():
            path = sep.join((prefix, name)) if prefix else name
            if isinstance(value, MutableMapping):
                yield from _leaves(value, path)
            else:
                yield path, value

    return dict(_leaves(d, parent_key))

In [7]:
# Collapse the nested metadata to one level; key paths are joined with flatten's default "_" separator.
flat_metadata = flatten(orig_metadata)

In [9]:
#flat_metadata

In [10]:
# Literal prefix (namespace URI + ":") to strip from the flattened keys and values.
p = 'http://www.isotc211.org/2005/gmd:'


def _strip_gmd(obj):
    """Remove the literal prefix `p` from `obj` without changing its type.

    Strings get the prefix removed; lists and dicts are cleaned recursively
    (dict keys included); anything else (e.g. None) is returned unchanged so
    it is not turned into its string repr.
    """
    if isinstance(obj, str):
        # Plain substring replacement: the prefix is a URL, not a regex
        # (its "." characters would otherwise match any character).
        return obj.replace(p, '')
    if isinstance(obj, list):
        return [_strip_gmd(item) for item in obj]
    if isinstance(obj, dict):
        return {_strip_gmd(key): _strip_gmd(val) for key, val in obj.items()}
    return obj


flat_metadata_clean = {str(k).replace(p, ''): _strip_gmd(v)
                       for k, v in flat_metadata.items()}

In [12]:
#flat_metadata_clean

In [13]:
#https://stackoverflow.com/questions/50607128/creating-a-nested-dictionary-from-a-flattened-dictionary

from collections import defaultdict
from functools import reduce
from operator import getitem

def getFromDict(dataDict, mapList):
    """Follow the keys in `mapList`, in order, down into `dataDict`.

    Returns whatever is found after the last lookup; with an empty
    `mapList` that is `dataDict` itself.
    """
    node = dataDict
    for step in mapList:
        node = node[step]
    return node

def tree():
    """Return a defaultdict whose missing keys are filled with another `tree()`."""
    return defaultdict(tree)


# Root of the rebuilt nested structure.
d = tree()

# Rebuild nesting from the flattened keys. Every "_" is treated as a level
# boundary, so key names that themselves contain "_" are split into levels too.
for k, v in flat_metadata_clean.items():
    path = k.split('_')
    keys, final_key = path[:-1], path[-1]
    branch = getFromDict(d, keys)
    branch[final_key] = v

In [14]:
def default_to_regular_dict(d):
    """Return a plain-dict copy of a nested defaultdict.

    Anything that is not a defaultdict is returned as-is, so recursion stops
    at the first non-defaultdict value on each branch.
    """
    if not isinstance(d, defaultdict):
        return d

    plain = {}
    for key, child in d.items():
        plain[key] = default_to_regular_dict(child)
    return plain

# convert back to regular dict
nested_metadata = default_to_regular_dict(d)

In [24]:
# The flattened key prefix "MD_Metadata" is addressed as two levels ("MD" -> "Metadata")
# because the re-nesting step splits flattened keys on every "_".
m = nested_metadata["MD"]["Metadata"]

In [40]:
m["contact"]["CI"]

{'ResponsibleParty': {'individualName': {'http://www.isotc211.org/2005/gco:CharacterString': 'Eric Dasmalchi'},
  'organisationName': {'http://www.isotc211.org/2005/gco:CharacterString': 'Caltrans'},
  'positionName': {'http://www.isotc211.org/2005/gco:CharacterString': 'California Integrated Travel Project'},
  'contactInfo': {'CI': {'Contact': {'address': {'CI': {'Address': {'electronicMailAddress': {'http://www.isotc211.org/2005/gco:CharacterString': 'eric.dasmalchi@dot.ca.gov'}}}}}}},
  'role': {'CI': {'RoleCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_RoleCode',
     'codeListValue': 'pointOfContact',
     'codeSpace': 'ISOTC211/19115',
     'text': 'pointOfContact'}}}}}

In [37]:
# Top-level metadata sections to inspect.
show_me = ["contact"]

# Print each first-level child of the selected sections; sections whose
# value is not a dict are skipped.
for key, value in m.items():
    if key not in show_me or not isinstance(value, dict):
        continue
    for k, v in value.items():
        print(f"k2: {k}")
        print(f"v: {v}")
            


k2: CI
v: {'ResponsibleParty': {'individualName': {'http://www.isotc211.org/2005/gco:CharacterString': 'Eric Dasmalchi'}, 'organisationName': {'http://www.isotc211.org/2005/gco:CharacterString': 'Caltrans'}, 'positionName': {'http://www.isotc211.org/2005/gco:CharacterString': 'California Integrated Travel Project'}, 'contactInfo': {'CI': {'Contact': {'address': {'CI': {'Address': {'electronicMailAddress': {'http://www.isotc211.org/2005/gco:CharacterString': 'eric.dasmalchi@dot.ca.gov'}}}}}}}, 'role': {'CI': {'RoleCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_RoleCode', 'codeListValue': 'pointOfContact', 'codeSpace': 'ISOTC211/19115', 'text': 'pointOfContact'}}}}}


In [28]:
m.keys()

dict_keys(['language', 'characterSet', 'hierarchyLevel', 'hierarchyLevelName', 'contact', 'dateStamp', 'metadataStandardName', 'metadataStandardVersion', 'spatialRepresentationInfo', 'referenceSystemInfo', 'identificationInfo', 'distributionInfo', 'dataQualityInfo'])

In [31]:
#orig_metadata

## The dict can be nested again, so overwrite values while it is flattened

In [19]:
from metadata_update import fill_in_keyword_list

# Keyword strings passed to fill_in_keyword_list below.
KEYWORDS = [
    'Transportation',
    'Land Use',
    'Transit-Oriented Development',
    'TOD',
    'High Quality Transit'
]

# fill_in_keyword_list is imported from the local metadata_update module; its return
# format is not visible here - presumably the keyword structure the metadata expects (TODO confirm).
KEYWORDS_FORMATTED = fill_in_keyword_list(
    topic='transportation', keyword_list = KEYWORDS)

# NOTE: both triple-quoted literals below keep their leading newline and the
# four-space indentation of each line as part of the string value.
PURPOSE = ('''
    Estimated High Quality Transit Areas as described in 
    Public Resources Code 21155, 21064.3, 21060.2.
    '''
)

# Long-form description of how the dataset was produced and its caveats.
METHODOLOGY = ('''
    This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of “bus rapid transit” in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables.
    '''
)

# Replacement metadata values for one dataset, gathered in a single dict.
# NOTE(review): dataset_name is "ca_hq_transit_areas" while the XML loaded earlier in this
# notebook is ca_hq_transit_stops.xml - confirm which dataset this dict is meant for.
HQTA_TRANSIT_AREAS_DICT = {
    "dataset_name": "ca_hq_transit_areas", 
    "publish_entity": "California Integrated Travel Project", 

    "abstract": "Public. EPSG: 3310",
    "purpose": PURPOSE, 

    # Dates are given as 8-digit strings (presumably YYYYMMDD - TODO confirm).
    "beginning_date": "20220517",
    "end_date": "20220617",
    "place": "California",

    "status": "Complete", 
    "frequency": "Monthly",
    
    "theme_topics": KEYWORDS_FORMATTED, 

    "methodology": METHODOLOGY, 
    
    "data_dict_type": "CSV",
    # Placeholder value, not a real URL.
    "data_dict_url": "some_url", 

    "contact_organization": "Caltrans", 
    "contact_person": "Eric Dasmalchi", 
    "contact_email": "eric.dasmalchi@dot.ca.gov" 
    
    # Add 2 other elements not in Caltrans documentation
    # resource contact (same info as metadata contact)
    # edition: use date, rather than number
}

In [17]:
flat_metadata_clean

{'MD_Metadata_language_LanguageCode_codeList': 'http://www.loc.gov/standards/iso639-2/php/code_list.php',
 'MD_Metadata_language_LanguageCode_codeListValue': 'eng',
 'MD_Metadata_language_LanguageCode_codeSpace': 'ISO639-2',
 'MD_Metadata_language_LanguageCode_xmlns_ns0': 'http://www.isotc211.org/2005/gmd',
 'MD_Metadata_language_LanguageCode_xmlns_ns1': 'http://www.isotc211.org/2005/gco',
 'MD_Metadata_language_LanguageCode_xmlns_ns2': 'http://www.opengis.net/gml',
 'MD_Metadata_language_LanguageCode_text': 'eng',
 'MD_Metadata_characterSet_MD_CharacterSetCode_codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_CharacterSetCode',
 'MD_Metadata_characterSet_MD_CharacterSetCode_codeListValue': 'utf8',
 'MD_Metadata_characterSet_MD_CharacterSetCode_codeSpace': 'ISOTC211/19115',
 'MD_Metadata_characterSet_MD_CharacterSetCode_text': 'utf8',
 'MD_Metadata_hierarchyLevel_MD_ScopeCode_codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_S