# Parse the XML file for HQTA files

* [Edit metadata for many ArcGIS items](https://desktop.arcgis.com/en/arcmap/latest/manage-data/metadata/editing-metadata-for-many-arcgis-items.htm)
* [Change to JSON](https://stackoverflow.com/questions/48821725/xml-parsers-expat-expaterror-not-well-formed-invalid-token)

In [1]:
#!pip install xmltodict

In [2]:
import xml.etree.ElementTree as ET
import json
import xmltodict

In [3]:
XML_FILE = "./data/ca_hq_transit_areas.xml"

# read in XML
tree = ET.parse(XML_FILE)
root = tree.getroot()

# Not well-formed XML?
# Figure out another way to read in as a dict
#for elem in tree.iter():
#    print(elem.tag, elem.attrib)

In [4]:
def xml_to_json(path: str) -> dict:  
    try:
        print(f"Loading XML as JSON from {path}")
        xml = ET.tostring(ET.parse(path).getroot())
        return xmltodict.parse(xml, 
                               #attr_prefix="@", cdata_key="#text", 
                               dict_constructor=dict)
    except:
        print(f"Loading failed for {path}")
    return {}

esri_metadata = xml_to_json(XML_FILE)

Loading XML as JSON from ./data/ca_hq_transit_areas.xml


## Supply a dictionary to input the values we want to change

Analyst overwrites values in the dictionary.

Other steps to read in XML, convert to JSON, overwrite values, go back to XML...those can be automated.

In [5]:
def fill_in_keyword_list(topic='transportation', keyword_list = []):
    filled_out_list = [
        {'themekt': 'ISO 19115 Topic Categories',
         'themekey': topic},
         {'themekt': 'None',
          'themekey': keyword_list
         }
    ]
    
    return filled_out_list

In [6]:
theme_keywords = ["Transit", "Land Use", "GTFS", 
                  "Transit-Oriented Development", "TOD"]

topics_list = fill_in_keyword_list("transportation", theme_keywords)
topics_list

[{'themekt': 'ISO 19115 Topic Categories', 'themekey': 'transportation'},
 {'themekt': 'None',
  'themekey': ['Transit',
   'Land Use',
   'GTFS',
   'Transit-Oriented Development',
   'TOD']}]

In [7]:
#esri_metadata["metadata"]["idinfo"]["keywords"]["theme"]

In [8]:
DATASET_INFO = {
    ## my_json["metadata"]["idinfo"]
    # ["citation"]
    "dataset_name": "ca_hq_transit_areas", #["citeinfo"]["title"]
    "publish_entity": "California Integrated Travel Project", #["pubinfo"]["publish"]
    # ["descript"]
    "abstract": "Public. EPSG: 3310", #["abstract"]
    "purpose": "Summary sentence about dataset.", #["purpose"]
    # ["timeperd"]
    "beginning_date": "YYYYMMDD", #["timeinfo"]["rngdates"]["begdate"]
    "end_date": "YYYYMMDD", #["timeinfo"]["rngdates"]["enddate"]
    "place": "California", #["current"]
    # ["status"]
    "status": "Complete", #["progress"]
    "frequency": "Monthly", #["update"]
    # ["keywords"]
    # transportation is probably what we always want to check off
    #fill_in_keyword_list(topic = "transportation", theme_keywords = []),
    "theme_topics": topics_list, 
    ## my_json["metadata"]["dataqual"]
    "methodology": "Detailed methodology description", #["lineage"]["procstep"]["procdesc"]
    ## my_json["metadata"]["eainfo"]
    # ["detailed"] (this outputs a list)
    ## set this equal to dataset_name above ["detailed"][0]["enttyp"]["enttypl"] 
    "data_dict_type": "CSV", # ["detailed"][1]["enttyp"]["enttypd"],
    "data_dict_url": "some_url", # ["detailed"][1]["enttyp"]["enttypds"],
    ## my_json["metadata"]["metainfo"]
    #["metc"]
    "contact_organization": "Caltrans", #["metc"]["cntinfo"]["cntorgp"]["cntorg"]
    "contact_person": "Analyst Name", #["metc"]["cntinfo"]["cntorgp"]["cntper"]
    #"contact_position": "California Integrated Travel Project", #["metc"]["cntinfo"]["cntpos"]
    "contact_email": "hello@calitp.org" #["metc"]["cntinfo"]["cntemail"],
}

In [9]:
esri_metadata

{'metadata': {'idinfo': {'citation': {'citeinfo': {'title': 'ca_hq_transit_areas',
     'geoform': 'vector digital data',
     'pubinfo': {'publish': 'California Integrated Travel Project'}}},
   'descript': {'abstract': 'Public. EPSG: 3310',
    'purpose': 'Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2'},
   'timeperd': {'timeinfo': {'rngdates': {'begdate': '20220517',
      'enddate': '20220617'}},
    'current': 'California'},
   'status': {'progress': 'Complete', 'update': 'Monthly'},
   'spdom': {'bounding': {'westbc': '-124.423914',
     'eastbc': '-113.465232',
     'northbc': '45.362934',
     'southbc': '32.441061'}},
   'keywords': {'theme': [{'themekt': 'ISO 19115 Topic Categories',
      'themekey': 'transportation'},
     {'themekt': 'None',
      'themekey': ['Transportation',
       'Land Use',
       'Transit-Oriented Development',
       'TOD',
       'High Quality Transit']}]},
   'accconst': 'None',
   'useconst': 

## Overwrite the JSON with new values

Create a new dictionary and see

In [10]:
def overwrite_metadata_json(metadata_json, DATASET_INFO):
    d = DATASET_INFO
    new_metadata = metadata_json.copy()
    m = new_metadata["metadata"]
    
    m["idinfo"]["citation"]["citeinfo"]["title"] = d["dataset_name"]
    m["idinfo"]["citation"]["citeinfo"]["pubinfo"]["publish"] = d["publish_entity"]
    
    m["idinfo"]["descript"]["abstract"] = d["abstract"]
    m["idinfo"]["descript"]["purpose"] = d["purpose"]
    
    m["idinfo"]["timeperd"]["timeinfo"]["rngdates"]["begdate"] = d["beginning_date"]
    m["idinfo"]["timeperd"]["timeinfo"]["rngdates"]["enddate"] = d["end_date"]
    m["idinfo"]["timeperd"]["current"] = d["place"]
    
    m["idinfo"]["status"]["progress"] = d["status"]
    m["idinfo"]["status"]["update"] = d["frequency"]

    m["idinfo"]["keywords"] = d["theme_topics"]    

    m["dataqual"]["lineage"]["procstep"]["procdesc"] = d["methodology"]    
    
    m["eainfo"]["detailed"][0]["enttyp"]["enttypl"] = d["dataset_name"]    
    m["eainfo"]["detailed"][1]["enttyp"]["enttypd"] = d["data_dict_type"]    
    m["eainfo"]["detailed"][1]["enttyp"]["enttypds"] = d["data_dict_url"]    
  
    m["metainfo"]["metc"]["cntinfo"]["cntorgp"]["cntorg"] = d["contact_organization"]    
    m["metainfo"]["metc"]["cntinfo"]["cntorgp"]["cntper"] = d["contact_person"]    
    m["metainfo"]["metc"]["cntinfo"]["cntpos"] = d["publish_entity"]    
    m["metainfo"]["metc"]["cntinfo"]["cntemail"] = d["contact_email"]    

    return new_metadata 

In [11]:
new_meta = overwrite_metadata_json(esri_metadata, DATASET_INFO)
new_meta

{'metadata': {'idinfo': {'citation': {'citeinfo': {'title': 'ca_hq_transit_areas',
     'geoform': 'vector digital data',
     'pubinfo': {'publish': 'California Integrated Travel Project'}}},
   'descript': {'abstract': 'Public. EPSG: 3310',
    'purpose': 'Summary sentence about dataset.'},
   'timeperd': {'timeinfo': {'rngdates': {'begdate': 'YYYYMMDD',
      'enddate': 'YYYYMMDD'}},
    'current': 'California'},
   'status': {'progress': 'Complete', 'update': 'Monthly'},
   'spdom': {'bounding': {'westbc': '-124.423914',
     'eastbc': '-113.465232',
     'northbc': '45.362934',
     'southbc': '32.441061'}},
   'keywords': [{'themekt': 'ISO 19115 Topic Categories',
     'themekey': 'transportation'},
    {'themekt': 'None',
     'themekey': ['Transit',
      'Land Use',
      'GTFS',
      'Transit-Oriented Development',
      'TOD']}],
   'accconst': 'None',
   'useconst': 'None',
   'native': 'Version 6.2 (Build 9200) ; Esri ArcGIS 10.8.1.14362'},
  'dataqual': {'lineage': {'pro

## Convert JSON back to XML

Take existing values if it's not being replaced

In [12]:
#esri_metadata["metadata"]["eainfo"]["detailed"][1]

In [13]:
#for k, v in esri_metadata["metadata"]["metainfo"].items():
#    print(f"Key: {k}")
#    print(v)        