# ISO 19139 GML 3.2 metadata format

Steps:
1. See what's in the filled-out XML.
1. See what's in the sparse XML.
1. Which sections need to be lifted into `default.xml`?
1. Which parts belong to a specific dataset and need to stay with it, even after the template is applied?
1. Which parts need to be overwritten?

In [1]:
import xml.etree.ElementTree as ET
import xmltodict 

In [2]:
# Paths of the two metadata XML files compared in this notebook:
# one fully filled out, one sparse.
FILLED_OUT_XML_FILE = "ca_hq_transit_stops.xml"
SPARSE_XML_FILE = "ca_transit_routes.xml"

In [3]:
def xml_to_json(path: str) -> dict:
    """Load an XML file and return its contents as a nested dict.

    The file is parsed with ElementTree, re-serialized to bytes, and then
    converted by xmltodict. Attributes keep their bare names
    (``attr_prefix=""``) and element text is stored under the ``"text"`` key.

    Args:
        path: Location of the XML file to read.

    Returns:
        The parsed document as plain dicts, or an empty dict when the file
        cannot be read or parsed. The failure is printed, not raised.
    """
    print(f"Loading XML as JSON from {path}")
    try:
        # Round-trip through ElementTree before handing the bytes to
        # xmltodict. Presumably this is where the "ns0:"/"ns1:" key prefixes
        # used elsewhere in the notebook come from -- TODO confirm.
        xml = ET.tostring(ET.parse(path).getroot())
        return xmltodict.parse(
            xml,
            attr_prefix="",
            cdata_key="text",
            dict_constructor=dict,
        )
    except Exception as err:
        # Best-effort loader: say why loading failed and fall back to {}.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        print(f"Loading failed for {path}: {err!r}")
        return {}

In [4]:
# Parse both XML files into nested dicts (xml_to_json returns {} on failure).
filled_meta = xml_to_json(FILLED_OUT_XML_FILE)
sparse_meta = xml_to_json(SPARSE_XML_FILE)

Loading XML as JSON from ca_hq_transit_stops.xml
Loading XML as JSON from ca_transit_routes.xml


In [5]:
# Step inside the root "ns0:MD_Metadata" entry so later cells work with its
# children directly. Raises KeyError if a file failed to load and came back
# as an empty dict.
filled_meta2 = filled_meta["ns0:MD_Metadata"]
sparse_meta2 = sparse_meta["ns0:MD_Metadata"]

In [6]:
def compare_dict_items(my_dict1: dict, my_dict2: dict) -> None:
    """Print a side-by-side view of the keys that still need triage.

    Every key of ``my_dict1`` that is not already listed in one of the
    triage categories below is printed together with its value in
    ``my_dict1`` ("filled meta") and in ``my_dict2`` ("sparse meta").

    Args:
        my_dict1: Dict whose keys drive the comparison (the filled-out
            metadata).
        my_dict2: Dict compared against it (the sparse metadata).

    Returns:
        None. The comparison is written to stdout.
    """
    # identical in both files, nothing to do
    exactly_the_same = {
        "xmlns:ns0",
        "xmlns:ns1",
        "ns0:language",
        "ns0:characterSet",
        "ns0:hierarchyLevel",
        "ns0:hierarchyLevelName",
        "ns0:metadataStandardName",
        "ns0:metadataStandardVersion",
    }

    # needs default template applied, then replace some values
    needs_default_template_first_then_replace = {
        "ns0:contact",
        "ns0:dataQualityInfo",
        "ns0:identificationInfo",
    }

    # completely missing, perhaps can use default template
    missing_use_default_template = {"xmlns:ns2"}

    # no template needed, just replace as is
    already_present_just_lift = {
        "ns0:dateStamp",
        "ns0:spatialRepresentationInfo",
        "ns0:referenceSystemInfo",
        "ns0:distributionInfo",
    }

    # One combined set gives a single O(1) lookup per key.
    already_triaged = (
        exactly_the_same
        | needs_default_template_first_then_replace
        | missing_use_default_template
        | already_present_just_lift
    )

    for key in my_dict1:
        if key in already_triaged:
            continue

        print(f"**********{key}**********")
        print("filled meta")
        print(my_dict1[key])
        print("sparse meta")
        # Test membership instead of `.get(key) is not None`: a key that is
        # present with a None value must not be reported as missing.
        if key in my_dict2:
            print(my_dict2[key])
        else:
            print(f"MISSING KEY {key}")

In [7]:
# Show the top-level keys that compare_dict_items has not yet sorted into
# one of its category lists.
compare_dict_items(filled_meta2, sparse_meta2)

**********ns0:identificationInfo**********
filled meta
{'ns0:MD_DataIdentification': {'ns0:citation': {'ns0:CI_Citation': {'ns0:title': {'ns1:CharacterString': 'ca_hq_transit_stops'}, 'ns0:date': {'ns0:CI_Date': {'ns0:date': {'ns1:Date': '2022-09-14'}, 'ns0:dateType': {'ns0:CI_DateTypeCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_DateTypeCode', 'codeListValue': 'creation', 'codeSpace': 'ISOTC211/19115', 'text': 'creation'}}}}, 'ns0:citedResponsibleParty': {'ns0:CI_ResponsibleParty': {'ns0:individualName': {'ns1:CharacterString': 'My Name'}, 'ns0:organisationName': {'ns1:CharacterString': 'Caltrans'}, 'ns0:role': {'ns0:CI_RoleCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_RoleCode', 'codeListValue': 'publisher', 'codeSpace': 'ISOTC211/19115', 'text': 'publisher'}}}}, 'ns0:presentationForm': {'ns0:CI_PresentationFormCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_

In [19]:
#ns0:identificationInfo is bulk of where changes need to be made
# both because it has dataset specific stuff
# but also stuff that needs to be overwritten
def compare_id_info(my_dict1: dict, my_dict2: dict) -> None:
    """Print a key-by-key comparison of the identification sections.

    Both arguments are indexed with
    ``["ns0:identificationInfo"]["ns0:MD_DataIdentification"]``; each key
    found there in ``my_dict1`` is printed with its value in ``my_dict1``
    ("filled meta") and in ``my_dict2`` ("sparse meta").

    Args:
        my_dict1: Metadata dict whose identification keys drive the
            comparison (the filled-out metadata).
        my_dict2: Metadata dict compared against it (the sparse metadata).

    Returns:
        None. The comparison is written to stdout.

    Raises:
        KeyError: If either dict lacks the identification section.
    """
    dict1 = my_dict1["ns0:identificationInfo"]["ns0:MD_DataIdentification"]
    dict2 = my_dict2["ns0:identificationInfo"]["ns0:MD_DataIdentification"]

    # Keys to leave out of the printout; nothing has been triaged yet.
    exactly_the_same = set()

    for key in dict1:
        if key in exactly_the_same:
            continue

        print(f"**********{key}**********")
        print("filled meta")
        print(dict1[key])
        print("sparse meta")
        # Test membership instead of `.get(key) is not None`: a key that is
        # present with a None value must not be reported as missing.
        if key in dict2:
            print(dict2[key])
        else:
            print(f"MISSING KEY {key}")


In [21]:
# Narrow both dicts to the data-identification section for closer inspection.
filled_meta3 = filled_meta2["ns0:identificationInfo"]["ns0:MD_DataIdentification"]
sparse_meta3 = sparse_meta2["ns0:identificationInfo"]["ns0:MD_DataIdentification"]

In [30]:
# need to copy part of citation
filled_meta3

{'ns0:citation': {'ns0:CI_Citation': {'ns0:title': {'ns1:CharacterString': 'ca_hq_transit_stops'},
   'ns0:date': {'ns0:CI_Date': {'ns0:date': {'ns1:Date': '2022-09-14'},
     'ns0:dateType': {'ns0:CI_DateTypeCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_DateTypeCode',
       'codeListValue': 'creation',
       'codeSpace': 'ISOTC211/19115',
       'text': 'creation'}}}},
   'ns0:citedResponsibleParty': {'ns0:CI_ResponsibleParty': {'ns0:individualName': {'ns1:CharacterString': 'My Name'},
     'ns0:organisationName': {'ns1:CharacterString': 'Caltrans'},
     'ns0:role': {'ns0:CI_RoleCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_RoleCode',
       'codeListValue': 'publisher',
       'codeSpace': 'ISOTC211/19115',
       'text': 'publisher'}}}},
   'ns0:presentationForm': {'ns0:CI_PresentationFormCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_PresentationFormCod

In [28]:
# Citation entry of the filled-out metadata.
filled_meta3["ns0:citation"]['ns0:CI_Citation']

{'ns0:title': {'ns1:CharacterString': 'ca_hq_transit_stops'},
 'ns0:date': {'ns0:CI_Date': {'ns0:date': {'ns1:Date': '2022-09-14'},
   'ns0:dateType': {'ns0:CI_DateTypeCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_DateTypeCode',
     'codeListValue': 'creation',
     'codeSpace': 'ISOTC211/19115',
     'text': 'creation'}}}},
 'ns0:citedResponsibleParty': {'ns0:CI_ResponsibleParty': {'ns0:individualName': {'ns1:CharacterString': 'My Name'},
   'ns0:organisationName': {'ns1:CharacterString': 'Caltrans'},
   'ns0:role': {'ns0:CI_RoleCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_RoleCode',
     'codeListValue': 'publisher',
     'codeSpace': 'ISOTC211/19115',
     'text': 'publisher'}}}},
 'ns0:presentationForm': {'ns0:CI_PresentationFormCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_PresentationFormCode',
   'codeListValue': 'mapDigital',
   'codeSpace': 'ISOTC2

In [29]:
# Citation entry of the sparse metadata, for comparison with the filled-out one.
sparse_meta3["ns0:citation"]["ns0:CI_Citation"]

{'ns0:title': {'ns1:CharacterString': 'ca_transit_routes'},
 'ns0:date': {'ns1:nilReason': 'missing'},
 'ns0:presentationForm': {'ns0:CI_PresentationFormCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_PresentationFormCode',
   'codeListValue': 'mapDigital',
   'codeSpace': 'ISOTC211/19115',
   'text': 'mapDigital'}}}

In [20]:
# Compare the identification sections of the filled-out and sparse metadata.
compare_id_info(filled_meta2, sparse_meta2)

**********ns0:citation**********
filled meta
{'ns0:CI_Citation': {'ns0:title': {'ns1:CharacterString': 'ca_hq_transit_stops'}, 'ns0:date': {'ns0:CI_Date': {'ns0:date': {'ns1:Date': '2022-09-14'}, 'ns0:dateType': {'ns0:CI_DateTypeCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_DateTypeCode', 'codeListValue': 'creation', 'codeSpace': 'ISOTC211/19115', 'text': 'creation'}}}}, 'ns0:citedResponsibleParty': {'ns0:CI_ResponsibleParty': {'ns0:individualName': {'ns1:CharacterString': 'My Name'}, 'ns0:organisationName': {'ns1:CharacterString': 'Caltrans'}, 'ns0:role': {'ns0:CI_RoleCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_RoleCode', 'codeListValue': 'publisher', 'codeSpace': 'ISOTC211/19115', 'text': 'publisher'}}}}, 'ns0:presentationForm': {'ns0:CI_PresentationFormCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_PresentationFormCode', 'codeListValue': 'mapDigital', 'co

In [None]:
# Top-level key to inspect; change it to browse a different entry.
explore_me = "ns0:identificationInfo"
filled_meta2[explore_me]['ns0:MD_DataIdentification']

In [None]:
# Same entry in the sparse metadata; relies on explore_me set in the previous cell.
sparse_meta2[explore_me]['ns0:MD_DataIdentification']

In [None]:
# Lift necessary stuff from 1st time through shp to file gdb
def lift_missing_dataset_elements(metadata_json):
    """Collect the entries to carry over from a parsed metadata dict.

    Args:
        metadata_json: Parsed metadata dict containing a top-level
            "ns0:MD_Metadata" entry.

    Returns:
        A new dict holding the "ns0:dataQualityInfo" and "xmlns:ns2"
        values found under "ns0:MD_Metadata".

    Raises:
        KeyError: If "ns0:MD_Metadata" or either lifted key is absent.
    """
    root = metadata_json["ns0:MD_Metadata"]

    # Data quality first, then the metadata namespace declaration.
    keys_to_lift = ("ns0:dataQualityInfo", "xmlns:ns2")

    return {name: root[name] for name in keys_to_lift}

# NOTE(review): `meta` is not defined in the visible cells. The function
# indexes ["ns0:MD_Metadata"], so it presumably expects a full parsed dict
# such as filled_meta -- confirm.
lift_missing_dataset_elements(meta)

In [None]:
from pydantic import BaseModel
from typing import Dict, List

class metadata_input(BaseModel):
    dataset_name: str
    publish_entity: str = "California Integrated Travel Project"
    abstract: str
    purpose: str
    beginning_date: str
    end_date: str
    place: str = "California"
    status: str = "Complete"
    frequency: str = "Monthly"
    theme_topics: Dict
    methodology: str
    data_dict_type: str
    data_dict_url: str
    contact_organization: str = "Caltrans"
    contact_person: str
    contact_email: str = "hello@calitp.org"
    horiz_accuracy: str = "0.00004 decimal degrees"