In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
import xmltodict

from pydantic import BaseModel
from typing import List, Dict

import validation_pro 

METADATA_FOLDER = "metadata_xml/"



In [2]:
# Project-local helper module; its xml_to_json function is used throughout this notebook.
import metadata_update_desktop

# Path of the default template XML, located inside METADATA_FOLDER.
DEFAULT_XML = f"./{METADATA_FOLDER}default_pro.xml"
# Template loaded as a nested, dict-like structure keyed by namespaced element names.
default_template = metadata_update_desktop.xml_to_json(DEFAULT_XML)

Loading XML as JSON from ./metadata_xml/default_pro.xml


Reminder: check the `uuidref` attribute. If we keep lifting sections from the default template, this part would be copied over as well.
```
default2 = default_template["ns0:MD_Metadata"]
(default2["ns0:identificationInfo"]
 ["ns0:MD_DataIdentification"]["ns0:descriptiveKeywords"][0]["ns0:MD_Keywords"]["ns0:thesaurusName"]
)
```

In [3]:
# Dataset-specific metadata XML, loaded into the same nested-dict form as the template.
XML_FILE = "./ca_transit_routes.xml"
esri_metadata = metadata_update_desktop.xml_to_json(XML_FILE)

Loading XML as JSON from ./ca_transit_routes.xml


In [4]:
def lift_necessary_dataset_elements(metadata_json: dict) -> dict:
    """Collect the dataset-specific sections of a parsed metadata document.

    Args:
        metadata_json: Parsed metadata whose top-level key is "ns0:MD_Metadata".

    Returns:
        A new dict holding, in this order, the date stamp, spatial
        representation info, coordinate reference system info and
        distribution info, each under its "ns0:"-prefixed key. The values are
        the same objects found in ``metadata_json`` (no copies are made).

    Raises:
        KeyError: If "ns0:MD_Metadata" or any of the four sections is missing.
    """
    prefix = "ns0:"
    root = metadata_json[f"{prefix}MD_Metadata"]

    # Element names to carry over, listed in the order they are looked up.
    wanted = (
        "dateStamp",
        "spatialRepresentationInfo",
        "referenceSystemInfo",
        "distributionInfo",
    )

    return {f"{prefix}{name}": root[f"{prefix}{name}"] for name in wanted}

In [5]:
def overwrite_default_with_dataset_elements(metadata_json: dict) -> dict:
    """Load the default template and swap in this dataset's own sections.

    Args:
        metadata_json: Parsed metadata for the dataset, as accepted by
            ``lift_necessary_dataset_elements``.

    Returns:
        The freshly loaded default template, with every section returned by
        ``lift_necessary_dataset_elements`` that also exists in the template
        replaced by the dataset's version. Sections absent from the template
        are not added.
    """
    template_path = f"./{METADATA_FOLDER}default_pro.xml"
    template = metadata_update_desktop.xml_to_json(template_path)
    template_root = template["ns0:MD_Metadata"]

    # Sections that must come from the dataset rather than the template
    dataset_elements = lift_necessary_dataset_elements(metadata_json)

    # Only replace keys the template already has; everything else stays as loaded
    for element_name, element_value in dataset_elements.items():
        if element_name in template_root:
            template_root[element_name] = element_value

    return template

In [22]:
# Default template with the dataset's own sections swapped in; the cells below inspect and fill it.
esri2 = overwrite_default_with_dataset_elements(esri_metadata)

Loading XML as JSON from ./metadata_xml/default_pro.xml



### Now find every place where the values supplied through the BaseModel are used

In [7]:
# Example inputs for the ca_transit_routes dataset. The overwrite_* functions in this
# notebook read these keys and write them into the metadata template.
# NOTE(review): those functions also read d["theme_topic"], which is not set here --
# presumably filled in by metadata_input; confirm.
ROUTES_DICT = {
    "dataset_name": "ca_transit_routes", 
    "publish_entity": "California Integrated Travel Project", 

    "abstract": "Public. EPSG: 4326",
    "purpose": "my purpose", # purpose, also repeat for description?

    # NOTE(review): beginning_date uses "/" while end_date uses "-". Both later get
    # "T00:00:00" appended for the temporal extent -- confirm the separators are
    # normalised by fix_values_in_validated_dict / metadata_input before that point.
    "beginning_date": "2022/09/14",
    "end_date": "2022-10-14",
    "place": "California",

    "status": "completed", 
    "frequency": "monthly",
    
    "theme_keywords": ["1", "2", "3", "4", "5"], #tags

    "methodology": "this is my methodology", 
    "contact_organization": "Caltrans", 
    "contact_person": "Tiffany Chu", 
    "contact_email": "tiffany.chu@dot.ca.gov",
    
    "horiz_accuracy": "4 meters",
}


In [31]:
# Inspect the CI_Contact block currently stored in esri2 (shown in the cell output).
(esri2["ns0:MD_Metadata"]["ns0:contact"]
 ["ns0:CI_ResponsibleParty"]["ns0:contactInfo"]
 ["ns0:CI_Contact"]
)

{'ns0:address': {'ns0:CI_Address': {'ns0:electronicMailAddress': {'ns1:CharacterString': 'hello@calitp.org'}}}}

In [8]:
import metadata_update_pro
# NOTE(review): this rebinds ROUTES_DICT to the helper's return value, so re-running
# the cell feeds already-processed values back in -- confirm the helper is safe to
# apply twice.
ROUTES_DICT = metadata_update_pro.fix_values_in_validated_dict(ROUTES_DICT) 

In [34]:
def overwrite_metadata_json(metadata_json: dict, 
                            dataset_info: dict) -> dict:
    """Return a copy of the metadata with the dataset's values written in.

    Args:
        metadata_json: Parsed metadata whose top-level key is
            "ns0:MD_Metadata". It is left untouched.
        dataset_info: Dataset inputs. Keys read: abstract, purpose,
            theme_keywords, theme_topic, place, dataset_name, beginning_date,
            end_date, status, frequency, publish_entity, contact_organization,
            contact_person, contact_email, horiz_accuracy, methodology.

    Returns:
        A deep copy of ``metadata_json`` with the identification, contact and
        data-quality sections overwritten from ``dataset_info``.

    Raises:
        KeyError: If an expected element or ``dataset_info`` key is missing.
    """
    # Function-scope import so this cell does not depend on the import cell.
    import copy

    d = dataset_info
    # dict.copy() is shallow: the nested dicts would still be shared with the
    # caller's object, so every write below would also change metadata_json.
    new_metadata = copy.deepcopy(metadata_json)

    # This prefix keeps coming up, but xmltodict has trouble processing or replacing it
    x = "ns0:"
    # This is how most values are keyed in for last dict
    key = "ns1:CharacterString"
    key_dt = "ns1:Date"

    m = new_metadata[f"{x}MD_Metadata"]

    ## Identification Info
    id_info = m[f"{x}identificationInfo"][f"{x}MD_DataIdentification"]
    id_info[f"{x}abstract"][key] = d["abstract"]
    id_info[f"{x}purpose"][key] = d["purpose"]
    # Theme keywords go into the second descriptiveKeywords entry
    (id_info[f"{x}descriptiveKeywords"][1]
     [f"{x}MD_Keywords"][f"{x}keyword"]) = d["theme_keywords"]
    id_info[f"{x}topicCategory"][f"{x}MD_TopicCategoryCode"] = d["theme_topic"]
    id_info[f"{x}extent"][f"{x}EX_Extent"][f"{x}description"][key] = d["place"]

    citation_info = id_info[f"{x}citation"][f"{x}CI_Citation"]
    citation_info[f"{x}title"][key] = d["dataset_name"]
    citation_info[f"{x}date"][f"{x}CI_Date"][f"{x}date"][key_dt] = d["beginning_date"]

    # Code-list elements carry the value both as an attribute and as text
    status_info = id_info[f"{x}status"][f"{x}MD_ProgressCode"]
    status_info["codeListValue"] = d["status"]
    status_info["text"] = d["status"]

    maint_info = id_info[f"{x}resourceMaintenance"][f"{x}MD_MaintenanceInformation"]
    (maint_info[f"{x}maintenanceAndUpdateFrequency"]
     [f"{x}MD_MaintenanceFrequencyCode"]["codeListValue"]) = d["frequency"]
    (maint_info[f"{x}maintenanceAndUpdateFrequency"]
     [f"{x}MD_MaintenanceFrequencyCode"]["text"]) = d["frequency"]
    maint_info[f"{x}dateOfNextUpdate"][key_dt] = d["end_date"]

    extent_info = (id_info[f"{x}extent"][f"{x}EX_Extent"]
                   [f"{x}temporalElement"][f"{x}EX_TemporalExtent"]
                   [f"{x}extent"]["ns2:TimePeriod"])

    # Temporal extent positions are the dates with a midnight time appended
    extent_info["ns2:beginPosition"] = d["beginning_date"] + "T00:00:00"
    extent_info["ns2:endPosition"] = d["end_date"] + "T00:00:00"

    ## Contact Info
    contact_info = m[f"{x}contact"][f"{x}CI_ResponsibleParty"]

    contact_info[f"{x}positionName"][key] = d["publish_entity"]
    contact_info[f"{x}organisationName"][key] = d["contact_organization"]
    contact_info[f"{x}individualName"][key] = d["contact_person"]

    (contact_info[f"{x}contactInfo"][f"{x}CI_Contact"]
     [f"{x}address"][f"{x}CI_Address"]
     [f"{x}electronicMailAddress"][key]) = d["contact_email"]

    ## Data Quality
    data_qual_info = m[f"{x}dataQualityInfo"][f"{x}DQ_DataQuality"]
    (data_qual_info[f"{x}report"][f"{x}DQ_RelativeInternalPositionalAccuracy"]
     [f"{x}measureDescription"][key]) = d["horiz_accuracy"]

    (data_qual_info[f"{x}lineage"][f"{x}LI_Lineage"]
     [f"{x}processStep"][f"{x}LI_ProcessStep"]
     [f"{x}description"][key]) = d["methodology"]

    ## Need edition and resource contact added to be approved 
    # Add edition 
    # Use number instead of date (shows up when exported in FGDC)
    #NEW_EDITION = validation.check_edition_add_one(m)
    #m["idinfo"]["citation"]["citeinfo"]["edition"] = NEW_EDITION

    #m["eainfo"]["detailed"]["enttyp"]["enttypd"] = d["data_dict_type"]    
    #m["eainfo"]["detailed"]["enttyp"]["enttypds"] = d["data_dict_url"]    

    return new_metadata 


In [33]:
# Build the validated inputs in this cell: DATASET_INFO_VALIDATED is otherwise first
# assigned in a later cell, so on Restart & Run All this call raised NameError.
DATASET_INFO_VALIDATED = metadata_update_pro.metadata_input(**ROUTES_DICT).dict()
updated_meta = overwrite_metadata_json(esri2, DATASET_INFO_VALIDATED)

In [35]:
def overwrite_identification_info(metadata: dict, dataset_info: dict) -> dict:
    """Write the dataset's identification details into ``metadata`` in place.

    Args:
        metadata: The contents of the "ns0:MD_Metadata" element.
        dataset_info: Dataset inputs. Keys read: abstract, purpose,
            theme_keywords, theme_topic, place, dataset_name, beginning_date,
            end_date, status, frequency.

    Returns:
        The same ``metadata`` object that was passed in, after modification.
    """
    ns = "ns0:"
    text_key = "ns1:CharacterString"
    date_key = "ns1:Date"

    ident = metadata[f"{ns}identificationInfo"][f"{ns}MD_DataIdentification"]

    # Free-text descriptors
    ident[f"{ns}abstract"][text_key] = dataset_info["abstract"]
    ident[f"{ns}purpose"][text_key] = dataset_info["purpose"]

    # Theme keywords sit in the second descriptiveKeywords entry
    theme_block = ident[f"{ns}descriptiveKeywords"][1][f"{ns}MD_Keywords"]
    theme_block[f"{ns}keyword"] = dataset_info["theme_keywords"]

    ident[f"{ns}topicCategory"][f"{ns}MD_TopicCategoryCode"] = dataset_info["theme_topic"]

    ex_extent = ident[f"{ns}extent"][f"{ns}EX_Extent"]
    ex_extent[f"{ns}description"][text_key] = dataset_info["place"]

    # Citation: title and date
    citation = ident[f"{ns}citation"][f"{ns}CI_Citation"]
    citation[f"{ns}title"][text_key] = dataset_info["dataset_name"]
    citation_date = citation[f"{ns}date"][f"{ns}CI_Date"][f"{ns}date"]
    citation_date[date_key] = dataset_info["beginning_date"]

    # Code-list elements store the value both as an attribute and as text
    progress = ident[f"{ns}status"][f"{ns}MD_ProgressCode"]
    for attr in ("codeListValue", "text"):
        progress[attr] = dataset_info["status"]

    maintenance = ident[f"{ns}resourceMaintenance"][f"{ns}MD_MaintenanceInformation"]
    for attr in ("codeListValue", "text"):
        (maintenance[f"{ns}maintenanceAndUpdateFrequency"]
         [f"{ns}MD_MaintenanceFrequencyCode"][attr]) = dataset_info["frequency"]
    maintenance[f"{ns}dateOfNextUpdate"][date_key] = dataset_info["end_date"]

    # Temporal extent positions are the dates with a midnight time appended
    temporal = ex_extent[f"{ns}temporalElement"][f"{ns}EX_TemporalExtent"]
    period = temporal[f"{ns}extent"]["ns2:TimePeriod"]
    period["ns2:beginPosition"] = dataset_info["beginning_date"] + "T00:00:00"
    period["ns2:endPosition"] = dataset_info["end_date"] + "T00:00:00"

    return metadata
    
    
def overwrite_contact_info(metadata: dict, dataset_info: dict) -> dict: 
    """Write the dataset's contact details into ``metadata`` in place.

    Args:
        metadata: The contents of the "ns0:MD_Metadata" element.
        dataset_info: Dataset inputs. Keys read: publish_entity,
            contact_organization, contact_person, contact_email.

    Returns:
        The same ``metadata`` object that was passed in, after modification.
    """
    ns = "ns0:"
    text_key = "ns1:CharacterString"

    party = metadata[f"{ns}contact"][f"{ns}CI_ResponsibleParty"]

    # (metadata element, dataset_info key) pairs, applied in this order
    field_sources = (
        ("positionName", "publish_entity"),
        ("organisationName", "contact_organization"),
        ("individualName", "contact_person"),
    )
    for element, source in field_sources:
        party[f"{ns}{element}"][text_key] = dataset_info[source]

    # The e-mail address is nested several levels below contactInfo
    contact = party[f"{ns}contactInfo"][f"{ns}CI_Contact"]
    address = contact[f"{ns}address"][f"{ns}CI_Address"]
    address[f"{ns}electronicMailAddress"][text_key] = dataset_info["contact_email"]

    return metadata


def overwrite_data_quality_info(metadata: dict, dataset_info: dict) -> dict:
    """Write the dataset's data-quality details into ``metadata`` in place.

    Args:
        metadata: The contents of the "ns0:MD_Metadata" element.
        dataset_info: Dataset inputs. Keys read: horiz_accuracy, methodology.

    Returns:
        The same ``metadata`` object that was passed in, after modification.
    """
    ns = "ns0:"
    text_key = "ns1:CharacterString"

    quality = metadata[f"{ns}dataQualityInfo"][f"{ns}DQ_DataQuality"]

    # Horizontal accuracy goes into the positional accuracy report
    accuracy_report = quality[f"{ns}report"][f"{ns}DQ_RelativeInternalPositionalAccuracy"]
    accuracy_report[f"{ns}measureDescription"][text_key] = dataset_info["horiz_accuracy"]

    # Methodology goes into the description of the lineage process step
    lineage = quality[f"{ns}lineage"][f"{ns}LI_Lineage"]
    process_step = lineage[f"{ns}processStep"][f"{ns}LI_ProcessStep"]
    process_step[f"{ns}description"][text_key] = dataset_info["methodology"]

    return metadata
    


In [36]:
def overwrite_metadata_json_condensed(metadata_json: dict, 
                            dataset_info: dict) -> dict:
    """Return a copy of the metadata with the dataset's values written in.

    Delegates each section to ``overwrite_identification_info``,
    ``overwrite_contact_info`` and ``overwrite_data_quality_info``.

    Args:
        metadata_json: Parsed metadata whose top-level key is
            "ns0:MD_Metadata". It is left untouched.
        dataset_info: Dataset inputs read by the three section helpers.

    Returns:
        A deep copy of ``metadata_json`` with the identification, contact and
        data-quality sections overwritten from ``dataset_info``.
    """
    # Function-scope import so this cell does not depend on the import cell.
    import copy

    d = dataset_info
    # dict.copy() is shallow: the helpers write into nested dicts, which would
    # still be shared with the caller's metadata_json.
    new_metadata = copy.deepcopy(metadata_json)
    m = new_metadata["ns0:MD_Metadata"]

    # Each helper modifies m in place and returns it.
    m = overwrite_identification_info(m, d)
    m = overwrite_contact_info(m, d)
    # Previously overwrite_contact_info was called a second time here, so the
    # data-quality fields (horiz_accuracy, methodology) were never written.
    m = overwrite_data_quality_info(m, d)

    ## Need edition and resource contact added to be approved 
    # Add edition 
    # Use number instead of date (shows up when exported in FGDC)
    #NEW_EDITION = validation.check_edition_add_one(m)
    #m["idinfo"]["citation"]["citeinfo"]["edition"] = NEW_EDITION

    #m["eainfo"]["detailed"]["enttyp"]["enttypd"] = d["data_dict_type"]    
    #m["eainfo"]["detailed"]["enttyp"]["enttypds"] = d["data_dict_url"]    

    return new_metadata 


In [37]:
# Pass ROUTES_DICT through metadata_update_pro.metadata_input and convert the result to a dict.
# NOTE(review): metadata_input is presumably a pydantic model; if so, `.dict()` is
# deprecated in pydantic v2 in favour of `.model_dump()` -- confirm the installed version.
DATASET_INFO_VALIDATED = metadata_update_pro.metadata_input(**ROUTES_DICT).dict()


In [38]:
# Apply the dataset inputs to the merged template through the per-section helper functions.
updated_xml = overwrite_metadata_json_condensed(esri2, DATASET_INFO_VALIDATED)

In [39]:
# Display the resulting nested dict to spot-check the overwritten values.
updated_xml

{'ns0:MD_Metadata': {'xmlns:ns0': 'http://www.isotc211.org/2005/gmd',
  'xmlns:ns1': 'http://www.isotc211.org/2005/gco',
  'xmlns:ns2': 'http://www.opengis.net/gml/3.2',
  'ns0:language': {'ns0:LanguageCode': {'codeList': 'http://www.loc.gov/standards/iso639-2/php/code_list.php',
    'codeListValue': 'eng',
    'codeSpace': 'ISO639-2',
    'text': 'eng'}},
  'ns0:characterSet': {'ns0:MD_CharacterSetCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_CharacterSetCode',
    'codeListValue': 'utf8',
    'codeSpace': 'ISOTC211/19115',
    'text': 'utf8'}},
  'ns0:hierarchyLevel': {'ns0:MD_ScopeCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_ScopeCode',
    'codeListValue': 'dataset',
    'codeSpace': 'ISOTC211/19115',
    'text': 'dataset'}},
  'ns0:hierarchyLevelName': {'ns1:CharacterString': 'dataset'},
  'ns0:contact': {'ns0:CI_ResponsibleParty': {'ns0:individualName': {'ns1:CharacterString': 'Tiffany Chu'},
 