In [1]:
import copy
import os
import xml.etree.ElementTree as ET
from typing import Dict, List

import pandas as pd
import xmltodict
from pydantic import BaseModel

import validation

# Folder (relative to this notebook) that holds the metadata XML files,
# including the default template `default_pro.xml`.
METADATA_FOLDER = "metadata_xml/"



In [2]:
import metadata_update_desktop

# Load the default template XML through the project helper; the result is
# indexed like a dict later on (top-level key "ns0:MD_Metadata").
DEFAULT_XML = f"./{METADATA_FOLDER}default_pro.xml"
default_template = metadata_update_desktop.xml_to_json(DEFAULT_XML)

Loading XML as JSON from ./metadata_xml/default_pro.xml


In [3]:
# Dataset-specific metadata file, parsed with the same helper used for the
# default template. The path has no placeholders, so it is a plain string
# literal rather than an f-string.
XML_FILE = "./ca_transit_routes.xml"
esri_metadata = metadata_update_desktop.xml_to_json(XML_FILE)

Loading XML as JSON from ./ca_transit_routes.xml


In [4]:
def lift_necessary_dataset_elements(metadata_json: dict) -> dict:
    """
    Pull the dataset-specific sections out of a metadata dict.

    Args:
        metadata_json: metadata whose top-level key is "ns0:MD_Metadata".

    Returns:
        A new dict holding only the date stamp, spatial representation,
        reference system, distribution and identification sections, in that
        order. Values are the same objects as in the input (no copying).

    Raises:
        KeyError: if "ns0:MD_Metadata" or any of the five sections is absent.
    """
    root = metadata_json["ns0:MD_Metadata"]

    # Sections to carry over; the tuple order fixes the key order of the result.
    wanted_sections = (
        "ns0:dateStamp",                  # Date Stamp
        "ns0:spatialRepresentationInfo",  # Spatial Representation Info
        "ns0:referenceSystemInfo",        # Coordinate Reference System Info
        "ns0:distributionInfo",           # Distribution Info
        "ns0:identificationInfo",         # Identification info
    )

    return {section: root[section] for section in wanted_sections}

In [5]:
def overwrite_default_with_dataset_elements(metadata_json: dict) -> dict:
    """
    Load the default template and fill it with this dataset's sections.

    The template is re-read from disk on every call. Only keys that already
    exist under the template's "ns0:MD_Metadata" are replaced; no new keys
    are added.

    Args:
        metadata_json: the dataset's metadata, passed to
            `lift_necessary_dataset_elements`.

    Returns:
        The full template (including its "ns0:MD_Metadata" wrapper) with the
        dataset's sections substituted in.
    """
    template_path = f"./{METADATA_FOLDER}default_pro.xml"
    template = metadata_update_desktop.xml_to_json(template_path)
    template_root = template["ns0:MD_Metadata"]

    # Sections taken from the dataset's own metadata
    dataset_sections = lift_necessary_dataset_elements(metadata_json)

    # Decide which keys to replace before assigning, then swap them in.
    # Assigning to existing keys keeps the template's key order.
    keys_to_replace = [
        name for name in template_root if name in dataset_sections
    ]
    for name in keys_to_replace:
        template_root[name] = dataset_sections[name]

    return template

In [6]:
# Default template with the dataset's own sections substituted in.
default2 = overwrite_default_with_dataset_elements(esri_metadata)

Loading XML as JSON from ./metadata_xml/default_pro.xml


### Next: find every place where a field of the `metadata_input` BaseModel is used

In [None]:
class metadata_input(BaseModel):
    """
    User-supplied values for a dataset's metadata.

    Fields without a default are required. The field names match the
    `dataset_info` keys read by `overwrite_metadata_json`.

    NOTE(review): `overwrite_metadata_json` also reads "dataset_name",
    "abstract", "purpose" and "publish_entity", which are not declared
    here -- confirm whether they should be added.
    """
    # Start and end of the time period the data covers (stored as strings)
    beginning_date: str
    end_date: str
    place: str = "California"
    status: str = "Complete"
    # Update frequency
    frequency: str = "Monthly"
    # Written as-is into the metadata's keywords entry
    theme_topics: Dict
    methodology: str
    # Data dictionary type and where to find it
    data_dict_type: str
    data_dict_url: str
    # Contact details
    contact_organization: str = "Caltrans"
    contact_person: str
    contact_email: str = "hello@calitp.org"
    # Horizontal positional accuracy
    horiz_accuracy: str = "4 meters"

In [7]:
# Unwrap the top-level element so the sections can be indexed directly.
default3 = default2["ns0:MD_Metadata"]

In [35]:
# Inspect the contact block to see which nested keys hold the contact values.
default3["ns0:contact"]["ns0:CI_ResponsibleParty"]

{'ns0:individualName': {'ns1:CharacterString': 'My Name'},
 'ns0:organisationName': {'ns1:CharacterString': 'Caltrans'},
 'ns0:positionName': {'ns1:CharacterString': 'California Integrated Travel Project'},
 'ns0:contactInfo': {'ns0:CI_Contact': {'ns0:address': {'ns0:CI_Address': {'ns0:electronicMailAddress': {'ns1:CharacterString': 'hello@calitp.org'}}}}},
 'ns0:role': {'ns0:CI_RoleCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#CI_RoleCode',
   'codeListValue': 'publisher',
   'codeSpace': 'ISOTC211/19115',
   'text': 'publisher'}}}

In [36]:
#default3

In [37]:
#(default3["ns0:identificationInfo"]["ns0:MD_DataIdentification"]
#)

In [None]:
def overwrite_metadata_json(metadata_json: dict, dataset_info: dict) -> dict:
    """
    Fill a metadata dict with dataset-specific values and return the result.

    The input is deep-copied first, so `metadata_json` and every nested dict
    inside it are left untouched; only the returned dict carries the new
    values.

    Args:
        metadata_json: metadata whose top-level key is "ns0:MD_Metadata".
        dataset_info: values to write. Keys read here: "dataset_name",
            "abstract", "purpose", "publish_entity", "beginning_date",
            "end_date", "place", "status", "frequency", "theme_topics",
            "contact_organization", "contact_person", "contact_email",
            "horiz_accuracy", "methodology", "data_dict_type",
            "data_dict_url".

    Returns:
        A new dict with the same structure as `metadata_json` and the values
        above written into it.

    Raises:
        KeyError: if `dataset_info` lacks one of the keys listed above, or if
            `metadata_json` lacks one of the nested paths written below.
    """
    d = dataset_info
    # dict.copy() is shallow: the nested dicts would still be shared with the
    # caller's `metadata_json`, so every assignment below would also change
    # the input. A deep copy keeps the input intact.
    new_metadata = copy.deepcopy(metadata_json)

    # This prefix keeps coming up, but xmltodict has trouble processing or replacing it
    x = "ns0:"
    # This is how most values are keyed in for last dict
    key = "ns1:CharacterString"

    m = new_metadata[f"{x}MD_Metadata"]

    ## Identification Info
    id_info = m[f"{x}identificationInfo"][f"{x}MD_DataIdentification"]

    id_info[f"{x}citation"][f"{x}CI_Citation"][f"{x}title"][key] = d["dataset_name"]
    id_info[f"{x}abstract"][key] = d["abstract"]
    id_info[f"{x}purpose"][key] = d["purpose"]

    ## Contact info
    contact_info = m[f"{x}contact"][f"{x}CI_ResponsibleParty"]

    contact_info[f"{x}positionName"][key] = d["publish_entity"]

    # NOTE(review): every path from here on ("idinfo", "dataqual", "eainfo",
    # "metainfo") is un-prefixed, unlike the "ns0:" paths above. Confirm that
    # both sets of keys really exist under "ns0:MD_Metadata"; otherwise the
    # lines below raise KeyError.

    ## Need edition and resource contact added to be approved
    # Add edition
    # Use number instead of date (shows up when exported in FGDC)
    NEW_EDITION = validation.check_edition_add_one(m)
    m["idinfo"]["citation"]["citeinfo"]["edition"] = NEW_EDITION

    m["idinfo"]["timeperd"]["timeinfo"]["rngdates"]["begdate"] = d["beginning_date"]
    m["idinfo"]["timeperd"]["timeinfo"]["rngdates"]["enddate"] = d["end_date"]
    m["idinfo"]["timeperd"]["current"] = d["place"]

    m["idinfo"]["status"]["progress"] = d["status"]
    m["idinfo"]["status"]["update"] = d["frequency"]

    m["idinfo"]["keywords"] = d["theme_topics"]

    m["idinfo"]["ptcontac"]["cntinfo"]["cntorgp"]["cntorg"] = d["contact_organization"]
    m["idinfo"]["ptcontac"]["cntinfo"]["cntorgp"]["cntper"] = d["contact_person"]
    m["idinfo"]["ptcontac"]["cntinfo"]["cntpos"] = d["publish_entity"]
    m["idinfo"]["ptcontac"]["cntinfo"]["cntemail"] = d["contact_email"]

    m["dataqual"]["posacc"]["horizpa"]["horizpar"] = d["horiz_accuracy"]
    m["dataqual"]["lineage"]["procstep"]["procdesc"] = d["methodology"]

    m["eainfo"]["detailed"]["enttyp"]["enttypl"] = d["dataset_name"]
    m["eainfo"]["detailed"]["enttyp"]["enttypd"] = d["data_dict_type"]
    m["eainfo"]["detailed"]["enttyp"]["enttypds"] = d["data_dict_url"]

    m["metainfo"]["metc"]["cntinfo"]["cntorgp"]["cntorg"] = d["contact_organization"]
    m["metainfo"]["metc"]["cntinfo"]["cntorgp"]["cntper"] = d["contact_person"]
    m["metainfo"]["metc"]["cntinfo"]["cntpos"] = d["publish_entity"]
    m["metainfo"]["metc"]["cntinfo"]["cntemail"] = d["contact_email"]

    return new_metadata
