In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
import xmltodict

from pydantic import BaseModel
from typing import List, Dict

import validation 

METADATA_FOLDER = "metadata_xml/"



In [2]:
import metadata_update_desktop

DEFAULT_XML = f"./{METADATA_FOLDER}default_pro.xml"
default_template = metadata_update_desktop.xml_to_json(DEFAULT_XML)

Loading XML as JSON from ./metadata_xml/default_pro.xml


Reminder to check out `uuidref`...if we keep lifting template, we would copy this part over.
```
default2 = default_template["ns0:MD_Metadata"]
(default2["ns0:identificationInfo"]
 ["ns0:MD_DataIdentification"]["ns0:descriptiveKeywords"][0]["ns0:MD_Keywords"]["ns0:thesaurusName"]
)
```

In [5]:
XML_FILE = f"./ca_transit_routes.xml"
esri_metadata = metadata_update_desktop.xml_to_json(XML_FILE)

Loading XML as JSON from ./ca_transit_routes.xml


In [6]:
def lift_necessary_dataset_elements(metadata_json: dict) -> dict:
    
    m = metadata_json["ns0:MD_Metadata"]
    
    # This namespace prefix is used a lot
    x = "ns0:"
    
    # Store this info in a dictionary
    d = {}
        
    # Date Stamp
    d[f"{x}dateStamp"] = m[f"{x}dateStamp"] 
    
    # Spatial Representation Info
    d[f"{x}spatialRepresentationInfo"] = m[f"{x}spatialRepresentationInfo"] 
   
    # Coordinate Reference System Info
    d[f"{x}referenceSystemInfo"] = m[f"{x}referenceSystemInfo"] 
    
    # Distribution Info
    d[f"{x}distributionInfo"] = m[f"{x}distributionInfo"]   
    
    return d

In [7]:
def overwrite_default_with_dataset_elements(metadata_json: dict) -> dict:
    DEFAULT_XML = f"./{METADATA_FOLDER}default_pro.xml"
    default_template = metadata_update_desktop.xml_to_json(DEFAULT_XML)
    default = default_template["ns0:MD_Metadata"]
    
    # Grab the necessary elements from my dataset
    necessary_elements = lift_necessary_dataset_elements(metadata_json)
    
    # Overwrite it in the default template
    for key, value in default.items():
        if key in necessary_elements.keys():
            default[key] = necessary_elements[key] 
        else:
            default[key] = default[key]
            
    # Return the default template, but now with our dataset's info populated
    return default_template

In [8]:
esri2 = overwrite_default_with_dataset_elements(esri_metadata)

Loading XML as JSON from ./metadata_xml/default_pro.xml



### Now find all instances where what's inputted in BaseModel is used

In [None]:
class metadata_input(BaseModel):
    beginning_date: str
    end_date: str
    place: str = "California"
    status: str = "Complete"
    frequency: str = "Monthly"
    theme_topics: Dict
    data_dict_type: str
    data_dict_url: str

In [88]:
esri3 = esri2["ns0:MD_Metadata"]
x="ns0:"

(esri3["ns0:identificationInfo"]["ns0:MD_DataIdentification"]
 ["ns0:descriptiveKeywords"][1]["ns0:MD_Keywords"]
 ["ns0:keyword"]
)


list

In [76]:
# Add to class BaseModel
topic = "transportation"

In [84]:
keyword_list = fill_in_keyword_list(my_list)

In [80]:
def fill_in_keyword_list(keyword_list: list = []) -> list[dict]:
    if len(keyword_list) >= 5:
        filled_out_list =  [{"ns1:CharacterString": i} for i in keyword_list]
        return filled_out_list
    else:
        return "Input minimum 5 keywords"

In [20]:
esri3.keys()

dict_keys(['xmlns:ns0', 'xmlns:ns1', 'xmlns:ns2', 'ns0:language', 'ns0:characterSet', 'ns0:hierarchyLevel', 'ns0:hierarchyLevelName', 'ns0:contact', 'ns0:dateStamp', 'ns0:metadataStandardName', 'ns0:metadataStandardVersion', 'ns0:spatialRepresentationInfo', 'ns0:referenceSystemInfo', 'ns0:identificationInfo', 'ns0:distributionInfo', 'ns0:dataQualityInfo'])

In [None]:
#(esri3["ns0:identificationInfo"]["ns0:MD_DataIdentification"]
#)

In [90]:
def overwrite_metadata_json(metadata_json: dict, dataset_info: dict) -> dict:
    d = dataset_info
    new_metadata = metadata_json.copy()
    
    # This prefix keeps coming up, but xmltodict has trouble processing or replacing it
    x = "ns0:"
    # This is how most values are keyed in for last dict
    key = "ns1:CharacterString"
    
    m = new_metadata[f"{x}MD_Metadata"]
    
    ## Identification Info
    id_info = m[f"{x}identificationInfo"][f"{x}MD_DataIdentification"]
    id_info[f"{x}abstract"][key] = d["abstract"]
    id_info[f"{x}purpose"][key] = d["purpose"]
    (id_info[f"{x}descriptiveKeywords"][1]
     [f"{x}MD_Keywords"][f"{x}keyword"]) = d["theme_keywords"]
    

    
    citation_info = id_info[f"{x}citation"][f"{x}CI_Citation"]
    citation_info[f"{x}title"][key] = d["dataset_name"]
    citation_info[f"{x}date"][f"{x}CI_Date"][f"{x}date"]["ns1:Date"] = d["beginning_date"]
    
    status_info = id_info[f"{x}status"][f"{x}MD_ProgressCode"]
    status_info["codeListValue"] = d["status"]
    status_info["text"] = d["status"]
    
    maint_info = id_info[f"{x}resourceMaintenance"][f"{x}MD_MaintenanceInformation"]
    
    (maint_info[f"{x}maintenanceAndUpdateFrequency"]
     [f"{x}MD_MaintenanceFrequencyCode"]["codeListValue"]) = d["frequency"]
    (maint_info[f"{x}maintenanceAndUpdateFrequency"]
     [f"{x}MD_MaintenanceFrequencyCode"]["text"]) = d["frequency"]
    maint_info[f"{x}dateOfNextUpdate"]["ns1:Date"] = d["end_date"]
    
    ## Contact Info
    contact_info = m[f"{x}contact"][f"{x}CI_ResponsibleParty"]
    
    contact_info[f"{x}positionName"][key] = d["publish_entity"]
    contact_info[f"{x}organisationName"][key] = d["contact_organization"]
    contact_info[f"{x}individualName"][key] = d["contact_person"]
    
    (contact_info[f"{x}contactInfo"][f"{x}contactInfo"]
     [f"{x}CI_Contact"][f"{x}address"][f"{x}CI_Address"]
     [f"{x}electronicMailAddress"][key]) =  d["contact_email"] 
    
    ## Data Quality
    data_qual_info = m[f"{x}dataQualityInfo"][f"{x}DQ_DataQuality"]
    (data_qual_info[f"{x}report"][f"{x}DQ_RelativeInternalPositionalAccuracy"]
     [f"{x}measureDescription"][key]) = d["horiz_accuracy"]
    
    (data_qual_info[f"{x}lineage"][f"{x}LI_Lineage"]
     [f"{x}processStep"][f"{x}LI_ProcessStep"]
     [f"{x}description"][key]) = d["methodology"]
    
    ## Need edition and resource contact added to be approved 
    # Add edition 
    # Use number instead of date (shows up when exported in FGDC)
    NEW_EDITION = validation.check_edition_add_one(m)
    m["idinfo"]["citation"]["citeinfo"]["edition"] = NEW_EDITION
        
    m["idinfo"]["timeperd"]["current"] = d["place"]
    
    m["idinfo"]["keywords"] = d["theme_topics"]    

    m["idinfo"]["ptcontac"]["cntinfo"]["cntpos"] = d["publish_entity"]
        
    m["eainfo"]["detailed"]["enttyp"]["enttypl"] = d["dataset_name"]    
    m["eainfo"]["detailed"]["enttyp"]["enttypd"] = d["data_dict_type"]    
    m["eainfo"]["detailed"]["enttyp"]["enttypds"] = d["data_dict_url"]    
  
    m["metainfo"]["metc"]["cntinfo"]["cntorgp"]["cntorg"] = d["contact_organization"]    
    m["metainfo"]["metc"]["cntinfo"]["cntorgp"]["cntper"] = d["contact_person"]    
    m["metainfo"]["metc"]["cntinfo"]["cntpos"] = d["publish_entity"]    
    m["metainfo"]["metc"]["cntinfo"]["cntemail"] = d["contact_email"]    
    
    return new_metadata 


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 18)