In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
import xmltodict

from pydantic import BaseModel
from typing import List, Dict

import metadata_update_pro
import validation_pro 

# Folder (relative to the working directory) that holds the default metadata
# XML template; it is interpolated into the template path used below.
METADATA_FOLDER = "metadata_xml/"



In [2]:
# Path to the default metadata template, built from METADATA_FOLDER.
DEFAULT_XML = f"./{METADATA_FOLDER}default_pro.xml"
# Parse the template XML into a nested dict whose root key is "ns0:MD_Metadata".
default_template = metadata_update_pro.xml_to_json(DEFAULT_XML)

Loading XML as JSON from ./metadata_xml/default_pro.xml


Reminder: check out `uuidref`. If we keep lifting the template as-is, this part gets copied over too.
```
default2 = default_template["ns0:MD_Metadata"]
(default2["ns0:identificationInfo"]
 ["ns0:MD_DataIdentification"]["ns0:descriptiveKeywords"][0]["ns0:MD_Keywords"]["ns0:thesaurusName"]
)
```

In [3]:
# Path to this dataset's metadata XML. A plain string literal is used because
# the path has no placeholders to interpolate.
XML_FILE = "./ca_transit_routes.xml"
# Parse the dataset's XML into a nested dict; its root key "ns0:MD_Metadata"
# is what the lifting function defined below reads from.
esri_metadata = metadata_update_pro.xml_to_json(XML_FILE)

Loading XML as JSON from ./ca_transit_routes.xml


In [4]:
def lift_necessary_dataset_elements(metadata_json: dict) -> dict:
    """
    Pull the dataset-specific elements out of a parsed metadata document.

    Parameters
    ----------
    metadata_json : dict
        Parsed metadata whose root key is "ns0:MD_Metadata".

    Returns
    -------
    dict
        Maps each lifted element's namespaced key to the value found under
        the root, in this order: date stamp, spatial representation info,
        coordinate reference system info, distribution info. Values are
        passed through by reference, not copied.

    Raises
    ------
    KeyError
        If the root key or any of the four elements is missing.
    """
    # Every key of interest carries this namespace prefix
    prefix = "ns0:"

    root = metadata_json[f"{prefix}MD_Metadata"]

    # Elements that describe this particular dataset
    element_names = (
        "dateStamp",
        "spatialRepresentationInfo",
        "referenceSystemInfo",
        "distributionInfo",
    )

    return {
        f"{prefix}{name}": root[f"{prefix}{name}"]
        for name in element_names
    }

In [5]:
def overwrite_default_with_dataset_elements(metadata_json: dict) -> dict:
    """
    Load the default template and fill it with this dataset's elements.

    A fresh copy of the default template is read from
    ./{METADATA_FOLDER}default_pro.xml on every call. The elements returned
    by lift_necessary_dataset_elements then replace the matching entries
    under the template's "ns0:MD_Metadata" root.

    Parameters
    ----------
    metadata_json : dict
        Parsed metadata for the dataset, as accepted by
        lift_necessary_dataset_elements.

    Returns
    -------
    dict
        The whole default template (root key included), with the dataset's
        elements populated.
    """
    template_path = f"./{METADATA_FOLDER}default_pro.xml"
    template = metadata_update_pro.xml_to_json(template_path)
    template_root = template["ns0:MD_Metadata"]

    # Grab the necessary elements from the dataset
    dataset_elements = lift_necessary_dataset_elements(metadata_json)

    # Only keys the template already has are overwritten; a lifted key that
    # is absent from the template is not added.
    template_root.update(
        {
            key: value
            for key, value in dataset_elements.items()
            if key in template_root
        }
    )

    return template

In [6]:
# Merge: start from the default template and populate it with the elements
# lifted from this dataset's metadata.
esri2 = overwrite_default_with_dataset_elements(esri_metadata)

Loading XML as JSON from ./metadata_xml/default_pro.xml


In [None]:
# Inspect the citation's edition entry in the merged template.
(esri2["ns0:MD_Metadata"]["ns0:identificationInfo"]
 ["ns0:MD_DataIdentification"]["ns0:citation"]
 ["ns0:CI_Citation"]["ns0:edition"]
)


### Now find all instances where the values passed into the `BaseModel` are used

In [7]:
# Hand-entered metadata for the ca_transit_routes dataset. Every key is a
# valid identifier because the dict is later unpacked as keyword arguments.
ROUTES_DICT = dict(
    # Identification
    dataset_name="ca_transit_routes",
    publish_entity="California Integrated Travel Project",

    # Description
    abstract="Public. EPSG: 4326",
    purpose="my purpose",  # open question: also repeat this for the description?

    # Temporal / spatial extent
    # NOTE(review): beginning_date uses slashes while end_date uses dashes;
    # confirm whether the mixed formats are intentional.
    beginning_date="2022/09/14",
    end_date="2022-10-14",
    place="California",

    # Maintenance
    status="completed",
    frequency="monthly",

    # Tags
    theme_keywords=["1", "2", "3", "4", "5"],

    # Methodology and point of contact
    methodology="this is my methodology",
    contact_organization="Caltrans",
    contact_person="Tiffany Chu",
    contact_email="tiffany.chu@dot.ca.gov",

    # Accuracy
    horiz_accuracy="4 meters",
)


In [None]:
# Inspect the contact block of the merged template.
(esri2["ns0:MD_Metadata"]["ns0:contact"]
 ["ns0:CI_ResponsibleParty"]["ns0:contactInfo"]
 ["ns0:CI_Contact"]
)

In [8]:
# Pass the raw inputs through the module's value-fixing helper.
# NOTE(review): this rebinds ROUTES_DICT to the helper's return value, so
# re-running the cell feeds already-processed values back in. Confirm the
# helper is safe to apply twice, or bind the result to a new name.
ROUTES_DICT = metadata_update_pro.fix_values_in_validated_dict(ROUTES_DICT) 

In [9]:
# Build a metadata_input from the inputs (unpacked as keyword arguments) and
# convert the result back to a plain dict.
# NOTE(review): presumably metadata_input is a pydantic BaseModel; `.dict()`
# is the pydantic v1 spelling (v2 prefers `.model_dump()`). Confirm against
# the installed version.
DATASET_INFO_VALIDATED = metadata_update_pro.metadata_input(**ROUTES_DICT).dict()

In [10]:
# Write the validated dataset info into the merged template (esri2).
updated_meta = metadata_update_pro.overwrite_metadata_json(esri2, DATASET_INFO_VALIDATED)

In [11]:
# Display the full updated metadata to check the populated values by eye.
updated_meta

{'ns0:MD_Metadata': {'xmlns:ns0': 'http://www.isotc211.org/2005/gmd',
  'xmlns:ns1': 'http://www.isotc211.org/2005/gco',
  'xmlns:ns2': 'http://www.opengis.net/gml/3.2',
  'ns0:language': {'ns0:LanguageCode': {'codeList': 'http://www.loc.gov/standards/iso639-2/php/code_list.php',
    'codeListValue': 'eng',
    'codeSpace': 'ISO639-2',
    'text': 'eng'}},
  'ns0:characterSet': {'ns0:MD_CharacterSetCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_CharacterSetCode',
    'codeListValue': 'utf8',
    'codeSpace': 'ISOTC211/19115',
    'text': 'utf8'}},
  'ns0:hierarchyLevel': {'ns0:MD_ScopeCode': {'codeList': 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml#MD_ScopeCode',
    'codeListValue': 'dataset',
    'codeSpace': 'ISOTC211/19115',
    'text': 'dataset'}},
  'ns0:hierarchyLevelName': {'ns1:CharacterString': 'dataset'},
  'ns0:contact': {'ns0:CI_ResponsibleParty': {'ns0:individualName': {'ns1:CharacterString': 'Tiffany Chu'},
 