## 0 Setting

In [1]:
import os
import sys

# Locations of the project code and of the raw data.  The defaults are the
# original machine-specific paths; set ACDE_CODE_FOLDER / ACDE_DATA_FOLDER in
# the environment to run the notebook on another machine without editing it.
codefolder = os.environ.get(
    "ACDE_CODE_FOLDER",
    "C:/ProjectCollections/Programs/Australia_Cultural_Data_Engine/codes",
)
data_folder = os.environ.get(
    "ACDE_DATA_FOLDER",
    "D:/Program_Data/Australia_Cultural_Data_Engine_Data",
)

# The project packages (`acde`, `general`) are imported from `codefolder`, so
# it must be on sys.path before the imports below.  The membership check keeps
# sys.path free of duplicates when this cell is re-run in the same kernel.
if codefolder not in sys.path:
    sys.path.append(codefolder)

import pprint
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict

from acde import MongoDBManipulation as acde_manip
from general import GeneralFunctions as gen_gf
from general import JsonProcessing as gen_jp
from general import MongoDBManipulation as gen_manip
from tqdm import tqdm

pp = pprint.PrettyPrinter(indent=2)
# Project helper object; its `acde_db` attribute is indexed by collection name
# in the loading cell (presumably a MongoDB database handle -- confirm in
# acde.MongoDBManipulation).
acde_opr = acde_manip.ACDE_Manipulation()
acde_db = acde_opr.acde_db

## 1 Parse XML file

In [2]:
def parse_element(element):
    """Recursively turn an ElementTree element into a plain dictionary.

    The resulting dictionary is built in three steps:

    * if the element has non-empty text, the text is stored under the
      element's own tag name;
    * every child is parsed recursively and appended to a list stored under
      the child's tag name (so repeated children are all kept, in order);
    * every XML attribute is added under its attribute name, unless that key
      is already taken by one of the entries above.

    Args:
        element: an ``xml.etree.ElementTree.Element``.

    Returns:
        dict: the nested dictionary representation of ``element``.
    """
    parsed = {}

    own_text = element.text
    if own_text:
        parsed[element.tag] = own_text

    for sub_element in element:
        parsed.setdefault(sub_element.tag, []).append(parse_element(sub_element))

    # Attributes never overwrite text/children entries that share their name.
    for attr_name, attr_value in element.attrib.items():
        parsed.setdefault(attr_name, attr_value)

    return parsed


def parse_xml_file(xml_file):
    """Parse an XML file into a list of dictionaries, one per top-level record.

    Args:
        xml_file: path (or file object) accepted by ``ET.parse``.

    Returns:
        list: ``parse_element(child)`` for each direct child of the document
        root, in document order.
    """
    document_root = ET.parse(xml_file).getroot()
    return [parse_element(record) for record in document_root]


# Parse the bibliographic export of the Summerhayes collection; `results`
# holds one nested dictionary per record found under the XML root.
sfc_xml_path = os.path.join(
    data_folder,
    "summerhayes_family_collection",
    "BIBLIOGRAPHIC_Summerhayes Architecture Collection.xml",
)
results = parse_xml_file(sfc_xml_path)

In [3]:
# initialize the counter of MARC codes
marc_code_counter = Counter()

# Flatten nested parsed results.
#
# Two views are produced for every record:
#   * structural_results: leader entries + {"controlfield": {tag: text},
#                                           "datafield": [(marc_code, text), ...]}
#   * flatten_results:    leader entries + one key per control-field tag and
#                         per MARC code.  A code that occurs several times in
#                         one record keeps only its LAST value here; the full
#                         list stays available in structural_results.
#
# Both views start from *copies* of the parsed leader dict.  Binding both
# names to `result["leader"][0]` itself would make them one shared object
# (each view polluted with the other's keys) and would also mutate `results`,
# so re-running the cell would not start from clean input.
flatten_results = []
structural_results = []
for result in results:
    leader_info = result["leader"][0]

    structural_result = dict(leader_info)
    structural_result["controlfield"] = {}
    structural_result["datafield"] = []

    # .get(..., []) tolerates records that lack control or data fields.
    for cf in result.get("controlfield", []):
        structural_result["controlfield"][cf.get("tag")] = cf.get("controlfield")
        marc_code_counter.update([cf.get("tag")])

    for datafield in result.get("datafield", []):
        # Blank or missing indicators are written as '#'.
        ind1 = datafield.get("ind1", "")
        ind2 = datafield.get("ind2", "")
        ind1 = ind1 if ind1.strip() else "#"
        ind2 = ind2 if ind2.strip() else "#"
        for subfield in datafield.get("subfield", []):
            marc_code = f"{datafield['tag']}-{ind1}-{ind2}_${subfield['code']}"
            # A subfield without text yields None instead of raising KeyError.
            structural_result["datafield"].append(
                (marc_code, subfield.get("subfield"))
            )
            marc_code_counter.update([marc_code])

    flatten_result = dict(leader_info)
    flatten_result.update(structural_result["controlfield"])
    # dict.update accepts the list of (marc_code, text) pairs directly.
    flatten_result.update(structural_result["datafield"])

    flatten_results.append(flatten_result)
    structural_results.append(structural_result)

marc_code_counter.most_common()

[('035-#-#_$a', 480),
 ('852-#-#_$g', 440),
 ('852-#-#_$3', 440),
 ('852-#-#_$c', 440),
 ('852-#-#_$z', 440),
 ('852-#-#_$q', 439),
 ('942-#-#_$a', 243),
 ('900-#-#_$t', 242),
 ('900-#-#_$b', 242),
 ('008', 241),
 ('005', 241),
 ('001', 241),
 ('FMT', 240),
 ('910-#-#_$t', 238),
 ('700-1-#_$a', 236),
 ('506-#-#_$a', 233),
 ('300-#-#_$a', 232),
 ('245-0-0_$a', 231),
 ('920-#-#_$t', 231),
 ('541-#-#_$a', 228),
 ('599-#-#_$a', 225),
 ('787-#-#_$o', 209),
 ('260-#-#_$c', 201),
 ('092-4-#_$a', 161),
 ('930-#-#_$t', 161),
 ('500-#-#_$a', 72),
 ('092-3-#_$a', 71),
 ('595-#-#_$a', 66),
 ('520-2-#_$a', 15),
 ('260-#-#_$a', 13),
 ('856-#-#_$u', 6),
 ('092-2-#_$a', 6),
 ('245-0-2_$a', 5),
 ('590-#-#_$a', 4),
 ('092-1-#_$a', 2),
 ('245-1-0_$a', 2),
 ('545-0-#_$a', 2),
 ('245-0-3_$a', 1),
 ('245-0-#_$a', 1),
 ('710-2-#_$a', 1),
 ('100-1-#_$a', 1),
 ('245-0-4_$a', 1)]

## 2 Project Selected Attributes to ACDE

In [4]:
# Catalogue of the MARC codes observed in the collection.
# Each entry is (marc_code, description) or
# (marc_code, description, acde_attribute_path); only the three-element
# entries are projected into the ACDE schema.  The attribute path uses dots
# to denote nesting, e.g. "identifier_info.sys_control_number".
marc_definition = [
    ("leader", ""),
    ("035-#-#_$a", "System control number (NR)", "identifier_info.sys_control_number"),
    ("852-#-#_$g", "Non-coded location qualifier (R)"),
    ("852-#-#_$3", "Materials specified (NR)", "acquisition_info.material"),
    ("852-#-#_$c", "Shelving location (R)", "acquisition_info.shelving_loc"),
    ("852-#-#_$z", "Public note (R)", "acquisition_info.availability"),
    ("852-#-#_$q", "Piece physical condition (NR)", "acquisition_info.condition"),
    (
        "942-#-#_$a",
        "Source of classification or shelving scheme",
        "acquisition_info.shelving_schm",
    ),
    (
        "900-#-#_$t",
        "Curtin University Library/John Curtin Prime Ministerial Library Identifier",
        "identifier_info.local_lib_ident",
    ),
    ("900-#-#_$b", "Title of Collection.", "title_collection"),
    ("008", "General Information"),
    ("005", "Date and time of latest transaction"),
    ("001", 241),
    ("FMT", 240),
    ("910-#-#_$t", 'Creator - "Records of First name Surname".'),
    (
        "700-1-#_$a",
        'Creator Person - "Surname, First name"',
        "authoring_info.authors.name",
    ),
    ("506-#-#_$a", "Restrict. Note", "acquisition_info.restriction"),
    ("300-#-#_$a", "Physical Description", "physical_description"),
    ("245-0-0_$a", "Title_00", "title"),
    ("920-#-#_$t", "File - Same as Series title.", "title_series"),
    (
        "541-#-#_$a",
        "Acquired source - 'Surname, Initial'.",
        "source_info.publisher.name",
    ),
    ("599-#-#_$a", "Category", "types.primary_type"),
    ("787-#-#_$o", "Other item identifier", "identifier_info.other_item_ident"),
    ("260-#-#_$c", "Date of publication, distribution"),
    ("092-4-#_$a", "Item accession number", "identifier_info.item_accession_ident"),
    ("930-#-#_$t", "File - Same as File title.", "title_file"),
    ("500-#-#_$a", "General note", "note"),
    ("092-3-#_$a", "File Accession identifier", "identifier_info.file_accession_ident"),
    ("595-#-#_$a", "Has items", "acquisition_info.has_items"),
    ("520-2-#_$a", "Scope and content - Summary"),
    ("260-#-#_$a", "Place of publication, distribution"),
    ("856-#-#_$u", "Uniform Resource Identifier (R)"),
    ("092-2-#_$a", "Series accession number", "identifier_info.series_accession_ident"),
    ("245-0-2_$a", "Title_02", "title"),
    ("590-#-#_$a", "Internal note", "note_internal"),
    (
        "092-1-#_$a",
        "Creator accession number",
        "identifier_info.creator_accession_ident",
    ),
    ("245-1-0_$a", "Title_10", "title"),
    ("545-0-#_$a", "Bio/History note", "note_history"),
    ("245-0-3_$a", "Title_03", "title"),
    ("245-0-#_$a", "Title_0#", "title"),
    (
        "710-2-#_$a",
        "Name in direct order - Corporate name or jurisdiction name as entry element (NR)",
    ),
    ("100-1-#_$a", 'Creator-Person - "Surname, First name."'),
    ("245-0-4_$a", "Title_04", "title"),
]

# MARC code -> ACDE attribute path, in definition order, restricted to the
# entries that declare a target attribute (i.e. have a third element).
marc_projection = {
    definition[0]: definition[2]
    for definition in marc_definition
    if len(definition) == 3
}

In [5]:
curr_lvl = "resource"

# Accession-identifier attribute -> level of the record in the original
# collection hierarchy.  Records without any of these are tagged "collection".
ACCESSION_KEY_TO_CLASS = {
    "item_accession_ident": "item",
    "file_accession_ident": "file",
    "series_accession_ident": "series",
    "creator_accession_ident": "creator",
}


def project_to_acde(flatten_result, projection):
    """Build the ACDE document for one flattened MARC record.

    Args:
        flatten_result: dict mapping MARC codes to their values.
        projection: dict mapping MARC codes to dotted ACDE attribute paths
            (e.g. ``"identifier_info.sys_control_number"``).

    Returns:
        dict: nested document holding every projected attribute whose source
        value is truthy, plus ``data_source="SFC"``, ``_class="resource"`` and
        ``_class_ori`` (derived from the accession identifier that is present,
        ``"collection"`` when none is).
    """
    proj_result = {}
    for ori_attr_n, acde_attr_k in projection.items():
        acde_attr_v = flatten_result.get(ori_attr_n)
        if not acde_attr_v:
            continue
        *parent_keys, leaf_key = acde_attr_k.split(".")
        # Walk/create the intermediate dicts.  setdefault merges into an
        # existing parent instead of resetting it, so two attributes sharing a
        # parent path can coexist, and paths of any depth are supported.
        node = proj_result
        for key in parent_keys:
            node = node.setdefault(key, {})
        node[leaf_key] = acde_attr_v
        # Only two-level paths carry the accession identifiers; when several
        # are present the last one in projection order wins.
        if len(parent_keys) == 1 and leaf_key in ACCESSION_KEY_TO_CLASS:
            proj_result["_class_ori"] = ACCESSION_KEY_TO_CLASS[leaf_key]
    proj_result.update({"data_source": "SFC", "_class": "resource"})
    proj_result.setdefault("_class_ori", "collection")
    return proj_result


# Remove the documents of a previous load so that re-running this cell does
# not create duplicates.
acde_db[curr_lvl].delete_many({"data_source": "SFC"})

for flatten_result in tqdm(
    flatten_results, desc="Loading SFC resource data to ACDE database..."
):
    proj_result = project_to_acde(flatten_result, marc_projection)
    try:
        # insert new record
        acde_db[curr_lvl].insert_one(proj_result)
    except Exception as e:
        # Best effort: report the failing record and carry on with the rest.
        print(e)
        print(flatten_result)

Loading SFC resource data to ACDE database...: 100%|███████████████████████████████| 241/241 [00:00<00:00, 1695.04it/s]
