# Morphe Metadata Graph - ACM Datasets

We want to load the fetched metadata from the ACM dataset, based on the author bios.

For now, let's stick with people. We will use the following `cpfRelationType` values:

    - "family" : This person is a family member to the current architect
    - "associative" : This entity or person is related to the current architect. Usually a collaboration!

Unused `cpfRelationType` values:

    - "subjectOf": somehow involved in this project
    - "creatorOf": somehow created this project.

In [5]:
import xml.etree.ElementTree as ET

# Register the namespace
namespaces = {
    'eac': 'urn:isbn:1-931666-33-4',
    'xlink': 'http://www.w3.org/1999/xlink'
}

# Parse the XML file or string
tree = ET.parse('Data/ACM/bios/alphonse-laverriere-bio.xml')  # or use ET.fromstring(xml_string) if you have it as a string
root = tree.getroot()

# Example: Get recordId
record_id = root.find('.//eac:recordId', namespaces)
print("Record ID:", record_id.text)

# Example: Get entity name
name_part = root.find('.//eac:nameEntry/eac:part', namespaces)
print("Name:", name_part.text)

# Example: Get the biographical history text
biog_hist = root.find('.//eac:biogHist/eac:p', namespaces)
print("Bio:", biog_hist.text.strip())

# Example: Get all related persons/entities
relations = root.findall('.//eac:cpfRelation', namespaces)
for rel in relations:
    name = rel.find('eac:relationEntry', namespaces)
    href = rel.attrib.get(f'{{{namespaces["xlink"]}}}href')
    print(f"Related to: {name.text}, link: {href}")


Record ID: Acm - Alphonse Laverrière
Name: Alphonse Laverrière
Bio: Architecte ENSBA (Atelier Pascal) FAS, fondateur de l’Œuvre, directeur de l’École cantonale de dessin à Lausanne, professeur de théorie de l’architecture au Polytechnicum de Zurich. L'activité variée qu'il a à Lausanne, à Genève et en Haute-Savoie lui permet d’afficher une carrière au cours de laquelle les programmes les plus prestigieux lui sont offerts: à Lausanne: Pont Chauderon, Gare CFF, Hôtel de la Paix, Librairie et éditions Payot, Siège de la Banque fédérale SA, Tribunal fédéral. Laverrière est remarqué pour son aptitude à gagner des concours, c’est ainsi qu’il réalise le Monument international de la réformation à Genève.
Membre FAS dès 1916.
Related to: Ferdinand Jacques Meyrat, link: https://morphe.epfl.ch/index.php/ferdinand-jacques-meyrat
Related to: Jean Taillens, link: https://morphe.epfl.ch/index.php/jean-taillens-et-charles-dubois
Related to: L'Œuvre (OEV), link: https://morphe.epfl.ch/index.php/loeuvre

chatgpt attempt

In [None]:
import os
import requests
import xml.etree.ElementTree as ET
import networkx as nx

# Register namespaces for eac-cpf
NS = {
    'eac': 'urn:isbn:1-931666-33-4',
    'xlink': 'http://www.w3.org/1999/xlink'
}


def parse_eac_cpf(xml_content):
    """
    Parse the XML content (string) of an EAC-CPF record.
    Returns:
      person_name (str): The main person's name in <nameEntry><part>
      relations (list of tuples): Each tuple is (relation_type, related_name, related_url)
                                  e.g. ('family', 'John Smith', 'https://morphe.epfl.ch/...')
    """
    root = ET.fromstring(xml_content)

    # Extract the main person's name
    name_el = root.find('.//eac:nameEntry/eac:part', NS)
    person_name = name_el.text.strip() if name_el is not None else "Unknown"
    print(f"Adding {person_name}'s relations.")

    relations = []
    # Find all <cpfRelation> elements
    cpf_relations = root.findall('.//eac:cpfRelation', NS)
    for rel in cpf_relations:
        rel_type = rel.attrib.get('cpfRelationType', '').strip()
        # For demonstration, let's fetch all. If you only want certain types, adapt the condition.
        if rel_type in ["family", "associative"] or True:
            rel_name_el = rel.find('eac:relationEntry', NS)
            print(f"    Adding person {rel_name_el} of rel_type {rel_type}.")
            rel_name = rel_name_el.text.strip() if rel_name_el is not None else "Unknown"
            xlink_href = rel.attrib.get(f'{{{NS["xlink"]}}}href', '')
            relations.append((rel_type, rel_name, xlink_href))

    return person_name, relations

def build_eac_url(href):
    """
    Given a base href like 'https://morphe.epfl.ch/index.php/paul-lavenex',
    build the URL needed to fetch the EAC-CPF XML by appending
    ';eac?sf_format=xml'.
    """
    return href + ";eac?sf_format=xml"

def add_person_to_graph(person_name):
    """
    Add a person node to the graph if it doesn't already exist.
    """
    if not graph.has_node(person_name):
        graph.add_node(person_name)

def add_relation_edge(source_name, target_name, relation_type):
    """
    Add an edge from source to target in the graph, labeled by relation_type,
    only if it doesn't already exist.
    """
    if not graph.has_edge(source_name, target_name):
        graph.add_edge(source_name, target_name, relation=relation_type)

def save_fetched_file(xml_url, content, directory):
    """
    Saves the fetched XML content to the given directory, using a filename derived from the URL.
    """
    # Make sure directory exists
    os.makedirs(directory, exist_ok=True)

    # Derive a safe filename from the URL
    filename = href_to_filename(xml_url)
    path = os.path.join(directory, filename)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"File saved: {path}")

def href_to_filename(url):
    """
    Convert the URL into a safe filename.
    Removes 'https://', '/', '?' and so forth, turning them into underscores.
    """
    import re
    base = re.sub(r'[^A-Za-z0-9._-]+', '_', url)
    return base + ".xml"

def process_record_from_url(xml_url, save_directory):
    """
    Recursively parse the EAC-CPF record from a URL, add the node/edges
    to the graph, then follow its relations. Avoid re-processing if visited.
    """
    # If we've already processed this link, return
    if xml_url in visited_links:
        return
    visited_links.add(xml_url)

    print(f"Fetching {xml_url}")
    resp = requests.get(xml_url)
    if resp.status_code != 200:
        print(f"WARNING: Could not fetch {xml_url} (status {resp.status_code})")
        return

    content = resp.text

    # Save the fetched XML to the directory
    save_fetched_file(xml_url, content, save_directory)

    # Parse the newly fetched XML
    person_name, relations = parse_eac_cpf(content)

    # Add the main person as a node
    add_person_to_graph(person_name)

    # For each relation, add the node and edge, then recurse
    for rel_type, rel_name, rel_href in relations:
        add_person_to_graph(rel_name)
        add_relation_edge(person_name, rel_name, rel_type)
        
        # If rel_href is non-empty, fetch it
        if rel_href:
            new_url = build_eac_url(rel_href)
            process_record_from_url(new_url, save_directory)

def process_local_file(filepath, save_directory):
    """
    Parse one local EAC-CPF XML file, add its person node,
    and follow its remote links recursively.
    """
    print(f"Processing local file: {filepath}")
    with open(filepath, "r", encoding="utf-8") as f:
        xml_content = f.read()

    # Parse
    person_name, relations = parse_eac_cpf(xml_content)
    add_person_to_graph(person_name)

    # For each relation, add node+edge, then fetch that link
    for rel_type, rel_name, rel_href in relations:
        add_person_to_graph(rel_name)
        add_relation_edge(person_name, rel_name, rel_type)
        if rel_href:
            new_url = build_eac_url(rel_href)
            process_record_from_url(new_url, save_directory)

def build_full_graph_from_directory(directory_path, save_directory):
    """
    Iterate over all XML files in the given directory. For each:
      - Parse the local file
      - Add its node & edges
      - Recursively fetch remote links and parse them
      - Save the fetched remote files to `save_directory`
    """
    for filename in os.listdir(directory_path):
        if filename.lower().endswith(".xml"):
            file_path = os.path.join(directory_path, filename)
            process_local_file(file_path, save_directory)

    print("Done building graph.")

In [17]:
    
# Create a global graph
graph = nx.Graph()

# To avoid re-fetching and re-processing the same links multiple times
visited_links = set()
    
input_directory = "Data/ACM/bios"       # your local directory of EAC-CPF files
fetched_save_dir = "Data/ACM/fetched_bios"   # directory where remote EAC-CPF files are stored

build_full_graph_from_directory(input_directory, fetched_save_dir)

print("\nFinal graph nodes:")
print(graph.nodes())

print("\nFinal graph edges:")
for u, v, data in graph.edges(data=True):
    print(f"  {u} --({data['relation']})--> {v}")

# For example, you can write it to a GraphML
nx.write_graphml(graph, "people_relations.graphml")
nx.write_gexf(graph, "people_relations.gexf")


Processing local file: Data/ACM/bios/jacques-de-freudenreich-bio.xml
Adding Jacques de Freudenreich's relations.
Processing local file: Data/ACM/bios/casimir-reymond-bio.xml
Adding Casimir Reymond's relations.
Processing local file: Data/ACM/bios/alexandre-antipas-bio.xml
Adding Alexandre Antipas's relations.
Processing local file: Data/ACM/bios/heidi-wenger-bio.xml
Adding Heidi Wenger's relations.
    Adding person <Element '{urn:isbn:1-931666-33-4}relationEntry' at 0x7fb0fae00bd0> of rel_type family.
Fetching https://morphe.epfl.ch/index.php/peter-wenger;eac?sf_format=xml
File saved: Data/ACM/fetched_bios/https_morphe.epfl.ch_index.php_peter-wenger_eac_sf_format_xml.xml
Adding Peter Wenger's relations.
    Adding person <Element '{urn:isbn:1-931666-33-4}relationEntry' at 0x7fb0fae01120> of rel_type family.
Fetching https://morphe.epfl.ch/index.php/heidi-wenger;eac?sf_format=xml
File saved: Data/ACM/fetched_bios/https_morphe.epfl.ch_index.php_heidi-wenger_eac_sf_format_xml.xml
Adding 

In [16]:
nx.degree_centrality(graph)

{'Jacques de Freudenreich': 0.0,
 'Casimir Reymond': 0.0,
 'Alexandre Antipas': 0.0,
 'Heidi Wenger': 0.006024096385542169,
 'Peter Wenger': 0.006024096385542169,
 'Charles-François Thévenaz': 0.012048192771084338,
 'Charles-Philippe Thévenaz': 0.012048192771084338,
 'Charles Thévenaz': 0.012048192771084338,
 'Jacques Nobile': 0.0,
 'Vincent Mangeat': 0.012048192771084338,
 'Pierre Foretay': 0.006024096385542169,
 'Fédération des Architectes Suisses (FAS) = Bund Schweizer Architekten (BSA)': 0.18072289156626506,
 'Renato Salvi': 0.01807228915662651,
 'Flora Ruchat-Roncati': 0.012048192771084338,
 'Communauté de Travail la Transjurane': 0.012048192771084338,
 'Georges Brera': 0.01807228915662651,
 'Paul Waltenspühl': 0.012048192771084338,
 'Jean-Marie Ellenberger': 0.024096385542168676,
 'Jean-Jacques Gerber': 0.006024096385542169,
 'Bernard Mocellin': 0.006024096385542169,
 'André Leman': 0.006024096385542169,
 'Max Schlup': 0.006024096385542169,
 'Jakob Zweifel': 0.024096385542168676,