In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load and parse the XML file
file_path = 'E:/DH/internship/dataset/combined.xml'  # Update this path to your XML file location
tree = ET.parse(file_path)
root = tree.getroot()

# Define mappings for entity types
entity_type_mapping = {
    "AE": "archive",
    "EV": "event",
    "OB": "building",
    "OR": "organization",
    "PS": "person"
}

# Initialize lists for nodes and edges
nodes = []
edges = []

def refined_append_node(id, name, entity_type):
    if not any(node['id'] == id for node in nodes):
        nodes.append({"id": id, "name": name, "type": entity_type})

def refined_append_edge(source_id, target_id, relationship_type):
    edges.append({"source": source_id, "target": target_id, "relationship": relationship_type})

def process_relationships(entity_id, relation_section):
    for relation in relation_section:
        rel_entity_type = relation.find('ENTITEIT').text if relation.find('ENTITEIT') is not None else "Unknown Rel Type"
        rel_entity_id = relation.find('PS_ID_RELATIE').text if relation.find('PS_ID_RELATIE') is not None else "Unknown Rel ID"
        rel_entity_name = relation.find('PS_RELATIE').text if relation.find('PS_RELATIE') is not None else "Unknown Rel Name"
        rel_type = relation.find('PS_RELATIE_TYPE').text if relation.find('PS_RELATIE_TYPE') is not None else "Unknown Relationship"
        
        # Append related entity as node
        refined_append_node(rel_entity_id, rel_entity_name, rel_entity_type.lower())
        
        # Append relationship as edge
        refined_append_edge(entity_id, rel_entity_id, rel_type)

# Main logic to extract nodes and edges
for row in root.findall('.//ROW'):
    entity_id = row.find('ID').text if row.find('ID') is not None else "Unknown ID"
    entity_type = entity_type_mapping.get(row.find('RUBRIEK').text, "unknown") if row.find('RUBRIEK') is not None else "Unknown Type"
    entity_description = row.find('OMSCHRIJVING').text if row.find('OMSCHRIJVING') is not None else "No Description"
    
    # Append main entity as node
    refined_append_node(entity_id, entity_description, entity_type)
    
    # Iterate over potential relationship sections
    for relation_section in row:
        if '_RL_' in relation_section.tag:
            process_relationships(entity_id, row.findall(f'.//{relation_section.tag}'))

# Convert lists to DataFrames
nodes_df = pd.DataFrame(nodes).drop_duplicates()
edges_df = pd.DataFrame(edges).drop_duplicates()

# Optionally, save to CSV or another format for further analysis or visualization
nodes_df.to_csv('nodes.csv', index=False)
edges_df.to_csv('edges.csv', index=False)

print("Nodes and edges have been extracted and saved.")


Nodes and edges have been extracted and saved.


In [8]:
nodes_df.head()

Unnamed: 0,id,name,type
0,10203,bestand / archief: Archief van de parochie Onz...,archive
1,258,bestand / archief: Archief Jean-Luc Dehaene. 1...,archive
2,2502,Herdenking van het eeuwfeest van de Boerenkrij...,event
3,1318,"Sacramentsprocessie, Eksaarde",event
4,2195,Vijfde Katholiek Congres van Mechelen (23-29 s...,event


In [3]:
edges_df.head()

In [5]:
nodes_df.shape

(19, 3)

In [4]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load and parse the XML file
file_path = 'E:/DH/internship/dataset/combined.xml'  # Adjust this to your actual file path
tree = ET.parse(file_path)
root = tree.getroot()

# Initialize lists for nodes and edges
nodes = []
edges = []

# Define mappings for entity types
entity_type_mapping = {
    "AE": "archive",
    "EV": "event",
    "OB": "building",
    "OR": "organization",
    "PS": "person"
}

# Function to add a node if it's not already present
def add_node(node_id, node_name, node_type):
    if not any(node['id'] == node_id for node in nodes):
        nodes.append({"id": node_id, "name": node_name, "type": node_type})

# Function to add an edge
def add_edge(source, target, relationship):
    edges.append({"source": source, "target": target, "relationship": relationship})

# Function to safely get text from an element
def safe_get_text(element, tag):
    child = element.find(tag)
    return child.text if child is not None else None

# Extract entities and their relationships
for row in root.findall('.//ROW'):
    # Extract main entity information
    main_entity_id = safe_get_text(row, 'ID')
    main_entity_type = entity_type_mapping.get(safe_get_text(row, 'RUBRIEK'), 'unknown')  # Use mapping
    main_entity_description = safe_get_text(row, './/OMSCHRIJVING') or 'No Description'
    
    # Add main entity as a node
    add_node(main_entity_id, main_entity_description, main_entity_type)

    # Find all relationships for the main entity
    for relationship_tag in ['EV_RL_EV_OR_ROW', 'EV_RL_EV_EV_ROW', 'EV_RL_EV_OB_ROW', 'EV_RL_EV_PS_ROW', 'EV_RL_EV_AE_ROW', 'AE_RL_AE_OR_ROW', 'AE_RL_AE_PS_ROW', 'AE_RL_AE_EV_ROW', 'AE_RL_AE_OB_ROW', 'AE_RL_AE_AE_ROW', 'PS_RL_PS_PS_ROW', 'PS_RL_PS_OR_ROW', 'PS_RL_PS_OB_ROW', 'PS_RL_PS_EV_ROW', 'PS_RL_PS_AE_ROW', 'OB_RL_OB_OB_ROW', 'OB_RL_OB_PS_ROW', 'OB_RL_OB_EV_ROW', 'OB_RL_OB_AE_ROW', 'OB_RL_OB_OR_ROW', 'OR_RL_OR_OR_ROW', 'OR_RL_OR_PS_ROW', 'OR_RL_OR_EV_ROW', 'OR_RL_OR_AE_ROW', 'OR_RL_OR_OB_ROW']:
        for rel_row in row.findall(f'.//{relationship_tag}'):
            rel_entity_id = safe_get_text(rel_row, 'ID_RELATIE')
            rel_entity_description = safe_get_text(rel_row, 'RELATIE')
            # Determine the correct type for related entities
            rel_entity_type_abbreviation = relationship_tag.split('_')[2][:2]  # Get the type abbreviation from the tag
            rel_entity_type = entity_type_mapping.get(rel_entity_type_abbreviation, 'unknown')

            # Add related entity as a node with corrected type
            add_node(rel_entity_id, rel_entity_description, rel_entity_type)

            # Add the relationship as an edge
            add_edge(main_entity_id, rel_entity_id, relationship_tag)

# Convert nodes and edges to DataFrames
nodes_df = pd.DataFrame(nodes)
edges_df = pd.DataFrame(edges)

# Update the paths below to where you want to save the CSV files
nodes_df.to_csv('nodes_corrected.csv', index=False)
edges_df.to_csv('edges_corrected.csv', index=False)

# Display the heads and shapes of the dataframes for verification
print(nodes_df.head(), edges_df.head(), nodes_df.shape, edges_df.shape)


      id                                               name     type
0  10203  bestand / archief: Archief van de parochie Onz...  archive
1   None                                               None  archive
2    258  bestand / archief: Archief Jean-Luc Dehaene. 1...  archive
3   2502  Herdenking van het eeuwfeest van de Boerenkrij...    event
4   1318                      Sacramentsprocessie, Eksaarde    event   source target     relationship
0  10203   None  AE_RL_AE_OR_ROW
1  10203   None  AE_RL_AE_PS_ROW
2  10203   None  AE_RL_AE_PS_ROW
3    258   None  AE_RL_AE_AE_ROW
4    258   None  AE_RL_AE_AE_ROW (20, 3) (436, 3)


In [23]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load and parse the XML file
file_path = 'E:/DH/internship/dataset/combined.xml'  # Replace with your actual file path
tree = ET.parse(file_path)
root = tree.getroot()

# Initialize lists for nodes and edges
nodes = []
edges = []

# Define mappings for entity types
entity_type_mapping = {
    "AE": "archive",
    "EV": "event",
    "OB": "building",
    "OR": "organization",
    "PS": "person",
    "PB": "publication",
    "FM": "families",
    "ORG": "organization",
    "PERS": "person"
}

# Function to add a node if it's not already present
def add_node(node_id, node_name, node_type, from_date, utill_date, dead_year):
    if not any(node['ID'] == node_id for node in nodes):
        nodes.append({"ID": node_id, "Label": node_name, "Type": node_type, "Van Datum": from_date, "Tot Datum": utill_date, "Dead Year": dead_year})

# Function to add an edge
def add_edge(source, target, relationship, start_date, end_date):
    if not any((edge['Source'] == source and edge['Target'] == target) for edge in edges):
        edges.append({"Source": source, "Target": target, "Relationship": relationship, "Van Datum": start_date, "Tot Datum": end_date})

# Function to safely get text from an element
def safe_get_text(element, tag):
    child = element.find(tag)
    return child.text if child is not None else None

# Extract entities and their relationships
for row in root.findall('.//ROW'):
    # Extract main entity information
    main_entity_id = safe_get_text(row, 'ID')
    main_entity_type = entity_type_mapping.get(safe_get_text(row, 'RUBRIEK'))
    main_entity_description = safe_get_text(row, './/OMSCHRIJVING') or 'No Description'
    main_entity_start_year = safe_get_text(row, 'SYST_VAN_DATUM')
    main_entity_end_year = safe_get_text(row, 'SYST_TOT_DATUM')
    main_dead_year = ''
        


    # Add main entity as a node
    add_node(main_entity_id, main_entity_description, main_entity_type,main_entity_start_year, main_entity_end_year, main_dead_year)

    # Find all relationships for the main entity
    for relationship_tag in ['AE_ARCHIEFVORMER_ROW', 'EV_RL_EV_OR_ROW', 'EV_RL_EV_EV_ROW', 'EV_RL_EV_OB_ROW', 'EV_RL_EV_PS_ROW', 'EV_RL_EV_AE_ROW', 'EV_RL_EV_PB_ROW', 'EV_RL_EV_FM_ROW', 'AE_RL_AE_OR_ROW', 'AE_RL_AE_PS_ROW', 'AE_RL_AE_EV_ROW', 'AE_RL_AE_OB_ROW', 'AE_RL_AE_AE_ROW', 'AE_RL_AE_PB_ROW', 'AE_RL_AE_FM_ROW', 'PS_RL_PS_PS_ROW','PS_RL_PS_OR_ROW', 'PS_RL_PS_PB_ROW','PS_RL_PS_OB_ROW', 'PS_RL_PS_EV_ROW', 'PS_RL_PS_AE_ROW', 'PS_RL_PS_FM_ROW', 'OB_RL_OB_OB_ROW', 'OB_RL_OB_PS_ROW', 'OB_RL_OB_EV_ROW', 'OB_RL_OB_AE_ROW', 'OB_RL_OB_OR_ROW', 'OB_RL_OB_PB_ROW', 'OB_RL_OB_FM_ROW', 'OR_RL_OR_OR_ROW', 'OR_RL_OR_PS_ROW', 'OR_RL_OR_EV_ROW', 'OR_RL_OR_AE_ROW', 'OR_RL_OR_OB_ROW', 'OR_RL_OR_PB_ROW', 'OR_RL_OR_FM_ROW']:
        for rel_row in row.findall(f'.//{relationship_tag}'):
            # Extract related entity information
            rel_entity_id = safe_get_text(rel_row, 'ID') or safe_get_text(rel_row, 'EV_ID_RELATIE') or safe_get_text(rel_row, 'ORG_ID_RELATIE') or safe_get_text(rel_row, 'OR_ID_RELATIE') or safe_get_text(rel_row, 'OB_ID_RELATIE') or safe_get_text(rel_row, 'PS_ID_RELATIE') or safe_get_text(rel_row, 'PB_ID_RELATIE') or safe_get_text(rel_row, 'FM_ID_RELATIE') or safe_get_text(rel_row, 'AE_ID_RELATIE')
            rel_entity_description = safe_get_text(rel_row, 'OMSCHRIJVING') or safe_get_text(rel_row, 'EV_RELATIE') or safe_get_text(rel_row, 'PS_RELATIE') or safe_get_text(rel_row, 'ORG_RELATIE') or safe_get_text(rel_row, 'OR_RELATIE') or safe_get_text(rel_row, 'OB_RELATIE') or safe_get_text(rel_row, 'PB_RELATIE') or safe_get_text(rel_row, 'FM_RELATIE') or safe_get_text(rel_row, 'AE_RELATIE')
            relation_type = safe_get_text(rel_row, 'RELATIETYPE') or safe_get_text(rel_row, 'PS_RELATIE_TYPE') or safe_get_text(rel_row, 'ORG_RELATIE_TYPE')
            start_date = safe_get_text(rel_row, 'VAN_DATUM')
            end_date = safe_get_text(rel_row, 'TOT_DATUM')
            dead_year = safe_get_text(rel_row, 'DEAD_YR')
            from_date = ''
            utill_date = ''
            if len(relationship_tag.split('_')) > 3 :
                rel_entity_type = relationship_tag.split('_')[3]  # Extract entity type from the tag
            else:
                rel_entity_type = safe_get_text(rel_row, 'RUBRIEK_ID')
            rel_entity_type = entity_type_mapping.get(rel_entity_type)  # Use mapping

            # Add related entity as a node
            add_node(rel_entity_id, rel_entity_description, rel_entity_type, from_date, utill_date, dead_year)
            main_entity_type = entity_type_mapping.get(rel_entity_type)  # Use mapping

            # Add the relationship as an edge
            add_edge(main_entity_id, rel_entity_id, relation_type, start_date, end_date)

# Convert nodes and edges to DataFrames
nodes_df = pd.DataFrame(nodes)
edges_df = pd.DataFrame(edges)

# Save the nodes and edges to CSV files
nodes_df.to_csv('E:/DH/internship/dataset/new_nodes.csv', index=False)
edges_df.to_csv('E:/DH/internship/dataset/new_edges.csv', index=False)

# Display the heads and shapes of the dataframes
nodes_df.head(), edges_df.head(), nodes_df.shape, edges_df.shape

(      ID                                              Label          Type  \
 0  10203  bestand / archief: Archief van de parochie Onz...       archive   
 1  20084  Parochie Onze-Lieve-Vrouw Hemelvaart, Eksaarde...  organization   
 2  26175                                  KSA-bond Eksaarde  organization   
 3  47772                    Dierick, Clementine (1890-1946)        person   
 4  25890                      Druwé, Franciscus (1832-1905)        person   
 
   Van Datum Tot Datum Dead Year  
 0      None      None            
 1                          None  
 2                          None  
 3                          1946  
 4                          1905  ,
   Source Target           Relationship Van Datum Tot Datum
 0  10203  20084                   None      None      None
 1  10203  26175  bevat informatie over      None      None
 2  10203  47772  bevat informatie over                    
 3  10203  25890  bevat informatie over                    
 4    258  13723   

In [28]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load and parse the XML file
file_path = 'E:/DH/internship/dataset/combined.xml'  # Replace with your actual file path
tree = ET.parse(file_path)
root = tree.getroot()

# Initialize lists for nodes and edges
nodes = []
edges = []

# Define mappings for entity types
entity_type_mapping = {
    "AE": "Archief",
    "EV": "Gebeurtenis",
    "OB": "Gebouw",
    "OR": "Organisatie",
    "PS": "Persoon",
    "PB": "Publicatie",
    "FM": "Familie",
}

# Function to add a node if it's not already present
def add_node(node_id, node_name, node_type, start, end, link):
    if not any(node['ID'] == node_id for node in nodes):
        nodes.append({"ID": node_id, "Label": node_name, "Soort": node_type, "Van Datum": start, "Tot Datum": end, "Links": link})

# Function to add an edge
def add_edge(source, target, relationship, start_date, end_date):
    if not any((edge['Source'] == source and edge['Target'] == target) or (edge['Source'] == target and edge['Target'] == source) for edge in edges):
        edges.append({"Source": source, "Target": target, "Soort relatie": relation_type, "Van Datum": start_date, "Tot Datum": end_date})

# Function to safely get text from an element
def safe_get_text(element, tag):
    child = element.find(tag)
    return child.text if child is not None else None

# Extract entities and their relationships
for row in root.findall('.//ROW'):
    # Extract main entity information
    main_entity_id = safe_get_text(row, 'ID')
    main_entity_type = entity_type_mapping.get(safe_get_text(row, 'RUBRIEK'))
    main_entity_description = safe_get_text(row, './/OMSCHRIJVING') or 'No Description'
    main_entity_link = 'http://www.odis.be/lnk/' + safe_get_text(row, 'RUBRIEK') + '_' + main_entity_id

    start_date, end_date = None, None
    
    # Directly find 'SYST_PERIODE_ROW' within each 'ROW' to extract dates
    for period in row.findall('.//SYST_PERIODE_ROW'):
        start_date = safe_get_text(period, 'SYST_VAN_DATUM')
        if safe_get_text(period, 'SYST_TOT_DATUM')!= '9999':
            end_date = safe_get_text(period, 'SYST_TOT_DATUM')


    # Add main entity as a node
    add_node(main_entity_id, main_entity_description, main_entity_type, start_date, end_date, main_entity_link)

    # Find all relationships for the main entity
    for relationship_tag in ['AE_ARCHIEFVORMER_ROW', 'EV_RL_EV_OR_ROW', 'EV_RL_EV_EV_ROW', 'EV_RL_EV_OB_ROW', 'EV_RL_EV_PS_ROW', 'EV_RL_EV_AE_ROW', 'EV_RL_EV_PB_ROW', 'EV_RL_EV_FM_ROW', 'AE_RL_AE_OR_ROW', 'AE_RL_AE_PS_ROW', 'AE_RL_AE_EV_ROW', 'AE_RL_AE_OB_ROW', 'AE_RL_AE_AE_ROW', 'AE_RL_AE_PB_ROW', 'AE_RL_AE_FM_ROW', 'PS_RL_PS_PS_ROW','PS_RL_PS_OR_ROW', 'PS_RL_PS_PB_ROW','PS_RL_PS_OB_ROW', 'PS_RL_PS_EV_ROW', 'PS_RL_PS_AE_ROW', 'PS_RL_PS_FM_ROW', 'OB_RL_OB_OB_ROW', 'OB_RL_OB_PS_ROW', 'OB_RL_OB_EV_ROW', 'OB_RL_OB_AE_ROW', 'OB_RL_OB_OR_ROW', 'OB_RL_OB_PB_ROW', 'OB_RL_OB_FM_ROW', 'OR_RL_OR_OR_ROW', 'OR_RL_OR_PS_ROW', 'OR_RL_OR_EV_ROW', 'OR_RL_OR_AE_ROW', 'OR_RL_OR_OB_ROW', 'OR_RL_OR_PB_ROW', 'OR_RL_OR_FM_ROW']:
        for rel_row in row.findall(f'.//{relationship_tag}'):
            # Extract related entity information
            rel_entity_id = safe_get_text(rel_row, 'ID') or safe_get_text(rel_row, 'EV_ID_RELATIE') or safe_get_text(rel_row, 'ORG_ID_RELATIE') or safe_get_text(rel_row, 'OR_ID_RELATIE') or safe_get_text(rel_row, 'OB_ID_RELATIE') or safe_get_text(rel_row, 'PS_ID_RELATIE') or safe_get_text(rel_row, 'PB_ID_RELATIE') or safe_get_text(rel_row, 'FM_ID_RELATIE') or safe_get_text(rel_row, 'AE_ID_RELATIE')
            rel_entity_description = safe_get_text(rel_row, 'OMSCHRIJVING') or safe_get_text(rel_row, 'EV_RELATIE') or safe_get_text(rel_row, 'PS_RELATIE') or safe_get_text(rel_row, 'ORG_RELATIE') or safe_get_text(rel_row, 'OR_RELATIE') or safe_get_text(rel_row, 'OB_RELATIE') or safe_get_text(rel_row, 'PB_RELATIE') or safe_get_text(rel_row, 'FM_RELATIE') or safe_get_text(rel_row, 'AE_RELATIE')
            relation_type = safe_get_text(rel_row, 'RELATIETYPE') or safe_get_text(rel_row, 'PS_RELATIE_TYPE') or safe_get_text(rel_row, 'ORG_RELATIE_TYPE')
            start_date = safe_get_text(rel_row, 'VAN_DATUM')
            if safe_get_text(rel_row, 'TOT_DATUM') != '9999':
                end_date = safe_get_text(rel_row, 'TOT_DATUM')
            start = ''
            if safe_get_text(rel_row, 'DEAD_YR') != '0':
                end = safe_get_text(rel_row, 'DEAD_YR')
            
            if len(relationship_tag.split('_')) > 3 :
                entity_type = relationship_tag.split('_')[3]  # Extract entity type from the tag
                
            
            elif safe_get_text(rel_row, 'RUBRIEK_ID') == 'ORG':
                entity_type = 'OR'
            elif safe_get_text(rel_row, 'RUBRIEK_ID') == 'PERS':
                    entity_type = 'PS'
        
            rel_entity_link = 'http://www.odis.be/lnk/' + entity_type + '_' + rel_entity_id
            rel_entity_type = entity_type_mapping.get(entity_type)  # Use mapping
            

            # Add related entity as a node
            add_node(rel_entity_id, rel_entity_description, rel_entity_type, start, end, rel_entity_link)
            main_entity_type = entity_type_mapping.get(rel_entity_type)  # Use mapping

            # Add the relationship as an edge
            add_edge(main_entity_id, rel_entity_id, relation_type, start_date, end_date)

# Convert nodes and edges to DataFrames
nodes_df = pd.DataFrame(nodes)
edges_df = pd.DataFrame(edges)

# Save the nodes and edges to CSV files
nodes_df.to_csv('E:/DH/internship/dataset/nodes.csv', index=False)
edges_df.to_csv('E:/DH/internship/dataset/edges.csv', index=False)

# Display the heads and shapes of the dataframes
nodes_df.head(), edges_df.head(), nodes_df.shape, edges_df.shape

(      ID                                              Label        Soort  \
 0  10203  bestand / archief: Archief van de parochie Onz...      Archief   
 1  20084  Parochie Onze-Lieve-Vrouw Hemelvaart, Eksaarde...  Organisatie   
 2  26175                                  KSA-bond Eksaarde  Organisatie   
 3  47772                    Dierick, Clementine (1890-1946)      Persoon   
 4  25890                      Druwé, Franciscus (1832-1905)      Persoon   
 
   Van Datum Tot Datum                            Links  
 0      1652      2010  http://www.odis.be/lnk/AE_10203  
 1                None  http://www.odis.be/lnk/OR_20084  
 2                None  http://www.odis.be/lnk/OR_26175  
 3                1946  http://www.odis.be/lnk/PS_47772  
 4                1905  http://www.odis.be/lnk/PS_25890  ,
   Source Target          Soort relatie Van Datum Tot Datum
 0  10203  20084                   None      None      None
 1  10203  26175  bevat informatie over      None      None
 2  102

In [7]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load and parse the XML file
file_path = 'E:/DH/internship/dataset/combined.xml'  # Replace with your actual file path
tree = ET.parse(file_path)
root = tree.getroot()

# Initialize lists for nodes and edges
nodes = []
edges = []

# Define mappings for entity types
entity_type_mapping = {
    "AE": "Archief",
    "EV": "Gebeurtenis",
    "OB": "Gebouw",
    "OR": "Organisatie",
    "PS": "Persoon",
    "PB": "Publicatie",
    "FM": "Familie",
}

# Function to add a node if it's not already present
def add_node(node_id, node_name, node_type, node_periods, link):
    if not any(node['ID'] == node_id for node in nodes):
        nodes.append({
            "ID": node_id,
            "Label": node_name,
            "Soort": node_type,
            "Bestaansperiode": node_periods,
            "Links": link
        })

# Function to add an edge
def add_edge(source, target, relationship, edge_periods):
    if not any((edge['Source'] == source and edge['Target'] == target) or (edge['Source'] == target and edge['Target'] == source) for edge in edges):
        edges.append({
            "Source": source, 
            "Target": target, 
            "Label": relation_type, 
            "Bestaansperiode": edge_periods
        })
        

# Function to safely get text from an element
def safe_get_text(element, tag):
    child = element.find(tag)
    return child.text if child is not None else None

# Extract entities and their relationships
for row in root.findall('.//ROW'):
    # Extract main entity information
    main_entity_id = safe_get_text(row, 'ID')
    main_entity_type = entity_type_mapping.get(safe_get_text(row, 'RUBRIEK'))
    main_entity_description = safe_get_text(row, './/OMSCHRIJVING') or 'No Description'
    main_entity_link = 'http://www.odis.be/lnk/' + safe_get_text(row, 'RUBRIEK') + '_' + main_entity_id
    time_periods = []

    for period in row.findall('.//SYST_PERIODE_ROW'):
        start = safe_get_text(period, 'SYST_VAN_DATUM')
        end = safe_get_text(period, 'SYST_TOT_DATUM')
    # Replace '9999' with 'heden'
   
        end = end if end != '9999' else 'heden'
    # Only add to the list if at least one of the dates is known
        if start or end:
            period_str = f"{start} – {end}" if start and end else start or end
            time_periods.append(period_str)
        
    # Concatenate all time periods into a string separated by ", "
    time_periods_str = ", ".join(filter(None, time_periods))


    # Add main entity as a node
    add_node(main_entity_id, main_entity_description, main_entity_type, time_periods_str, main_entity_link)

    # Find all relationships for the main entity
    for relationship_tag in ['AE_ARCHIEFVORMER_ROW', 'EV_RL_EV_OR_ROW', 'EV_RL_EV_EV_ROW', 'EV_RL_EV_OB_ROW', 'EV_RL_EV_PS_ROW', 'EV_RL_EV_AE_ROW', 'EV_RL_EV_PB_ROW', 'EV_RL_EV_FM_ROW', 'AE_RL_AE_OR_ROW', 'AE_RL_AE_PS_ROW', 'AE_RL_AE_EV_ROW', 'AE_RL_AE_OB_ROW', 'AE_RL_AE_AE_ROW', 'AE_RL_AE_PB_ROW', 'AE_RL_AE_FM_ROW', 'PS_RL_PS_PS_ROW','PS_RL_PS_OR_ROW', 'PS_RL_PS_PB_ROW','PS_RL_PS_OB_ROW', 'PS_RL_PS_EV_ROW', 'PS_RL_PS_AE_ROW', 'PS_RL_PS_FM_ROW', 'OB_RL_OB_OB_ROW', 'OB_RL_OB_PS_ROW', 'OB_RL_OB_EV_ROW', 'OB_RL_OB_AE_ROW', 'OB_RL_OB_OR_ROW', 'OB_RL_OB_PB_ROW', 'OB_RL_OB_FM_ROW', 'OR_RL_OR_OR_ROW', 'OR_RL_OR_PS_ROW', 'OR_RL_OR_EV_ROW', 'OR_RL_OR_AE_ROW', 'OR_RL_OR_OB_ROW', 'OR_RL_OR_PB_ROW', 'OR_RL_OR_FM_ROW']:
        for rel_row in row.findall(f'.//{relationship_tag}'):
            # Extract related entity information
            rel_entity_id = safe_get_text(rel_row, 'ID') or safe_get_text(rel_row, 'EV_ID_RELATIE') or safe_get_text(rel_row, 'ORG_ID_RELATIE') or safe_get_text(rel_row, 'OR_ID_RELATIE') or safe_get_text(rel_row, 'OB_ID_RELATIE') or safe_get_text(rel_row, 'PS_ID_RELATIE') or safe_get_text(rel_row, 'PB_ID_RELATIE') or safe_get_text(rel_row, 'FM_ID_RELATIE') or safe_get_text(rel_row, 'AE_ID_RELATIE')
            rel_entity_description = safe_get_text(rel_row, 'OMSCHRIJVING') or safe_get_text(rel_row, 'EV_RELATIE') or safe_get_text(rel_row, 'PS_RELATIE') or safe_get_text(rel_row, 'ORG_RELATIE') or safe_get_text(rel_row, 'OR_RELATIE') or safe_get_text(rel_row, 'OB_RELATIE') or safe_get_text(rel_row, 'PB_RELATIE') or safe_get_text(rel_row, 'FM_RELATIE') or safe_get_text(rel_row, 'AE_RELATIE')
            relation_type = safe_get_text(rel_row, 'RELATIETYPE') or safe_get_text(rel_row, 'PS_RELATIE_TYPE') or safe_get_text(rel_row, 'ORG_RELATIE_TYPE')
            
            time_periods = []
            edge_time_periods = []
            start_date = safe_get_text(rel_row, 'VAN_DATUM')
            if safe_get_text(rel_row, 'TOT_DATUM') != '9999':
                end_date = safe_get_text(rel_row, 'TOT_DATUM')
            start = ''
            end = safe_get_text(rel_row, 'DEAD_YR')
            end = '' if end == '0' else (end if end != '9999' else 'heden')
            
            if start or end:
                period_str = f"{start} – {end}" 
                time_periods.append(period_str)
            time_periods_str = ", ".join(time_periods) if time_periods else ''

            if start_date or end_date:
                edge_period_str = f"{start_date} – {end_date}" 
                edge_time_periods.append(edge_period_str)
                
            if len(relationship_tag.split('_')) > 3 :
                entity_type = relationship_tag.split('_')[3]  # Extract entity type from the tag
                
            
            elif safe_get_text(rel_row, 'RUBRIEK_ID') == 'ORG':
                entity_type = 'OR'
            elif safe_get_text(rel_row, 'RUBRIEK_ID') == 'PERS':
                    entity_type = 'PS'
        
            rel_entity_link = 'http://www.odis.be/lnk/' + entity_type + '_' + rel_entity_id
            rel_entity_type = entity_type_mapping.get(entity_type)  # Use mapping
            

            # Add related entity as a node
            add_node(rel_entity_id, rel_entity_description, rel_entity_type, time_periods_str, rel_entity_link)
            main_entity_type = entity_type_mapping.get(rel_entity_type)  # Use mapping

            # Add the relationship as an edge
            add_edge(main_entity_id, rel_entity_id, relation_type, edge_time_periods)

# Convert nodes and edges to DataFrames
nodes_df = pd.DataFrame(nodes)
edges_df = pd.DataFrame(edges)

# Save the nodes and edges to CSV files
nodes_df.to_csv('E:/DH/internship/dataset/nodes.csv', index=False)
edges_df.to_csv('E:/DH/internship/dataset/edges.csv', index=False)

# Display the heads and shapes of the dataframes
nodes_df.head(), edges_df.head(), nodes_df.shape, edges_df.shape

(      ID                                              Label        Soort  \
 0  10203  bestand / archief: Archief van de parochie Onz...      Archief   
 1  20084  Parochie Onze-Lieve-Vrouw Hemelvaart, Eksaarde...  Organisatie   
 2  26175                                  KSA-bond Eksaarde  Organisatie   
 3  47772                    Dierick, Clementine (1890-1946)      Persoon   
 4  25890                      Druwé, Franciscus (1832-1905)      Persoon   
 
   Bestaansperiode                            Links  
 0     1652 – 2010  http://www.odis.be/lnk/AE_10203  
 1                  http://www.odis.be/lnk/OR_20084  
 2                  http://www.odis.be/lnk/OR_26175  
 3          – 1946  http://www.odis.be/lnk/PS_47772  
 4          – 1905  http://www.odis.be/lnk/PS_25890  ,
   Source Target                  Label Bestaansperiode
 0  10203  20084                   None              []
 1  10203  26175  bevat informatie over              []
 2  10203  47772  bevat informatie over    