# Creating IICONGRAPHarco Knowledge Graph

In this document, we will generate the IICONGRAPHarco Knowledge Graph by first extracting the data from ArCo, translating it (a very slow process that could take more than 24 hours), and then converting it to the shortcut version of the ICON ontology. If you just want to reproduce IICONGRAPHarco1.0 (the version on Zenodo), jump to the section "Converting ArCo's data".

## Extracting ArCo's data

In these blocks, we extract ArCo's data from its [SPARQL endpoint](https://dati.beniculturali.it/sparql). This endpoint returns at most 10000 results per query. We want to extract all the entities of type HistoricOrArtisticProperty whose description starts with "lettura iconografica:" (transl. = "Iconographic reading:"). Because we can only retrieve up to 10000 results per query, we split the queries according to the first letter after the ":". We first count the occurrences for each first letter to make sure that no single letter has more than 10000 results.

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [4]:
def sparql_query_setting(query, endpoint):
    """
    Run a SPARQL query against an endpoint and return the decoded JSON response.

    Parameters:
    - query (str): The SPARQL query text to send.
    - endpoint (str): URL of the SPARQL service that receives the query.

    Returns:
    - dict: The query response, requested in JSON format and converted by
      SPARQLWrapper.
    """
    client = SPARQLWrapper(endpoint)

    # Ask the service for JSON so that convert() yields plain Python objects
    client.setReturnFormat(JSON)
    client.setQuery(query)

    # Send the request and decode the response in one step
    return client.query().convert()

# Count, for every letter of the alphabet, how many descriptions match
# "lettura iconografica: <letter>" (this should take around 1-2 minutes)
arco_endpoint = "https://dati.cultura.gov.it/sparql"
letters = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
lettersl = letters.split(" ")
countperletter = dict()

# The query text is constant apart from the letter, so it is kept as a
# head and a tail that are glued around the current letter.
count_query_head = '''
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX arco-cd: <https://w3id.org/arco/ontology/context-description/>
    PREFIX arco-dd: <https://w3id.org/arco/ontology/denotative-description/>
    PREFIX arco: <https://w3id.org/arco/ontology/arco/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>

    SELECT (COUNT(?Concept) AS ?tot) WHERE {{
        ?Concept a <https://w3id.org/arco/ontology/arco/HistoricOrArtisticProperty> ;
                 <https://w3id.org/arco/ontology/core/description> ?description .
        FILTER(REGEX(?description, "lettura iconografica: '''
count_query_tail = '''", "i"))
    }} LIMIT 10000
    '''

for letter in lettersl:
    arco_query = count_query_head + letter + count_query_tail

    # Run the count query for this letter
    res = sparql_query_setting(arco_query, arco_endpoint)

    # Each binding carries the total as a string under "tot"
    countperletter.update(
        {letter: int(binding["tot"]["value"]) for binding in res["results"]["bindings"]}
    )


In [5]:
countperletter

{'a': 1565,
 'b': 1261,
 'c': 1677,
 'd': 374,
 'e': 169,
 'f': 11846,
 'g': 624,
 'h': 11,
 'i': 285,
 'j': 17,
 'k': 7,
 'l': 543,
 'm': 1201,
 'n': 304,
 'o': 147,
 'p': 1350,
 'q': 27,
 'r': 1524,
 's': 1377,
 't': 580,
 'u': 232,
 'v': 1765,
 'w': 4,
 'x': 2,
 'y': 3,
 'z': 13}

In [None]:
# The only letter with more than 10000 results is "f", so only for that
# letter the extraction has to be split into two pages.
#
# For every artwork we extract its URI, its description and its dating.
# NOTE(review): the two "f" pages use LIMIT/OFFSET without ORDER BY, so the
# endpoint is not obliged to return the rows in a stable order between the
# two requests - confirm that no rows are skipped or duplicated.


def _fetch_artwork_rows(letter, paging_clause):
    """
    Fetch (uri, description, date) tuples for one starting letter.

    Parameters:
    - letter (str): Letter expected right after "lettura iconografica: ".
    - paging_clause (str): SPARQL solution modifiers appended after the
      WHERE block, e.g. "LIMIT 8000" or "OFFSET 8000 LIMIT 4000".

    Returns:
    - list: One (concept, description, date) tuple per result row; date is
      None when the artwork has no dc:date.
    """
    query = '''
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX arco-cd: <https://w3id.org/arco/ontology/context-description/>
    PREFIX arco-dd: <https://w3id.org/arco/ontology/denotative-description/>
    PREFIX arco: <https://w3id.org/arco/ontology/arco/>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>

    select distinct ?Concept ?description ?date where {
        ?Concept a <https://w3id.org/arco/ontology/arco/HistoricOrArtisticProperty> ;
                 <https://w3id.org/arco/ontology/core/description> ?description .
        FILTER(regex(?description, "lettura iconografica: ''' + letter + '''", "i"))
        OPTIONAL { ?Concept dc:date ?date }
    } ''' + paging_clause + '''
    '''
    res = sparql_query_setting(query, arco_endpoint)

    rows = []
    for result in res["results"]["bindings"]:
        # ?date comes from an OPTIONAL pattern: when it is unbound the key is
        # absent from the JSON binding, so it must not be indexed directly.
        date = result["date"]["value"] if "date" in result else None
        rows.append((result["Concept"]["value"], result["description"]["value"], date))
    return rows


# A list of tuples is more convenient for the translation step later on
data_for_letter_l = []

# Extract the data for every letter, and the first 8000 rows for "f"
# (it should take 2-3 minutes)
for letter in lettersl:
    data_for_letter_l.extend(_fetch_artwork_rows(letter, "LIMIT 8000"))

# Take the remaining rows of "f"
data_for_letter_l.extend(_fetch_artwork_rows("f", "OFFSET 8000 LIMIT 4000"))


### First Cleaning

Methodology for cleaning: the user chooses, category by category, which categories to keep.
**Hint**: "Nomi" and "Luoghi" do not contain useful information. If you want to create IICONGRAPHarco, you can skip ahead to **Converting ArCo's data**.

In [None]:
# Category labels the user rejected; segments with these labels are dropped
excluded = set()

# Category labels the user accepted; segments with these labels are kept
checked = set()

def reconstruct(text):
    """
    Rebuild a description keeping only the segments whose category is accepted.

    The description is split on "." into segments of the form
    "category:content". Categories not yet seen are printed and the user is
    asked interactively whether to keep them; the answer is remembered in the
    module-level sets `checked` (kept) and `excluded` (dropped).

    Args:
    - text (str): The input string to be reconstructed.

    Returns:
    - str: The kept segments, each terminated by a period, if at least one
      segment survives. Otherwise False (empty input, a segment that does not
      contain exactly one ":", or every segment excluded).
    """
    # An empty description has nothing to keep (and has no last character)
    if not text:
        return False

    # Ignore a single trailing period so it does not create an empty segment
    body = text[:-1] if text.endswith(".") else text

    kept = []
    for part in body.split("."):
        type_text = part.split(":")

        # Every segment must be exactly "category:content"
        if len(type_text) != 2:
            return False

        category = type_text[0]
        if category in excluded:
            continue

        if category not in checked:
            # First time this category shows up: let the user decide
            print(category)
            if input("Is this good?") == "y":
                checked.add(category)
            else:
                excluded.add(category)
                continue

        kept.append(part + ".")

    newtext = "".join(kept)
    return newtext if len(newtext) > 1 else False


In [None]:
# Keep only the artworks whose description survives the category filtering.
# NOTE(review): this cell used to iterate `data_for_letter_l_ico`, a name that
# is not defined anywhere in the notebook; the extraction step stores its
# tuples in `data_for_letter_l` - confirm no intermediate filtering is missing.
data_ico_clean_1 = []
for concept, description, date in data_for_letter_l:
    resu = reconstruct(description)
    # reconstruct() returns False when nothing usable is left
    if resu is not False:
        data_ico_clean_1.append((concept, resu, date))

## Translating ArCo's data

This step could take more than 24 hours. If you want to recreate IICONGRAPHarco1.0, you can skip this step and go to **Converting ArCo's data**.

In [None]:
from deep_translator import (GoogleTranslator,
                             ChatGptTranslator,
                             MicrosoftTranslator,
                             PonsTranslator,
                             LingueeTranslator,
                             MyMemoryTranslator,
                             YandexTranslator,
                             PapagoTranslator,
                             DeeplTranslator,
                             QcriTranslator,
                             single_detection,
                             batch_detection)
import time

In [None]:
# tqdm is only imported further down in the notebook, so import it here too
from tqdm import tqdm

# List of (uri, translated description, date) tuples
arcofinal = []

# One translator instance is enough for the whole run
translator = GoogleTranslator(source="it", target="en")

for position, el in enumerate(tqdm(data_ico_clean_1)):
    # Skip what is already translated, so the loop can resume after a failure
    # when `arcofinal` is kept from a previous run
    if position < len(arcofinal):
        continue

    try:
        # Translate the description from Italian to English
        t2 = translator.translate(text=el[1])
    except Exception as error:
        # Stop at the first failure and report where and why it happened
        print("Translation failed at element number " + str(len(arcofinal) + 1)
              + " out of " + str(len(data_ico_clean_1)) + ": " + repr(error))
        break

    # Add a delay to avoid excessive API requests
    time.sleep(3)

    arcofinal.append((el[0], t2, el[2]))


## Converting ArCo's data

In this step, we will convert ArCo's triples to the structure of ICON 2.0 using rdflib.

### IF YOU SKIPPED THE PREVIOUS STEPS

You can import the arco_upload.p file via pickle

In [14]:
import pickle
# Load the previously saved data into `arcofinal` (presumably the same list of
# (uri, translated description, date) tuples produced by the translation step).
# NOTE: unpickling can execute arbitrary code - only load an arco_upload.p
# file obtained from a source you trust.
with open("arco_upload.p", "rb") as input_file:
    arcofinal = pickle.load(input_file)

### Preliminary steps

We first parse HyperReal and we create some secondary functions to handle the alignment between ArCo's elements and HyperReal

In [None]:
import urllib.parse
import rdflib
from rdflib import URIRef, BNode, Literal, Graph, Namespace, ConjunctiveGraph
from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \
                           PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \
                           VOID, XMLNS, XSD
from rdflib import namespace
import pandas

from rdflib import Graph, Namespace, RDFS

# Namespaces of the simulation ontology and of the HyperReal data
sim_on = "https://w3id.org/simulation/ontology/"
hrdata = "https://w3id.org/simulation/data/"
sim_n = Namespace(sim_on)
hrd = Namespace(hrdata)

# Load HyperReal from its Turtle serialisation and register the prefixes
hr = Graph()
hr.parse("https://raw.githubusercontent.com/br0ast/simulationontology/main/KG/kg.ttl", format="ttl")
hr.bind("sim", sim_n)
hr.bind("hr", hrd)

# Labels of simulacra that carry a parenthesised part, e.g. "name (type)"
setoftypes = {
    str(label)
    for simulacrum in hr.objects(None, sim_n.hasSimulacrum, None)
    for label in hr.objects(simulacrum, RDFS.label, None)
    if "(" in str(label)
}

# The text between the first "(" and the following ")" of every label,
# keeping only the single-word ones
setoftypes2 = {
    qualifier
    for qualifier in (entry.split("(")[1].split(")")[0] for entry in setoftypes)
    if " " not in qualifier
}

def combinewithtype(string):
    """
    Build one candidate name per known type by suffixing the type to a string.

    Args:
    - string (str): The base string every type is appended to.

    Returns:
    - list: For each entry of setoftypes2, the base string followed by that
      entry passed through to_camel_case() and then title-cased.
    """
    return [string + to_camel_case(kind).title() for kind in setoftypes2]


#### Auxiliary function for the camelCase URIs

In [15]:
import re
import string

def to_camel_case(input_string):
    """
    Turn a free-text string into a camelCase identifier.

    All characters of string.punctuation are removed first (this includes
    "_" and "-"), then the text is split on whitespace; the first chunk is
    lower-cased and every following chunk is capitalised.

    Args:
    - input_string (str): The text to convert.

    Returns:
    - str: The camelCase form of the text.
    """
    without_punctuation = input_string.translate(
        str.maketrans('', '', string.punctuation)
    )

    # re.split always yields at least one chunk, so unpacking the head is safe
    head, *tail = re.split(r'[_\s]+', without_punctuation)

    return head.lower() + ''.join(chunk.capitalize() for chunk in tail)


### Main Symbolism Function / Match function

In [None]:
# Candidate names already known to have no simulacrum in HyperReal
not_found = set()

# Opening words of the returned description, keyed by simulation type;
# any other simulation type falls back to "symbol of "
_SYMBOL_PHRASE_PREFIXES = {
    "https://w3id.org/simulation/ontology/ProtectionSimulation": "symbolic protection against ",
    "https://w3id.org/simulation/ontology/AttributeSimulation": "symbolic attribute of ",
    "https://w3id.org/simulation/ontology/HealingSimulation": "symbolic cure for ",
    "https://w3id.org/simulation/ontology/AssociationSimulation": "symbolically associated with ",
}

def add_symb(graph, artwork_uri, label):
    """
    Add the symbolic meanings that HyperReal records for a label to the graph.

    Candidate simulacrum names are derived from the label (camelCase form,
    with and without each known type suffix, and the same for the label
    without a trailing "s"). For every candidate that is a simulacrum in
    `hr`, the matching simulations are linked to the artwork.

    Args:
    - graph: The RDF graph to which information will be added.
    - artwork_uri (str): URI of the artwork the simulations are attached to.
    - label (str): Label for which symbolic information is to be added.

    Returns:
    - str: A parenthesised, comma-separated description of the symbolic
      meanings added to the graph, or "nope" when none was found.
    """
    # An empty label cannot match anything (and has no last character)
    if not label:
        return "nope"

    # Use the artwork passed by the caller, not a global of the same meaning
    artwork = URIRef(artwork_uri)

    camel_label = to_camel_case(label)
    possible_labels = combinewithtype(camel_label)
    possible_labels.append(camel_label)

    # Also try the label without its final "s" (naive singular form)
    if label[-1] == "s":
        camel_singular = to_camel_case(label[:-1])
        possible_labels.extend(combinewithtype(camel_singular))
        possible_labels.append(camel_singular)

    string_to_return = []
    for lab in possible_labels:
        if lab in not_found:
            continue

        simumu = hrdata + lab
        if (None, sim_n.hasSimulacrum, URIRef(simumu)) not in hr:
            # Remember the miss so this candidate is never looked up again
            not_found.add(lab)
            continue

        resu = hr.query("""
                                prefix kb: <https://w3id.org/simulation/data/>
                                prefix owl: <http://www.w3.org/2002/07/owl#>
                                prefix prov: <http://www.w3.org/ns/prov#>
                                prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                                prefix sim: <https://w3id.org/simulation/ontology/>
                                prefix wn: <http://wordnet-rdf.princeton.edu/lemma/>
                                select ?simulation ?context ?rc ?rctype ?simulationtype ?contextLabel ?rcLabel where {
                                    values ?rctype {sim:hasRealityCounterpart sim:elicitedRealityCounterpart sim:preventedRealityCounterpart sim:easedRealityCounterpart
                                    sim:healedRealityCounterpart sim:restoredRealityCounterpart}
                                    ?simulation sim:hasSimulacrum <""" + simumu + """>;
                                    ?rctype ?rc;
                                    sim:hasContext ?context;
                                    a ?simulationtype .
                                    ?context rdfs:label ?contextLabel .
                                    ?rc rdfs:label ?rcLabel .
                                    FILTER (?context != kb:generalOrUnknown)
                                }""")

        for simulation, context, rc, rctype, simulationtype, context_label, rc_label in resu:
            if not simulation:
                continue

            simulation_node = URIRef(simulation)
            graph.add((artwork, icon_n.iconographicallyDepicts, simulation_node))
            graph.add((simulation_node, RDF.type, URIRef(simulationtype)))
            graph.add((simulation_node, sim_n.hasSimulacrum, URIRef(simumu)))
            graph.add((simulation_node, URIRef(rctype), URIRef(rc)))
            graph.add((simulation_node, sim_n.hasContext, URIRef(context)))

            opening = _SYMBOL_PHRASE_PREFIXES.get(str(simulationtype), "symbol of ")
            string_to_return.append(
                opening + str(rc_label) + " in a " + str(context_label) + " cultural context"
            )

    if string_to_return:
        return "(" + ", ".join(string_to_return) + ")"
    return "nope"


### KG construction

In [None]:
from tqdm import tqdm
from rdflib import ConjunctiveGraph, RDF, RDFS, Literal, URIRef, Namespace

# Base IRIs used while building the knowledge graph
icon = "https://w3id.org/icon/ontology/"
uarco = "https://w3id.org/iicongraph/data/"
dolce = "http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#"
sim_on = "https://w3id.org/simulation/ontology/"
hrdata = "https://w3id.org/simulation/data/"
arcor = "https://w3id.org/arco/resource/HistoricOrArtisticProperty/"

# rdflib namespace objects for the IRIs above
hrd = Namespace(hrdata)
icon_n = Namespace(icon)
uarcon = Namespace(uarco)
arcorn = Namespace(arcor)
dolcen = Namespace(dolce)
sim_n = Namespace(sim_on)

# Graph that will hold the result, with one prefix registered per namespace
g = ConjunctiveGraph()
for _prefix, _bound_namespace in (
    ("hr", hrd),
    ("icon", icon_n),
    ("dul", dolcen),
    ("iig", uarcon),
    ("sim", sim_n),
    ("arcor", arcorn),
):
    g.bind(_prefix, _bound_namespace)

# Tuples set aside because their reading mentions an allegory or a symbol
complex = []

# Convert every translated description into triples of the graph `g`.
# A description is handled as period-separated segments of the form
# "category: value; value; ...".
# NOTE(review): check_no_capital() and check_no_capital_el() are not defined
# in the visible part of the notebook; they must exist before this cell runs.
for arco_tup in tqdm(arcofinal):
    if ";" in arco_tup[1] and "." in arco_tup[1]:
        arturi = URIRef(arco_tup[0])
        g.add((arturi, RDF.type, icon_n.Artwork))

        # Expand "St." first so its period is not taken as a segment separator
        category_text = arco_tup[1].replace("St.", "Saint").split(".")

        for cat_t in category_text:
            fields = cat_t.split(":")
            if len(fields) < 2:
                # Segment without a "category: values" structure
                continue

            cat = fields[0].lower()
            text = fields[1].strip()

            if "product category" in cat or "event type" in cat:
                # Advertised products/events become iconological meanings
                for value in text.split(";"):
                    meaning = "promotion of " + value.strip()
                    meaning_uri = URIRef(uarco + to_camel_case(meaning))
                    g.add((arturi, icon_n.iconologicallyRepresents, meaning_uri))
                    g.add((meaning_uri, RDFS.label, Literal(meaning)))
                continue

            text_reading_split = [word.strip() for word in text.split(";")]
            reading_passes_check = check_no_capital(text_reading_split) is True

            # Allegorical/symbolic readings are set aside instead of converted
            if "allegory" in text or "symbol" in text:
                complex.append(arco_tup)
                continue

            for word in text_reading_split:
                if len(word) <= 1:
                    continue

                # Only a value passing both checks is an iconographic subject;
                # everything else is recorded at the pre-iconographic level
                if reading_passes_check and check_no_capital_el(word) is True:
                    predicate = icon_n.iconographicallyDepicts
                else:
                    predicate = icon_n.preiconographicallyDepicts

                subject_uri = URIRef(uarco + to_camel_case(word))
                g.add((arturi, predicate, subject_uri))
                g.add((subject_uri, RDFS.label, Literal(word)))

                # Link the HyperReal simulations known for this value, if any
                add_symb(g, arturi, word)