In [2]:
import rdflib
from rdflib import Namespace , Literal , URIRef
from rdflib.namespace import RDF , RDFS

In [2]:
# Graph that will hold the data. N-Quads is a quad (named-graph) serialization,
# hence a ConjunctiveGraph rather than a plain Graph.
g = rdflib.ConjunctiveGraph()

# Parse a local RDF file, stating its format explicitly. The statements are
# loaded into `g`; `result` only keeps whatever g.parse() returns.
result = g.parse("artchives_birthplaces.nq", format='nquads')

In [3]:
# Namespace helpers: attribute access on them builds full URIs (e.g. wd.Q5, wdt.P170).
wd = Namespace("http://www.wikidata.org/entity/") # remember: a prefix matches a URI up to the last slash (or hash sign, #)
wdt = Namespace("http://www.wikidata.org/prop/direct/")  # Wikidata direct ("truthy") properties
art = Namespace("https://w3id.org/artchives/")  # ARTchives namespace

In [4]:
# Collect the art historians found in graph "g": every object of a wdt:P170
# statement whose URI contains the Wikidata entity path, wrapped in angle
# brackets. Building a set removes repeated historians.
arthistorians_list = {
    f"<{creator!s}>"
    for _subject, _predicate, creator in g.triples((None, wdt.P170, None))
    if "wikidata.org/entity/" in str(creator)
}

print(arthistorians_list)

{'<http://www.wikidata.org/entity/Q1089074>', '<http://www.wikidata.org/entity/Q90407>', '<http://www.wikidata.org/entity/Q1296486>', '<http://www.wikidata.org/entity/Q6700132>', '<http://www.wikidata.org/entity/Q1715096>', '<http://www.wikidata.org/entity/Q457739>', '<http://www.wikidata.org/entity/Q60185>', '<http://www.wikidata.org/entity/Q1373290>', '<http://www.wikidata.org/entity/Q18935222>', '<http://www.wikidata.org/entity/Q55453618>', '<http://www.wikidata.org/entity/Q2824734>', '<http://www.wikidata.org/entity/Q1271052>', '<http://www.wikidata.org/entity/Q1641821>', '<http://www.wikidata.org/entity/Q19997512>', '<http://www.wikidata.org/entity/Q61913691>', '<http://www.wikidata.org/entity/Q995470>', '<http://www.wikidata.org/entity/Q1629748>', '<http://www.wikidata.org/entity/Q85761254>', '<http://www.wikidata.org/entity/Q3051533>', '<http://www.wikidata.org/entity/Q41616785>', '<http://www.wikidata.org/entity/Q3057287>', '<http://www.wikidata.org/entity/Q537874>', '<http://w

In [181]:
# Goal: collect only the labels of the art historians.
# The previous attempt returned an empty set because it used wd.Q5 as the
# subject of the wdt:P170 statements and put RDFS.label in the subject position
# instead of the predicate position.
from rdflib.namespace import RDFS
AH = set()

# Step 1: the historians are the objects of wdt:P170 statements (same selection
# as the cell that builds arthistorians_list: Wikidata entities only).
for _, _, historian in g.triples((None, wdt.P170, None)):
    if "wikidata.org/entity/" not in str(historian):
        continue
    # Step 2: a label is the object of "<historian> rdfs:label <label>".
    for _, _, label in g.triples((historian, RDFS.label, None)):
        AH.add(str(label).strip())

print(AH)

set()


# Step 1
We import the .csv file of the Database of Art Historians filtered by nationality ('it') and create a pandas dataframe to easily store and manipulate this information.

In [1]:
import pandas as pd

In [2]:

# Build the first dataframe in one chain:
#   1. read only the columns we need from the CSV
#   2. drop the rows (axis 0) whose "Archives" value is NaN
#   3. drop rows that are exact duplicates
#   4. renumber the index, because the dropped rows left gaps in it
data = (
    pd.read_csv(
        "DoAH_StoriciItaliani_csvseparatodavirgole.csv",
        usecols=["Full Name", "Gender", "Archives"],
    )
    .dropna(axis=0, subset=["Archives"])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Pickle is a Python serialization format that pandas reads and writes
# natively; keep a snapshot of the cleaned table on disk.
data.to_pickle("database_degli_eroi.pickle")

# Preview of the first table:
data.head()

Unnamed: 0,Full Name,Gender,Archives
0,"Accascina, Maria",female,"Comune di Palermo Concetta Di Natale, Maria Ma..."
1,"Argan, Giulio Carlo",male,Private Archive in Rome.
2,"Bellosi, Luciano",male,"\n\tArchivio Lucino Bellosi, Biblioteca Umanis..."
3,"Bertini Calosso, Achille",male,"Fondo Bertini Calosso, Istituto Nazionale di A..."
4,"Bianchi Bandinelli, Ranuccio",male,"Ranuccio Bianchi Bandinelli, lâ€™Archivio di S..."


# To resolve:
0. some UTF-8 characters are displayed incorrectly (mojibake, e.g. `lâ€™Archivio` instead of `l’Archivio`): fix with regex?
1. full names are reversed (`surname, name`)
2. we need to have a controlled entity (`wd:xyz`) for each name, to be able to link them to other info
3. archives are a list of strings 

In [3]:
def reformat_names(name):
    """Reverse a comma-separated ``surname, name`` string into ``name surname``.

    The comma may or may not be followed by whitespace (``"surname,name"`` and
    ``"surname, name"`` are both handled). Each part is trimmed and empty parts
    are dropped, so irregular spacing around the comma never leaks into the
    result. A string without a comma is returned trimmed but otherwise
    unchanged.

    Args:
        name: The name to reorder, e.g. ``"Argan, Giulio Carlo"``.

    Returns:
        The parts in reverse order joined by single spaces,
        e.g. ``"Giulio Carlo Argan"``.
    """
    # Split on the bare comma and trim afterwards: splitting on ", " would miss
    # "surname,name" and would keep extra blanks from "surname,  name".
    parts = [part.strip() for part in name.split(",")]
    return " ".join(part for part in reversed(parts) if part)

In [4]:
# Put names in "name surname" order, turn hyphens into spaces, then collapse
# every run of whitespace into a single space. Hyphens are replaced *before*
# the whitespace clean-up so the blanks they introduce are normalised as well
# (a single "  " -> " " pass run first would leave "A - B" with three spaces).
data["Full Name"] = (
    data["Full Name"]
    .apply(reformat_names)
    .str.replace("-", " ", regex=False)
    .str.split()      # split on any run of whitespace ...
    .str.join(" ")    # ... and re-join with exactly one space
)

data.head()

Unnamed: 0,Full Name,Gender,Archives
0,Maria Accascina,female,"Comune di Palermo Concetta Di Natale, Maria Ma..."
1,Giulio Carlo Argan,male,Private Archive in Rome.
2,Luciano Bellosi,male,"\n\tArchivio Lucino Bellosi, Biblioteca Umanis..."
3,Achille Bertini Calosso,male,"Fondo Bertini Calosso, Istituto Nazionale di A..."
4,Ranuccio Bianchi Bandinelli,male,"Ranuccio Bianchi Bandinelli, lâ€™Archivio di S..."


In [5]:
# python3 -m pip install qwikidata
# python library for working with sparql and linked data from WikiData
from qwikidata.sparql import return_sparql_query_results



In [6]:
# SPARQL template used to look up an entity by its label.
# - "{}" is the only str.format() placeholder: the label text to match.
#   The doubled braces "{{" / "}}" are literal SPARQL braces escaped for
#   str.format().
# - The two FILTERs keep labels whose text equals the placeholder value and
#   whose language tag is "it".
# - presumably wdt:P31 wd:Q5 means "instance of human" and
#   wdt:P106 wd:Q1792450 means "occupation: art historian" - TODO confirm.
# NOTE(review): the wd:/wdt: prefixes are not declared in the query; it
#   presumably relies on the endpoint predefining them - confirm.
# NOTE(review): the OPTIONAL block precedes the required patterns; check that
#   the endpoint really treats the occupation as optional in this position.
# NOTE(review): the placeholder value is inserted inside a double-quoted
#   literal, so it must not contain unescaped double quotes.
historian_entity_from_label = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?artHistorian ?o
WHERE {{
    OPTIONAL{{
        ?artHistorian wdt:P106 wd:Q1792450 .
    }}
    ?artHistorian wdt:P31 wd:Q5 .
    ?artHistorian rdfs:label ?o 
                  FILTER ( str(?o) = "{}" )  
                  FILTER ( lang(?o) = "it" ).
}}
"""

In [7]:
def find_historian_entity_from_name(name: str):
    """Look up the entity id that matches a person's label.

    The name is inserted into the ``historian_entity_from_label`` template and
    the resulting query is executed with ``return_sparql_query_results``.

    Args:
        name: Full name in "name surname" order. A leading ``"conte "`` title
            is dropped before the lookup.

    Returns:
        The last path segment of the first ``artHistorian`` binding
        (e.g. ``"Q778445"``), or ``""`` when the response holds no such
        binding.
    """
    title = "conte "
    # Remove the title as an exact prefix. str.lstrip("conte ") would instead
    # strip any leading run of the characters c/o/n/t/e/space and eat into the
    # name itself ("conte enrico Rossi" -> "rico Rossi").
    if name.startswith(title):
        name = name[len(title):].lstrip()
    # Escape backslashes and double quotes so the name cannot terminate the
    # double-quoted SPARQL literal it is placed in.
    escaped = name.replace("\\", "\\\\").replace('"', '\\"')
    query = historian_entity_from_label.format(escaped)
    res = return_sparql_query_results(query_string=query)
    try:
        entity_uri = res['results']['bindings'][0]['artHistorian']['value']
    except (IndexError, KeyError):
        # No binding (or an unexpected response shape): report "not found".
        return ""
    return entity_uri.split("/")[-1]

In [8]:
# Resolve every full name to an entity id. find_historian_entity_from_name runs
# one SPARQL query per call, so this issues one request per row; names without
# a match get "".
data["Historian Entity"] = data["Full Name"].apply(find_historian_entity_from_name)
# Overwrite the earlier pickle so the saved table includes the new column.
data.to_pickle("database_degli_eroi.pickle")
data.head()

Unnamed: 0,Full Name,Gender,Archives,Historian Entity
0,Maria Accascina,female,"Comune di Palermo Concetta Di Natale, Maria Ma...",Q98804253
1,Giulio Carlo Argan,male,Private Archive in Rome.,Q778445
2,Luciano Bellosi,male,"\n\tArchivio Lucino Bellosi, Biblioteca Umanis...",Q1058859
3,Achille Bertini Calosso,male,"Fondo Bertini Calosso, Istituto Nazionale di A...",Q19753744
4,Ranuccio Bianchi Bandinelli,male,"Ranuccio Bianchi Bandinelli, lâ€™Archivio di S...",Q471179
