# Step 1
We import the .csv file of the Database of Art Historians filtered by nationality ('it') and manually integrated with Wikidata info. Then, we create a pandas dataframe to easily store and manipulate this information.

In [1]:
import pandas as pd

# python3 -m pip install qwikidata
# python library for working with sparql and linked data from WikiData
from qwikidata.sparql import return_sparql_query_results

In [2]:
# Build the first dataframe from the four columns we actually need.
data = pd.read_csv(
    "DoAH_StoriciItaliani_integrato.csv",
    sep=",",
    usecols=["Full Name", "Gender", "Collection", "Keeper"],
)

# Discard every row (axis 0) that has no value in the "Keeper" column, then
# renumber the index from 0 so it has no gaps left by the removed rows.
data = data.dropna(axis=0, subset=["Keeper"]).reset_index(drop=True)

# Persist the cleaned table; pickle is Python's native serialization format
# and pandas can read/write it directly.
data.to_pickle("00_first_db.pickle")

# Lift the row-display limit and show the first table (up to 120 rows).
pd.set_option("display.max_rows", None)
data.head(120)

Unnamed: 0,Full Name,Gender,Collection,Keeper
0,"Accascina, Maria",female,,Comune di Palermo
1,"Agostini, Leonardo",male,,Scuola Normale Superiore
2,"Alfieri, Vittorio",male,,Biblioteca Medicea Laurenziana
3,"Alinari, Giuseppe",male,Archivio Alinari,Museo Nazionale Alinari della Fotografia
4,"Alinari, Leopoldo",male,Archivio Alinari,Museo Nazionale Alinari della Fotografia
5,"Arcangeli, Francesco",male,"Fondo speciale Angelo, Gaetano, Bianca e Franc...",Biblioteca comunale dell'Archiginnasio
6,"Aretino, Pietro",male,Fondo Bongi,State Archives of Lucca
7,"Argan, Giulio Carlo",male,,Private archive in Rome
8,"Arias, Paolo Enrico",male,,Scuola Normale Superiore
9,"Baglione, Giovanni",male,,Archivio di Stato di Roma


# To resolve:
1. full names are reversed (`surname, name`)
2. we need to have a controlled entity (`wd:xyz`) for each name and keeper, to be able to link them to other info

In [3]:
def reformat_names(name):
    """Turn a ``surname, name`` string into ``name surname``.

    The input is split on commas (with or without a following space), the
    pieces are put in reverse order and joined with single spaces. Leading,
    trailing and repeated whitespace inside each piece is collapsed, and
    empty pieces are dropped. A string without a comma is returned with only
    its whitespace normalized.

    Args:
        name: the name as stored in the source table, e.g. "Accascina, Maria".

    Returns:
        The reordered name, e.g. "Maria Accascina".
    """
    # str.split() with no argument splits on any whitespace run, so joining
    # the result back with " " collapses duplicate/stray whitespace.
    pieces = [" ".join(piece.split()) for piece in name.split(",")]
    return " ".join(piece for piece in reversed(pieces) if piece)

In [4]:
# Rewrite every "Full Name" value by passing it through reformat_names
# (surname-first -> name-first).
data["Full Name"] = data["Full Name"].apply(reformat_names)

# NOTE(review): the result of describe() is not assigned or printed, and only
# the last expression of a notebook cell is displayed, so this line has no
# visible effect here.
data.describe()
# Preview the first rows to check the reordered names.
data.head()

Unnamed: 0,Full Name,Gender,Collection,Keeper
0,Maria Accascina,female,,Comune di Palermo
1,Leonardo Agostini,male,,Scuola Normale Superiore
2,Vittorio Alfieri,male,,Biblioteca Medicea Laurenziana
3,Giuseppe Alinari,male,Archivio Alinari,Museo Nazionale Alinari della Fotografia
4,Leopoldo Alinari,male,Archivio Alinari,Museo Nazionale Alinari della Fotografia


In [8]:
# SPARQL query template for looking up a person by exact label.
# - It is filled in with str.format: the single `{}` is the placeholder for
#   the name, while the doubled braces `{{` / `}}` are escaped literal braces
#   delimiting the WHERE group.
# - A candidate must have a label whose plain string equals the name, be an
#   instance (wdt:P31) of wd:Q5, have a wdt:P1412 value among the two listed
#   items, and a wdt:P106 value among the listed items.
# NOTE(review): presumably Q5 = human, P1412 = language spoken/written
# (Q652 Italian, Q397 Latin) and P106 = occupation (art historian, historian,
# writer, ...) -- confirm the identifiers on Wikidata.
# NOTE(review): the `wdt:` and `wd:` prefixes are not declared in the query;
# presumably the endpoint predefines them -- confirm.
historian_entity_from_label = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?artHistorian

WHERE {{
    ?artHistorian rdfs:label ?o
                  FILTER ( str(?o) = "{}" )  .
  
    ?artHistorian wdt:P31 wd:Q5 ;
                  wdt:P1412 ?language
                  FILTER (?language IN (wd:Q652, wd:Q397 ) ) .
    ?artHistorian wdt:P106 ?occupation
                  FILTER (?occupation IN (wd:Q1792450, wd:Q201788, wd:Q1622272, wd:Q3621491, wd:Q483501, wd:Q4164507, wd:Q4964182, wd:Q5697103, wd:Q33231, wd:Q1281618, wd:Q39631, wd:Q36180, wd:Q49757, wd:Q42973 ) ) .   
}}
"""

In [9]:
def find_historian_entity_from_name(name: str, retries: int = 3, pause: float = 2.0) -> str:
    """Look up the entity id for a person's name with the strict query.

    The name is inserted into the ``historian_entity_from_label`` template
    and the query is sent with ``return_sparql_query_results``. The id is the
    last path segment of the first returned ``artHistorian`` URI.

    Args:
        name: label to search for, e.g. "Maria Accascina".
        retries: how many times the request is attempted in total (at least
            one attempt is always made).
        pause: base number of seconds to wait between attempts; the wait
            grows linearly with the attempt number.

    Returns:
        The entity id (e.g. "Q98804253"), or "" when the query returns no
        binding.

    Raises:
        ValueError: if the response still cannot be decoded after the last
            attempt (``json.JSONDecodeError`` is a ``ValueError``).
    """
    import time

    # Escape backslashes and double quotes so the name cannot terminate the
    # double-quoted SPARQL string literal it is placed into.
    literal = name.replace("\\", "\\\\").replace('"', '\\"')
    query = historian_entity_from_label.format(literal)

    # A response that is not JSON surfaces as a decode error; presumably the
    # service throttled or timed out, so wait and retry before giving up.
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            res = return_sparql_query_results(query_string=query)
        except ValueError:
            if attempt == attempts:
                raise
            time.sleep(pause * attempt)
        else:
            break

    try:
        wdt_uri = res['results']['bindings'][0]['artHistorian']['value']
    except (IndexError, KeyError):
        # No match (or an unexpected response shape): report "not found".
        return ""
    return wdt_uri.split("/")[-1]

In [10]:
# Add a "Historian Entity" column by running the entity lookup on each value
# of "Full Name" (the lookup queries the SPARQL endpoint, so this is slow).
# NOTE(review): the recorded output of this cell is a JSONDecodeError, i.e.
# this run did not complete.
data["Historian Entity"] = data["Full Name"].apply(find_historian_entity_from_name)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Write the current dataframe to the same pickle file used for the first
# table, replacing its previous contents.
data.to_pickle("00_first_db.pickle")

In [None]:
# Remove the limit on how many rows pandas prints when displaying a dataframe.
pd.set_option("display.max_rows", None)

In [None]:
# Looser SPARQL query template: compared with historian_entity_from_label it
# has no occupation filter and only asks for an exact label match on an
# instance (wdt:P31) of wd:Q5.
# The single `{}` is the str.format placeholder for the name; `{{` / `}}`
# are escaped literal braces.
# NOTE(review): the OPTIONAL block is the first pattern in the group and
# nothing it binds is selected. Depending on how the endpoint evaluates a
# leading OPTIONAL, it may either have no effect or act as a mandatory
# restriction to wdt:P1412 wd:Q652 -- confirm the intended behaviour.
historian_entity_from_label_edited = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?artHistorian

WHERE {{
    OPTIONAL {{
      ?artHistorian wdt:P1412 wd:Q652
    }}
  
    ?artHistorian rdfs:label ?o 
                  FILTER ( str(?o) = "{}" )  .
    
  
    ?artHistorian wdt:P31 wd:Q5 .
}}
"""

In [None]:
def find_remaining_entity_from_name(row, retries: int = 3, pause: float = 2.0):
    """Fill in the entity id for a row that does not have one yet.

    Intended for ``DataFrame.apply(..., axis=1)``. A row whose
    "Historian Entity" value is truthy is returned unchanged; otherwise the
    row's "Full Name" is looked up with the looser
    ``historian_entity_from_label_edited`` template.

    Args:
        row: mapping-like row with "Full Name" and "Historian Entity" keys.
        retries: how many times the request is attempted in total (at least
            one attempt is always made).
        pause: base number of seconds to wait between attempts; the wait
            grows linearly with the attempt number.

    Returns:
        The existing entity id, the newly found id (last path segment of the
        first returned ``artHistorian`` URI), or "" when nothing is found.

    Raises:
        ValueError: if the response still cannot be decoded after the last
            attempt (``json.JSONDecodeError`` is a ``ValueError``).
    """
    import time

    # Keep whatever an earlier pass already resolved; no request is sent.
    if row["Historian Entity"]:
        return row["Historian Entity"]

    # Escape backslashes and double quotes so the name cannot terminate the
    # double-quoted SPARQL string literal it is placed into.
    literal = row["Full Name"].replace("\\", "\\\\").replace('"', '\\"')
    query = historian_entity_from_label_edited.format(literal)

    # A response that is not JSON surfaces as a decode error; presumably the
    # service throttled or timed out, so wait and retry before giving up.
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            res = return_sparql_query_results(query_string=query)
        except ValueError:
            if attempt == attempts:
                raise
            time.sleep(pause * attempt)
        else:
            break

    try:
        wdt_uri = res['results']['bindings'][0]['artHistorian']['value']
    except (IndexError, KeyError):
        # No match (or an unexpected response shape): report "not found".
        return ""
    return wdt_uri.split("/")[-1]

In [63]:
# Reload the dataframe from the pickle file written earlier in the notebook.
# NOTE(review): the next statement reads a "Historian Entity" column, so the
# pickle must have been saved after that column was added -- confirm the
# cells were run in that order.
data = pd.read_pickle("00_first_db.pickle")

# Second pass, row by row (axis=1) so the function sees both columns: rows
# that already have an entity keep it, the others are looked up again.
data["Historian Entity"] = data[["Full Name", "Historian Entity"]].apply(find_remaining_entity_from_name, axis=1)


Q98804253
Q1121086
Q778445
Q979574
Q19754060
Q1058859
Q19753744
Q471179
Q18934852
Q572091
Q428638
Q3617742
Q3892735
Q3768943
Q1010831
Q3055532
Q708563
Q2754987
Q913963
Q721545
Q192348
Q17453085
Q19754082
Q3852590
Q3770555
Q1757617
Q1555828
Q470377
Q3339580
Q19753788
Q2019868
Q1372695
Q19754125
Q19997569
Q481474
Q972400
Q1361667
Q19997557
Q3694262
Q1160847
Q3769699
Q19753755
Q4007892
Q323009
Q2262449
Q3438258
Q962495
Q183433
Q16271446
Q19753888
Q176938
Q981971
Q3107317
Q2474041
Q3694305
Q776990
Q7002138
Q6700132
Q3849075
Q3904250
Q15997104
Q59533611
Q2824734
Q2527217
Q177450
Q3849335
Q1089074
Q558155


In [67]:
# Preview the first rows, now including the "Historian Entity" column.
data.head()



Unnamed: 0,Full Name,Gender,Collection,Keeper,Historian Entity
0,Maria Accascina,female,,Comune di Palermo\n,Q98804253
1,Leonardo Agostini,male,,Scuola Normale Superiore,Q1054161
2,Vittorio Alfieri,male,,Biblioteca Medicea Laurenziana,
3,Giuseppe Alinari,male,Archivio Alinari,Museo Nazionale Alinari della Fotografia,
4,Leopoldo Alinari,male,Archivio Alinari,Museo Nazionale Alinari della Fotografia,


In [None]:
# Experimental query, still to be tested from Python.
# So far it has only been written and run in the Wikidata query interface:
# for "Vittorio Alfieri" it returns 4 results and the first one is wrong,
# but for some other entities it works.
#
# The UNION branches are alternative wdt:P27 values, i.e. all the different
# ways to say "Italy".
# NOTE(review): unlike the other templates, the name is hard-coded
# ("Giuseppe Alinari") and there is no `{}` placeholder. The braces are also
# mixed: WHERE/OPTIONAL use doubled `{{ }}` while the UNION groups use single
# `{ }`, so calling str.format on this string would fail on the UNION groups;
# sent as-is, the doubled braces are simply nested groups.
new_query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?artHistorian

WHERE {{
  
    { ?artHistorian wdt:P27 wd:Q172579 } UNION 
    { ?artHistorian wdt:P27 wd:Q38 } UNION 
    { ?artHistorian wdt:P27 wd:Q223936 } UNION
    { ?artHistorian wdt:P27 wd:Q48742118 } UNION
    { ?artHistorian wdt:P27 wd:Q5343710 } .
  
    OPTIONAL {{
      ?artHistorian wdt:P1412 wd:Q652
    }}
  
    ?artHistorian rdfs:label ?o 
                  FILTER ( str(?o) = "Giuseppe Alinari" )  .
    
  
    ?artHistorian wdt:P31 wd:Q5 .
}}

"""

