To investigate and extract all the photographers who contributed to the Zeri Photo Archive, we started with some exploratory queries. We saw that the photographs are linked to their creators through a chain of two relations, such that ?photo crm:P94i_was_created_by ?creation . ?creation crm:P14_carried_out_by ?photographer .
The definition of "Photographer" in this context is therefore "an entity that carried out a creation process which created the resource". This was our first successful query:

In [None]:
# First exploratory query: for every resource typed as fentry:Photograph, follow
# crm:P94i_was_created_by to its creation event and crm:P14_carried_out_by to the
# agent of that event. There is no DISTINCT, so the same ?photographer is
# returned once per matching photograph/creation pair.
my_SPARQL_query = """
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?photographer  
WHERE { 
  	?x rdf:type <http://www.essepuntato.it/2014/03/fentry/Photograph> ; 
    crm:P94i_was_created_by ?creation .
    ?creation crm:P14_carried_out_by ?photographer .
 }
"""


Once we had found the photographers, we wanted to count the contributions that each of them made to the Zeri Archive, and we saw that the property <http://purl.org/spar/pro/holdsRoleInTime> was repeated for each photo they created.

In [2]:
#Import all the libraries we need at once: 
import rdflib
from rdflib import Namespace
from rdflib.namespace import DCTERMS
from rdflib.namespace import RDFS
from rdflib import URIRef, Literal
from rdflib.namespace import XSD
from SPARQLWrapper import SPARQLWrapper, JSON, GET
import csv 
import pandas as pd
from json import decoder
import requests
import json
import ssl

In [None]:
# NOTE(review): this installs the *unverified* HTTPS context factory for the
# whole process, i.e. certificate checks are skipped -- confirm it is still
# needed (the endpoint below is plain http).
ssl._create_default_https_context = ssl._create_unverified_context

# SPARQL endpoint of the Zeri Photo Archive
fototeca_endpoint = "http://data.fondazionezeri.unibo.it/sparql"

# prepare the query: one row per photographer label with the number of
# matching solutions, ordered from the most to the least frequent label.
# NOTE(review): COUNT is applied to a constant IRI, so it presumably counts
# every solution of the group rather than occurrences of the holdsRoleInTime
# property; COUNT(DISTINCT ?x) would count photographs -- confirm the intent.
my_SPARQL_query = """
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?photographer_label (COUNT(<http://purl.org/spar/pro/holdsRoleInTime>) as ?cnt)
WHERE { 
  	?x rdf:type <http://www.essepuntato.it/2014/03/fentry/Photograph> ; 
    crm:P94i_was_created_by ?creation .
    ?creation crm:P14_carried_out_by ?photographer .
    ?photographer rdfs:label ?photographer_label
 }
GROUP BY ?photographer_label 
ORDER BY DESC(?cnt) ?photographer_label
"""

# set the endpoint
sparql_ft = SPARQLWrapper(fototeca_endpoint)
# set the query
sparql_ft.setQuery(my_SPARQL_query)
# set the returned format
sparql_ft.setReturnFormat(JSON)
# get the results
results = sparql_ft.query().convert()

# newline='' lets the csv module control line endings (otherwise blank rows can
# appear on Windows); UTF-8 keeps accented names intact and matches the default
# encoding pandas.read_csv uses when the file is read back.
with open('photographers.csv', mode='w', newline='', encoding='utf-8') as my_file:
    my_writer = csv.writer(my_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    # write the column names
    my_writer.writerow(['photographer', 'contribution count'])
    for result in results["results"]["bindings"]:
        my_writer.writerow([result['photographer_label']['value'], result['cnt']['value'].strip()])

Looking at the data, we see that most of the photographs in the Zeri Archive are anonymous, and that the following four entities are apparently not individual people, as we first thought. We now need to find each of the photographers in another knowledge base in order to have more information about them.

In [4]:
# Load photographers.csv into a DataFrame and preview its first rows.
data = pd.read_csv("photographers.csv")
data.head()

Unnamed: 0,photographer,contribution count
0,Anonimo,13355
1,Brogi,2213
2,Istituto Centrale per il Catalogo e la Documen...,1539
3,"Alinari, Fratelli",1532
4,Anderson,1192


We decided to look for more information on Wikidata, so the first step was to find the Wikidata ID of each entity identifiable as a photographer according to the contextual definition of the Zeri Archive (this time using the .json format). We managed to do that via the Wikidata API.
The code below saves all the API responses, since many searches yielded more than one result; the API response contains a key ['search-continue'] which allows the code to keep appending new findings to the results list.
In practice, if there is more than one "James Anderson", all of them will be saved to the outcomes, since we don't know whether the first "James Anderson" is exactly the one we are looking for.

In [3]:
from json import decoder
from SPARQLWrapper.Wrapper import GET
import requests
import json
import rdflib
import pprint
from rdflib import Namespace
from rdflib.namespace import DCTERMS
from rdflib.namespace import RDFS
from rdflib import URIRef, Literal
from rdflib.namespace import XSD
import numpy as np 
import matplotlib.pyplot as plt 
from SPARQLWrapper import SPARQLWrapper, POST, DIGEST, JSON
from SPARQLWrapper import POST 
import ssl
import csv

# NOTE(review): replaces the default HTTPS context factory with the unverified
# one for the whole process, so certificate checks are skipped -- confirm this
# is intended.
ssl._create_default_https_context = ssl._create_unverified_context
#do not use
def to_text(path):
    """Return one string in which each photographer name is repeated.

    The CSV at ``path`` must have the columns ``photographer`` and
    ``contribution count``. Every name is repeated as many times as its
    count; the repetitions of all rows are joined with single spaces.
    """
    chunks = []
    with open(path, newline='') as csvfile:
        for record in csv.DictReader(csvfile):
            name = str(record['photographer'])
            times = int(record['contribution count'])
            # a zero count contributes an empty chunk
            chunks.append(' '.join([name] * times))
    return ' '.join(chunks)
        
def save_to_file(content, filename):
    """Write ``content`` to ``filename``, replacing any existing file.

    Args:
        content: the text to write.
        filename: path of the file to create or overwrite.

    The file is always encoded as UTF-8, so the result does not depend on
    the locale of the machine running the notebook.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

def reverse_string(string):
    """Turn a "Surname, Name" label into "Name Surname".

    Labels without the ", " separator are returned unchanged. Only the first
    two comma-separated parts are used; any further parts are dropped.
    """
    parts = string.split(', ')
    if len(parts) < 2:
        return string
    return str(parts[1]) + ' ' + str(parts[0])

# Load the SPARQL JSON export with the photographer labels; the context manager
# closes the file handle as soon as the content is parsed.
with open('fototeca_photographers.json', encoding='utf-8') as name_file:
    data = json.load(name_file)

# URL template of the search call, kept unchanged for any code that still
# formats it by hand. The loop below sends the same parameters through
# `params` so that requests URL-encodes the search term.
base_url = "https://www.wikidata.org/w/api.php?action=wbsearchentities&search=%s&language=en&format=json&limit=50"
wikidata_api = "https://www.wikidata.org/w/api.php"

dict_of_results = {}
list_of_conceptualuris = []

for row in data["results"]["bindings"]:
    # "Surname, Name" labels are searched as "Name Surname"
    search_string = reverse_string(row["photographer_label"]["value"]).strip()
    params = {
        "action": "wbsearchentities",
        "search": search_string,
        "language": "en",
        "format": "json",
        "limit": 50,
    }
    # Every hit is kept: when several entities share a name we cannot tell
    # yet which one is the photographer we are looking for.
    while True:
        response = requests.get(wikidata_api, params=params, timeout=30)
        response.raise_for_status()
        search_res = response.json()
        page = search_res['search']
        for hit in page:
            list_of_conceptualuris.append(hit['concepturi'])
        # 'search-continue' carries the offset of the next page; it has to be
        # sent back as `continue` (a 1, 2, 3... counter would make the pages
        # overlap and store the same entities many times).
        if not page or 'search-continue' not in search_res:
            break
        params['continue'] = search_res['search-continue']

def suit_for_SPARQL_dinner(list_of_uris):
    """Return a new list with every URI wrapped in angle brackets.

    ``"http://x"`` becomes ``"<http://x>"``, the form an IRI takes inside a
    SPARQL query. The input order is preserved.
    """
    return ['<' + uri + '>' for uri in list_of_uris]


# Space-separated list of bracketed URIs collected from the Wikidata search.
uris = ' '.join(suit_for_SPARQL_dinner(list_of_conceptualuris))

# save_to_file() is defined once, earlier in this cell; it is not re-declared here.
save_to_file(uris, "uris3.txt")

Let's prepare the URIs so that they can finally be queried on Wikidata.

In [None]:
from SPARQLWrapper.Wrapper import POST
import rdflib
from rdflib import Namespace
from rdflib.namespace import DCTERMS
from rdflib.namespace import RDFS
from rdflib import URIRef, Literal
from rdflib.namespace import XSD
from SPARQLWrapper import SPARQLWrapper, JSON, GET, POST
import csv 
import pandas as pd
from json import decoder
import requests
import ssl
import json 


# NOTE(review): replaces the default HTTPS context factory with the unverified
# one for the whole process, so certificate checks are skipped -- confirm this
# is intended.
ssl._create_default_https_context = ssl._create_unverified_context
# SPARQLWrapper client for the Wikidata query endpoint; requests are sent with
# POST (presumably because the VALUES lists built below get long -- TODO confirm).
sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
sparql.setMethod(POST)

###################################################################################

#adds the uris from a sparql query to a list, its best use is with the function below 
def invitation_list(data, string_to_match):
    """Collect the distinct values bound to one variable of a SPARQL result.

    Args:
        data: parsed SPARQL JSON results; the rows are read from
            ``data["results"]["bindings"]``.
        string_to_match: name of the variable whose values are collected.

    Returns:
        A list with every distinct value, in the order in which it first
        appears. An empty result set yields an empty list.

    Raises:
        KeyError: if a row has no binding for ``string_to_match``.
    """
    bindings = data["results"]["bindings"]
    # dict.fromkeys removes duplicates while keeping the first-seen order, so
    # the output is deterministic (a plain set would not be).
    return list(dict.fromkeys(row[string_to_match]["value"] for row in bindings))

#once you have the list you can: 
#clean the list (first below)
#suit the uris with the brackets (second below)
#takes a list of uris from a query and adds brackets for a sparql query 

def suit_for_SPARQL_dinner(list_of_uris):
    """Wrap each URI of ``list_of_uris`` in ``<`` and ``>``.

    Returns a new list in the same order, ready to be joined into a SPARQL
    query; the input is not modified.
    """
    return ['<' + uri + '>' for uri in list_of_uris]

#if you need to remove uris from a list, it's a basic linear search 
def remove_uninvited_guests_from_list(uninvited_guests, invitation_list):
    """Return the entries of ``invitation_list`` not in ``uninvited_guests``.

    Args:
        uninvited_guests: iterable of (hashable) entries to drop.
        invitation_list: iterable of candidate entries.

    Returns:
        A list of the remaining entries, without duplicates, in the order in
        which they first appear in ``invitation_list``. A list is always
        returned (possibly empty), so the result can be joined directly.

    Every dropped entry is reported on stdout ('got out' followed by the
    entry).
    """
    # One pass with a set lookup: an entry is dropped if it matches ANY
    # uninvited guest, and nothing is dropped when there are none.
    banned = set(uninvited_guests)
    kept = {}
    for guest in invitation_list:
        if guest in banned:
            print('got out')
            print(guest)
        else:
            kept[guest] = None
    return list(kept)

def afterparty_trash(filename, data_to_write):
    """Serialise ``data_to_write`` as JSON into ``filename``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(filename, 'w') as sink:
        json.dump(data_to_write, sink)
##################################################################################

# Load the saved SPARQL JSON result; the context manager closes the file
# handle once the content is parsed.
with open('py_files/json_files/final_photographer_Q5.json', encoding='utf-8') as name_file:
    data = json.load(name_file)
uris = suit_for_SPARQL_dinner(invitation_list(data, "photographer"))
string_uris = ' '.join(uris)

#first query: find out if there's some group of people that isn't a Q5 themselves
#if there's someone, check if there's some people related to them and update the list of uris

if_entity = """select ?otherpeople ?photographer
where {VALUES  ?photographer  {"""+string_uris+"""}
    ?photographer wdt:P31 ?o .
    FILTER(?o != wd:Q5) . 
    ?photographer rdfs:label ?label .
    FILTER(LANG(?label) = "en").  
    ?photographer ?property ?otherpeople .
    ?otherpeople wdt:P31 wd:Q5 .
}

GROUP BY ?otherpeople ?photographer"""

sparql.setQuery(if_entity)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()


#add the people linked to those entities to the uris (bracketed like the others)
uris.extend(suit_for_SPARQL_dinner(invitation_list(results, "otherpeople")))
#the entities returned by the query above are the ones to take out of the list
people_out = suit_for_SPARQL_dinner(invitation_list(results, 'photographer'))
#filter the extended list with the entities to remove
new_uris = remove_uninvited_guests_from_list(people_out, uris)
new_string_uris = ' '.join(new_uris)
##############################################
print(new_string_uris)

#now let's see the citizenships of the new uris selected
# NOTE(review): ?worklocation is projected but is neither bound in the pattern
# nor listed in GROUP BY -- confirm whether it should be removed or queried.
citizenships_query= """
select ?photographer ?label (group_concat(?citizenship) as ?citizenships) ?worklocation
where {VALUES ?photographer {""" + new_string_uris + """}
    ?photographer rdfs:label ?label .
    FILTER(LANG(?label) = "en").  
       optional {
          ?photographer wdt:P27 ?citizenship
}
}
group by ?photographer ?label
"""
sparql.setQuery(citizenships_query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
afterparty_trash('py_files/json_files/citizenships.json', results)

#let's check the dates related to the new uris 

In [None]:
from os import defpath
import networkx as nx
import matplotlib.pyplot as plt
import json 
from collections import defaultdict
from networkx.readwrite import json_graph
from networkx.readwrite.json_graph.node_link import node_link_data
from networkx.readwrite import json_graph;
from itertools import product

def afterparty_trash(filename, data_to_write):
    """Dump ``data_to_write`` to ``filename`` as JSON, overwriting the file."""
    with open(filename, 'w') as target:
        json.dump(data_to_write, target)

G = nx.MultiGraph()
# Load the saved work-location results; the context manager closes the file
# handle once the content is parsed.
with open('json_files/worklocations.json', encoding='utf-8') as name_file:
    data = json.load(name_file)

# Map every work-location label to the labels of the rows that mention it,
# in row order. Rows without a "worklabel" binding are skipped.
graphdict = {}
for row in data["results"]["bindings"]:
    if "worklabel" in row:
        graphdict.setdefault(row["worklabel"]["value"], []).append(row['label']['value'])
print(graphdict)

# One node per location, one node per person, one edge per (location, person) row.
for city, people in graphdict.items():
    G.add_node(city, vote="city")
    print(city)
    for person in people:
        G.add_node(person, vote="person")
        G.add_edge(city, person)

# Position of each node kind on the jet colormap (vmin=0, vmax=1):
# 0.25 for cities (blue), 0.7 for people (yellow).
vote_colors = {'city': 0.25, 'person': 0.7}
# `attrs` (not `data`) so that the loaded JSON is not overwritten by the loop.
color_map = [vote_colors[attrs['vote']] for _, attrs in G.nodes(data=True)]

nx.draw(G, vmin=0, vmax=1, cmap=plt.cm.jet, node_color=color_map, with_labels=True)
plt.show()

#with open('graph.json', 'w') as outfile:
    #json.dump(json_graph.node_link_data(G))
#x = json_graph.node_link_data(G)
#afterparty_trash('graph.json', x)

    
#italian cities with >= 4 are: Rome, Bologna, Milan, Florence, Venice
italian_cities = ['Rome','Bologna','Milan','Florence','Venice']
# Look each city up directly; the people list is copied so that graphdict is
# not shared, and a city missing from graphdict gets an empty list.
higher_n_cities = {c: list(graphdict.get(c, [])) for c in italian_cities}

IC = nx.MultiGraph()

print(higher_n_cities)

# One node per city, one node per person, one edge per (city, person) pair.
for city, people in higher_n_cities.items():
    IC.add_node(city, vote="city")
    for person in people:
        IC.add_node(person, vote="person")
        IC.add_edge(city, person)

# Position of each node kind on the jet colormap (vmin=0, vmax=1):
# 0.25 for cities (blue), 0.7 for people (yellow).
vote_colors = {'city': 0.25, 'person': 0.7}
# `attrs` (not `data`) so that the loaded JSON is not overwritten by the loop.
color_map = [vote_colors[attrs['vote']] for _, attrs in IC.nodes(data=True)]


nx.draw(IC, vmin=0, vmax=1, cmap=plt.cm.jet, node_color=color_map, with_labels=True)
plt.show()



# Demo graph: every category label is linked to its frequency value.
freqs = {"People and organizations": 10,"Artists, schools, periods": 17,"Genres and themes":11}

FR = nx.MultiGraph()
# Adding the (label, frequency) pairs as edges also creates both endpoints.
FR.add_edges_from(freqs.items())

nx.draw(FR, with_labels=True)
plt.show()