In [1]:
import rdflib, mimeparse, requests
from rdflib import URIRef
from rdflib.namespace import RDF, RDFS, VOID

In [2]:
abr = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/")
abrc = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/classification/")
abri = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/instance/")
abrw = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/work/")
bf = rdflib.Namespace("http://id.loc.gov/ontologies/bibframe/")
arm = rdflib.Namespace("https://w3id.org/arm/core/ontology/0.1/")

In [3]:
def initialize_abr_graph():
    g = rdflib.Graph()
    g.bind("abr", "https://wsburroughs.link/anything-but-routine/")
    g.bind("abrc", "https://wsburroughs.link/anything-but-routine/4.0/classification/")
    g.bind("abri", "https://wsburroughs.link/anything-but-routine/4.0/instance/")
    g.bind("abrw", "https://wsburroughs.link/anything-but-routine/4.0/work/")
    g.bind("bf", "http://id.loc.gov/ontologies/bibframe/")
    g.bind("arm", "https://w3id.org/arm/core/ontology/0.1/")
    return g

In [4]:
headers = { 'Accept': 'text/turtle, application/rdf+xml' }

In [5]:
def parse_response(graph, res):
    (media_type, media_subtype, params) = mimeparse.parse_mime_type(res.headers.get('content-type'))
    graph.parse(data=res.text, format=f'{media_type}/{media_subtype}')

In [6]:
def load_void_resources(graph, uri):
    res = requests.get(uri, headers=headers)
    if res.status_code  == requests.codes.ok:
        parse_response(graph, res)
        dump_file = graph.value(URIRef(uri), VOID.dataDump)
        print(dump_file)
        if dump_file:
            res = requests.get(dump_file, headers=headers)
            if res.status_code  == requests.codes.ok:
                parse_response(graph, res)
        else:
            for o in graph.objects(URIRef(uri), VOID.rootResource):
                find_and_follow(graph, o)

In [7]:
def find_and_follow(graph, uri, level=3):
    if not uri in graph.subjects():
        res = requests.get(uri, headers=headers)
        if res.status_code  == requests.codes.ok:
            parse_response(graph, res)
            if level > 0:
                for o in graph.objects(uri, None):
                    if type(o) == rdflib.term.URIRef:
                        find_and_follow(graph, o, level=level-1)

In [8]:
%%time
graph = initialize_abr_graph()
load_void_resources(graph, 'https://wsburroughs.link/anything-but-routine/')

https://wsburroughs.link/anything-but-routine/dump
CPU times: user 4.1 s, sys: 97.8 ms, total: 4.2 s
Wall time: 17.4 s


In [9]:
n_entities = len([ cl for cl in graph.subjects(RDF.type, bf.Classification) ])
n_entities += len([ w for w in graph.subjects(RDF.type, bf.Work) ])
n_entities += len([ i for i in graph.subjects(RDF.type, bf.Instance) ])

n_entities

324

In [10]:
len([ w for w in graph.subjects(RDF.type, bf.Work) ])

85

In [11]:
len([ i for i in graph.subjects(RDF.type, bf.Instance) ])

229

In [12]:
len([ cl for cl in graph.subjects(RDF.type, bf.Classification) ])

10

In [15]:
agent_names = set()
for s in graph.subjects(RDF.type, bf.Instance):
    for agent in graph.objects(s, bf.contributor):
        name = graph.value(agent, RDFS.label)
        print(agent, name)
        agent_names.add(name)

ub2bL5088C20 William S. Burroughs
ub2bL5372C20 James Grauerholz
ub2bL5376C9 William S. Burroughs
ub2bL1317C20 William S. Burroughs
ub2bL908C20 William S. Burroughs
ub2bL912C9 Carl Solomon
ub2bL6729C9 Allen Ginsberg
ub2bL6725C20 William S. Burroughs
ub2bL951C20 William S. Burroughs
ub2bL990C9 William S. Burroughs
ub2bL986C20 Carl Solomon
ub2bL1818C20 William S. Burroughs
ub2bL6167C20 William S. Burroughs
ub2bL5498C9 S. Clay Wilson
ub2bL5494C20 William S. Burroughs
ub2bL1616C20 William S. Burroughs
ub2bL4715C20 William S. Burroughs
ub2bL3464C20 William S. Burroughs
ub2bL2700C20 William S. Burroughs
ub2bL3238C9 Brion Gysin
ub2bL3234C20 William S. Burroughs
ub2bL592C9 William S. Burroughs
ub2bL584C20 Claude PÃ©lieu
ub2bL600C9 Liam O'Gallagher
ub2bL596C9 Jean-Jacques Lebel
ub2bL588C9 Carl Weissner
ub2bL3882C20 William S. Burroughs
ub2bL4786C20 Oliver Harris
ub2bL4792C9 William S. Burroughs
ub2bL6926C20 William S. Burroughs
ub2bL2130C20 William S. Burroughs
ub2bL1952C20 William S. Burroughs


In [16]:
import spotlight

In [42]:
agent_name_dbpedia_uri_map = {}
for name in agent_names:
    try:
        candidates = spotlight.annotate('http://model.dbpedia-spotlight.org/en/annotate', 
                                           name.toPython(), 
                                           confidence=0.3, 
                                           support=2)
        agent_name_dbpedia_uri_map[name.toPython()] = candidates[0]['URI']
    except spotlight.SpotlightException:
        print(f"No DBPedia URI found for {name}")

No DBPedia URI found for Francisco
No DBPedia URI found for Jean HÃ©lion
No DBPedia URI found for Arthur Richardson
No DBPedia URI found for Carl Apfelschnitt
No DBPedia URI found for Carl Weissner
No DBPedia URI found for Philippe Mikriammos


In [43]:
agent_name_dbpedia_uri_map

{'Alan Ansen': 'http://dbpedia.org/resource/Alan_Ansen',
 'Alex Wermer-Colan': 'http://dbpedia.org/resource/Alex_Russo',
 'Alfred Aranowitz': 'http://dbpedia.org/resource/Alfred_the_Great',
 'Allen Ginsberg': 'http://dbpedia.org/resource/Allen_Ginsberg',
 'Ann Douglas': 'http://dbpedia.org/resource/Ann_Perkins',
 'Barry Miles': 'http://dbpedia.org/resource/Barry_Miles',
 'Bob Gale': 'http://dbpedia.org/resource/Bob_Gale',
 'Bob Kaufman': 'http://dbpedia.org/resource/Bob_Kaufman',
 'Brion Gysin': 'http://dbpedia.org/resource/Brion_Gysin',
 'Burroughs': 'http://dbpedia.org/resource/William_S._Burroughs',
 'Byron James Bignell': 'http://dbpedia.org/resource/Lord_Byron',
 'Carl Solomon': 'http://dbpedia.org/resource/Carl_Solomon',
 'Casa Sin Nombre': 'http://dbpedia.org/resource/Sin_Nombre_(2009_film)',
 'Christine Taylor': 'http://dbpedia.org/resource/Christine_Taylor',
 'Christof KohlhÃ¶fer': 'http://dbpedia.org/resource/Fer',
 'Claude PÃ©lieu': 'http://dbpedia.org/resource/Claude_Brasse

In [48]:
place_names = set()
publisher_names = set()
for s in graph.subjects(RDF.type, bf.Instance):
    for activity in graph.objects(s, bf.provisionActivity):
        name = graph.value(activity, bf.place)
        if name:
            print(place, name)
            place_names.add(name)
        name = graph.value(activity, bf.agent)
        if name:
            print(agent, name)
            publisher_names.add(name)

ub2bL3574C26 London
ub2bL3550C20 Flamingo
ub2bL3574C26 New York
ub2bL3550C20 Viking Penguin
ub2bL3574C26 New York
ub2bL3550C20 Viking Press
ub2bL3574C26 New York
ub2bL3550C20 Ace Books
ub2bL3574C26 San Francisco
ub2bL3550C20 City Lights Books
ub2bL3574C26 London
ub2bL3550C20 New English Library
ub2bL3574C26 London
ub2bL3550C20 David Bruce & Watson
ub2bL3574C26 New York
ub2bL3550C20 Grove Press
ub2bL3574C26 [London]
ub2bL3550C20 Picador
ub2bL3574C26 Berkeley, Calif.
ub2bL3550C20 Small Press Distribution
ub2bL3574C26 Bonn
ub2bL3550C20 Expanded Media Editions
ub2bL3574C26 London
ub2bL3550C20 Pan/Picador
ub2bL3574C26 New York
ub2bL3550C20 Riverrun Press
ub2bL3574C26 London
ub2bL3550C20 John Calder
ub2bL3574C26 London
ub2bL3550C20 Flamingo
ub2bL3574C26 New York
ub2bL3550C20 Seaver Books
ub2bL3574C26 San Francisco
ub2bL3550C20 Beach Books, Texts, & Documents
ub2bL3574C26 New York
ub2bL3550C20 H. Holt
ub2bL3574C26 New York
ub2bL3550C20 Penguin Books
ub2bL3574C26 Brighton, England
ub2bL3550C20

In [49]:
place_names

{rdflib.term.Literal('Berguette, France'),
 rdflib.term.Literal('Berkeley, Calif.'),
 rdflib.term.Literal('Birmingham, England'),
 rdflib.term.Literal('Bonn'),
 rdflib.term.Literal('Brighton, England'),
 rdflib.term.Literal('Caen, France'),
 rdflib.term.Literal('Cambridge'),
 rdflib.term.Literal('Charleston, W.Va.'),
 rdflib.term.Literal('Cherry Valley, N.Y.'),
 rdflib.term.Literal('Columbus, Ohio'),
 rdflib.term.Literal('DÃ¼sseldorf; New York'),
 rdflib.term.Literal('Forest Knolls, Calif.'),
 rdflib.term.Literal('GÃ¶ttingen'),
 rdflib.term.Literal('Ingatestone, Essex'),
 rdflib.term.Literal('Ingatestone, Essex, England'),
 rdflib.term.Literal('London'),
 rdflib.term.Literal('Louisville, Ky.'),
 rdflib.term.Literal('Madras, India; New York'),
 rdflib.term.Literal('New York'),
 rdflib.term.Literal('Northridge, [UK]'),
 rdflib.term.Literal('Ollon, Switzerland'),
 rdflib.term.Literal('Paris'),
 rdflib.term.Literal('Rouen, France'),
 rdflib.term.Literal('San Francisco'),
 rdflib.term.Liter

In [50]:
place_name_dbpedia_uri_map = {}
for name in place_names:
    try:
        candidates = spotlight.annotate('http://model.dbpedia-spotlight.org/en/annotate', 
                                           name.toPython(), 
                                           confidence=0.3, 
                                           support=2)
        place_name_dbpedia_uri_map[name.toPython()] = candidates[0]['URI']
    except spotlight.SpotlightException:
        print(f"No DBPedia URI found for {name}")

No DBPedia URI found for [n.p.]
No DBPedia URI found for GÃ¶ttingen


In [51]:
place_name_dbpedia_uri_map

{'Berguette, France': 'http://dbpedia.org/resource/France',
 'Berkeley, Calif.': 'http://dbpedia.org/resource/Berkeley,_California',
 'Birmingham, England': 'http://dbpedia.org/resource/Birmingham',
 'Bonn': 'http://dbpedia.org/resource/Bonn',
 'Brighton, England': 'http://dbpedia.org/resource/Brighton',
 'Caen, France': 'http://dbpedia.org/resource/Caen',
 'Cambridge': 'http://dbpedia.org/resource/Cambridge',
 'Charleston, W.Va.': 'http://dbpedia.org/resource/Charleston,_South_Carolina',
 'Cherry Valley, N.Y.': 'http://dbpedia.org/resource/Cherry_Valley_massacre',
 'Columbus, Ohio': 'http://dbpedia.org/resource/Ohio',
 'DÃ¼sseldorf; New York': 'http://dbpedia.org/resource/New_York',
 'Forest Knolls, Calif.': 'http://dbpedia.org/resource/List_of_neighborhoods_in_San_Francisco',
 'Ingatestone, Essex': 'http://dbpedia.org/resource/Ingatestone',
 'Ingatestone, Essex, England': 'http://dbpedia.org/resource/Ingatestone',
 'London': 'http://dbpedia.org/resource/London',
 'Louisville, Ky.': '

In [52]:
publisher_name_dbpedia_uri_map = {}
for name in publisher_names:
    try:
        candidates = spotlight.annotate('http://model.dbpedia-spotlight.org/en/annotate', 
                                           name.toPython(), 
                                           confidence=0.3, 
                                           support=2)
        publisher_name_dbpedia_uri_map[name.toPython()] = candidates[0]['URI']
    except spotlight.SpotlightException:
        print(f"No DBPedia URI found for {name}")

No DBPedia URI found for Odd-Job Press
No DBPedia URI found for [n.p.]
No DBPedia URI found for Penguin
No DBPedia URI found for Hand-Job Press
No DBPedia URI found for OU
No DBPedia URI found for Penguin-UK
No DBPedia URI found for OU-Editions
No DBPedia URI found for Flamingo
No DBPedia URI found for Bookslinger
No DBPedia URI found for Cahiers de Nuit


In [53]:
publisher_name_dbpedia_uri_map

{'Ace Books': 'http://dbpedia.org/resource/Ace_Books',
 'Agathon': 'http://dbpedia.org/resource/Agathon',
 'Aloes Books': 'http://dbpedia.org/resource/Agarwood',
 'Aloes seolA [i.e., Aloes Books]': 'http://dbpedia.org/resource/Agarwood',
 'Am Here Books': 'http://dbpedia.org/resource/United_States',
 'American Theatre for Poets': 'http://dbpedia.org/resource/Theatre',
 'Arcade': 'http://dbpedia.org/resource/Arcade_game',
 'Auerhahn Press': 'http://dbpedia.org/resource/Auerhahn_Press',
 'Ballantine Books': 'http://dbpedia.org/resource/Ballantine_Books',
 'Beach Books, Texts, & Documents': 'http://dbpedia.org/resource/Beach',
 'Blackmoor Head Press': 'http://dbpedia.org/resource/Flanaess',
 'Blue Wind Press': 'http://dbpedia.org/resource/Blue_Wind',
 'Bradford Morrow': 'http://dbpedia.org/resource/Bradford_Morrow',
 'Cadmus Editions': 'http://dbpedia.org/resource/Cadmus',
 'Calder': 'http://dbpedia.org/resource/Alexander_Calder',
 'Calder & Boyars': 'http://dbpedia.org/resource/Alexander