## Load packages

In [243]:
%matplotlib inline 
import rdflib, glob, re, pandas as pd
from rdflib import URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS, DCTERMS

## Define namespaces

In [2]:
# Namespace objects used to build full URIs via attribute access (e.g. bf.Instance).
# abri / abrw: Anything But Routine 4.0 instance and work namespaces.
abri = rdflib.Namespace("https://w3id.org/anything-but-routine/4.0/instance/")
abrw = rdflib.Namespace("https://w3id.org/anything-but-routine/4.0/work/")
# bf: Library of Congress BIBFRAME ontology.
bf = rdflib.Namespace("http://id.loc.gov/ontologies/bibframe/")
# arm: ARM core ontology, version 0.1.
arm = rdflib.Namespace("https://w3id.org/arm/core/ontology/0.1/")

## Parse all edited .ttl files into a graph

In [112]:
def initialize_abr_graph():
    """Create an empty rdflib graph with the ABR, BIBFRAME and ARM prefixes bound.

    Returns:
        rdflib.Graph: a new, empty graph on which the prefixes ``abri``,
        ``abrw``, ``bf`` and ``arm`` are bound to their namespace URIs.
    """
    # (prefix, namespace URI) pairs, bound in this order.
    prefix_table = (
        ("abri", "https://w3id.org/anything-but-routine/4.0/instance/"),
        ("abrw", "https://w3id.org/anything-but-routine/4.0/work/"),
        ("bf", "http://id.loc.gov/ontologies/bibframe/"),
        ("arm", "https://w3id.org/arm/core/ontology/0.1/"),
    )
    graph = rdflib.Graph()
    for prefix, namespace_uri in prefix_table:
        graph.bind(prefix, namespace_uri)
    return graph

In [234]:
# Start from an empty, prefix-bound graph and merge in every .ttl file found
# exactly one directory level below edited-ttl/.
g = initialize_abr_graph()
ttl_paths = glob.glob("edited-ttl/*/*.ttl")
for infile in ttl_paths:
    g.parse(infile, format='n3')

# Report the size of the merged graph as a quick sanity check.
n = len(g)
print(f"ABR graph has {n} triples.")

ABR graph has 8691 triples.


## Build dataframe with instance binding descriptions from notes

In [235]:
# Collect one [instance URI, local id, binding description] row per bf:Instance.
bindings = []

# Raw string so that ``\w`` reaches the regex engine untouched (a plain string
# literal treats it as an invalid escape sequence). Compiled once, outside the loop.
binding_pattern = re.compile(r'^\w+bound|^Pamphlet|^Broadside|^Folio')

for instance in g.subjects(RDF.type, bf.Instance):
    # Placeholder kept when no note on this instance matches the pattern.
    binding_desc = 'None.'
    inst_uri = instance.toPython()
    # Local id = everything after the last '/' (the whole URI if there is none).
    inst_id = inst_uri.rsplit('/', 1)[-1]
    for note in g.objects(instance, bf.note):
        value = g.value(note, RDF.value)
        if value is None:
            # Note node without an rdf:value: nothing to inspect.
            continue
        text = value.toPython()
        # Typed literals may convert to non-string Python objects; only text can match.
        # If several notes match, the last one iterated wins.
        if isinstance(text, str) and binding_pattern.search(text):
            binding_desc = text
    bindings.append([inst_uri, inst_id, binding_desc])

df = pd.DataFrame(bindings, columns=['instance', 'id', 'binding'])

## Clean out noise binding values and show value counts

In [236]:
# Rows to discard:
#  * descriptions matching the regex 'Hardbound.+copies' (missing values are
#    treated as matches, so they are dropped as well);
#  * the exact placeholder 'None.' given to instances with no binding note.
#    Compared by equality rather than str.contains('None.'), whose '.' is a
#    regex wildcard and would also drop any real description containing
#    "None" followed by another character.
is_copies_note = df['binding'].str.contains('Hardbound.+copies', na=True)
is_placeholder = df['binding'] == 'None.'
df = df[~is_copies_note & ~is_placeholder]

# Frequency table of the remaining binding descriptions.
df['binding'].value_counts().to_frame()

Unnamed: 0,binding
Softbound.,55
Softbound (no hardbound issued).,55
"Hardbound in dustjacket, and softbound.",18
Hardbound in dustjacket.,17
Staplebound (no hardbound issued).,12
Hardbound in dustjacket (no softbound issued).,7
Broadside.,7
"Hardbound, issued without dustjacket.",5
Staplebound.,5
Softbound in dustjacket (no hardbound issued).,4


## Prepare list of record dicts to iterate over

# One dict per remaining row, keyed by column name; null cells are replaced by None.
not_null_mask = df.notnull()
instances = df.where(not_null_mask, None).to_dict('records')

## Add arm:Binding triples

In [266]:
# For each instance with a binding description, load its own Turtle file,
# attach an arm:Binding blank node (via dcterms:hasPart) that carries the
# description as a bf:Note / arm:DescriptiveNote, and write the file back.
for inst in instances:
    name = inst['id']
    file = f"edited-ttl/instance/{name}.ttl"
    h = initialize_abr_graph()
    h.parse(file, format='n3')
    uri = URIRef(inst['instance'])
    description = Literal(inst['binding'])

    # Blank nodes get fresh identifiers on every run, so re-executing this cell
    # would otherwise append a duplicate binding to the file each time. Skip
    # instances that already have an arm:Binding part with this description.
    already_described = any(
        (part, RDF.type, arm.Binding) in h
        and any(
            h.value(existing_note, RDF.value) == description
            for existing_note in h.objects(part, bf.note)
        )
        for part in h.objects(uri, DCTERMS.hasPart)
    )
    if already_described:
        continue

    binding = BNode()
    h.add((uri, DCTERMS.hasPart, binding))
    h.add((binding, RDF.type, arm.Binding))
    note = BNode()
    h.add((binding, bf.note, note))
    h.add((note, RDF.type, bf.Note))
    h.add((note, RDF.type, arm.DescriptiveNote))
    h.add((note, RDF.value, description))
    h.serialize(file, format='turtle')

## Remove old note triples

In [272]:
# For each processed instance, delete the note attached directly to the
# instance whose text equals the binding description, then rewrite the file.
# Notes hanging off other nodes (e.g. a binding node) are not touched, because
# only objects of <instance> bf:note are inspected.
for inst in instances:
    name = inst['id']
    file = f"edited-ttl/instance/{name}.ttl"
    h = initialize_abr_graph()
    h.parse(file, format='n3')
    uri = URIRef(inst['instance'])
    # Snapshot the notes first: removing triples while the objects() generator
    # is still live would mutate the store that is being iterated.
    for note in list(h.objects(uri, bf.note)):
        text = h.value(note, RDF.value)
        # A note node without an rdf:value cannot be the binding note; skip it
        # instead of failing on None.toPython().
        if text is not None and text.toPython() == inst['binding']:
            h.remove((uri, bf.note, note))
            # Drop every triple that has the note node as its subject.
            h.remove((note, None, None))
    h.serialize(file, format='turtle')