## Load packages

In [110]:
%matplotlib inline 
import rdflib, glob, re, nltk, pandas as pd
from rdflib import URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS, DCTERMS
# Fetch the NLTK data packages used further down in this notebook. The logged
# output shows each one being reported as already up-to-date when present.
# NOTE(review): the mapping of package to NLTK function below is the usual one
# but is not shown in this file -- confirm against the installed NLTK version.
nltk.download('punkt')  # presumably backs nltk.sent_tokenize / nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # presumably backs nltk.pos_tag
nltk.download('maxent_ne_chunker')  # presumably backs nltk.ne_chunk
nltk.download('words')  # presumably required by the NE chunker as well

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## Define namespaces

In [111]:
# Namespace objects used to build terms by attribute access (e.g. bf.Instance).
# abri / abrw: base URIs ending in /instance/ and /work/ for this dataset (version 4.0 per the URI).
abri = rdflib.Namespace("http://bradleypallen.org/anything-but-routine-ld/4.0/instance/")
abrw = rdflib.Namespace("http://bradleypallen.org/anything-but-routine-ld/4.0/work/")
# bf: the BIBFRAME ontology namespace hosted at id.loc.gov.
bf = rdflib.Namespace("http://id.loc.gov/ontologies/bibframe/")
# arm: the ARM core ontology namespace, version 0.1 per the URI.
arm = rdflib.Namespace("https://w3id.org/arm/core/ontology/0.1/")

## Parse all edited .ttl files into a graph

In [112]:
def initialize_abr_graph():
    """Return a new, empty rdflib graph with the abri, abrw, bf and arm prefixes bound."""
    prefix_bindings = (
        ("abri", "http://bradleypallen.org/anything-but-routine-ld/4.0/instance/"),
        ("abrw", "http://bradleypallen.org/anything-but-routine-ld/4.0/work/"),
        ("bf", "http://id.loc.gov/ontologies/bibframe/"),
        ("arm", "https://w3id.org/arm/core/ontology/0.1/"),
    )
    graph = rdflib.Graph()
    for prefix, namespace_uri in prefix_bindings:
        graph.bind(prefix, namespace_uri)
    return graph

In [113]:
# Start from an empty graph with the ABR prefixes bound, then merge in every
# .ttl file found one directory level below edited-ttl/.
g = initialize_abr_graph()
ttl_paths = glob.glob("edited-ttl/*/*.ttl")
for ttl_path in ttl_paths:
    g.parse(ttl_path, format='n3')
n = len(g)
print(f"ABR graph has {n} triples.")

ABR graph has 9339 triples.


## Build dataframe with instance promotional materials from notes

In [149]:
# A promotional-material note starts with item number 1, 1a or 1b, a literal
# period, a space and an opening bracket (e.g. "1a. [Prospectus] ...").
# The raw string avoids invalid escape sequences, and the escaped period keeps
# this filter in step with the type-extraction pattern used later, which also
# requires a literal ". [" after the item number.
PROMO_NOTE_PTRN = re.compile(r'^1[ab]?\. \[')

materials = []

for instance in g.subjects(RDF.type, bf.Instance):
    inst_uri = instance.toPython()
    inst_label = g.value(instance, RDFS.label)  # None when the instance has no rdfs:label
    # Last path segment of the URI (the whole URI if it contains no '/').
    inst_id = inst_uri[inst_uri.rfind('/')+1:]
    for note in g.objects(instance, bf.note):
        note_value = g.value(note, RDF.value)
        if note_value is None:
            # A note without rdf:value has no text to inspect; skip it instead
            # of failing on None.toPython().
            continue
        text = note_value.toPython()
        # toPython() may yield a non-string for typed literals; only text can match.
        if isinstance(text, str) and PROMO_NOTE_PTRN.search(text):
            materials.append([inst_uri, inst_label, inst_id, text])

# One row per promotional-material note, keyed by the instance it was found on.
df = pd.DataFrame(materials, columns=['instanceuri', 'instancelabel', 'instanceid', 'description'])

## Extract instance attributes and relations

### URI

In [150]:
# URI of the promotional material: the parent instance URI with the note's
# leading item number (one digit plus optional a/b) appended.
# Raw string: '\d' in a plain literal is an invalid escape sequence.
df['pmuri'] = df['instanceuri'] + df['description'].str.extract(r'^(\d[ab]?).', expand=False)

### bf:identifiedBy

In [151]:
# Identifier of the promotional material: the parent instance id with the
# note's leading item number (one digit plus optional a/b) appended.
# Raw strings: the plain literals contained invalid escape sequences.
df['pmid'] = df['instanceid'] + df['description'].str.extract(r'^(\d[ab]?).', expand=False)
# Text following "M&M " inside a "{M&M ...}" marker, NaN when the marker is absent.
df['MaynardAndMiles'] = df['description'].str.extract(r'\{M\&M (\S*)\}', expand=False)

### bf:provisionActivity

In [152]:
# Publication statement of the form "<place>: <agent>, <year>", where each part
# may be wrapped in square brackets and the four-digit year may carry a "?".
# Raw strings throughout: the plain literals contained invalid escape sequences.
publication_ptrn = r' (\[?[\s\w\-\.\, ]+\]?: \[?[\s\w\-\.\, ]+\]?\, \[?\d\d\d\d\??\]?)'
df['publication'] = df['description'].str.extract(publication_ptrn, expand=False)
# Split the statement into its parts; all three stay NaN when no statement matched.
df['place'] = df['publication'].str.extract(r'(.*):', expand=False)
# Greedy match, so the agent runs up to the last comma of the statement.
df['agent'] = df['publication'].str.extract(r': (.*),', expand=False)
df['date'] = df['publication'].str.extract(r', (\[?\d\d\d\d\??\]?)', expand=False)

### Type of promotional material

In [153]:
# Bracketed text right after the item number, e.g. "1a. [Some Type]" -> "Some Type".
# Only word characters and spaces are accepted inside the brackets; anything
# else leaves the value NaN.
# Raw string: the plain literal contained invalid escape sequences.
df['type'] = df['description'].str.extract(r'^\d[ab]?\. \[([ \w]+)\]', expand=False)

### dcterms:hasPart arm:Binding

In [154]:
# Flag descriptions that mention "Pamphlet": the column holds the matched word
# itself, or NaN when the word does not occur.
binding_matches = df['description'].str.extract('(Pamphlet)', expand=False)
df['binding'] = binding_matches

## Prepare list of record dicts to iterate over

In [156]:
# Convert the frame into one dict per row, with missing values replaced by
# None so later cells can test them with plain `is None` / truthiness checks.
not_missing = df.notna()
instances = df.where(not_missing, None).to_dict(orient='records')

## Add bf:Instances for promotional materials

In [170]:
# Directory that receives one Turtle file per promotional material.
# The original cell kept "edited-ttl/instance" as a commented-out alternative;
# point this there to write alongside the edited instance files instead.
PMAT_OUTPUT_DIR = "pmat-test"


def _add_typed_bnode(graph, subject, predicate, rdf_types):
    """Link a new blank node to `subject` through `predicate` and return it.

    The node receives one rdf:type triple per entry in `rdf_types`.
    """
    node = BNode()
    graph.add((subject, predicate, node))
    for rdf_type in rdf_types:
        graph.add((node, RDF.type, rdf_type))
    return node


def _add_note(graph, subject, text, extra_types=()):
    """Attach a bf:Note (plus any `extra_types`) whose rdf:value is `text`."""
    note = _add_typed_bnode(graph, subject, bf.note, (bf.Note, *extra_types))
    graph.add((note, RDF.value, Literal(text)))
    return note


def _add_person(graph, subject, name, role):
    """Attach a bf:contributor typed bf:Agent and bf:Person, with the given label and role."""
    person = _add_typed_bnode(graph, subject, bf.contributor, (bf.Agent, bf.Person))
    graph.add((person, bf.role, Literal(role)))
    graph.add((person, RDFS.label, Literal(name)))
    return person


def _add_identifier(graph, subject, source, value):
    """Attach a bf:Identifier with the given bf:source and rdf:value."""
    identifier = _add_typed_bnode(graph, subject, bf.identifiedBy, (bf.Identifier,))
    graph.add((identifier, bf.source, Literal(source)))
    graph.add((identifier, RDF.value, Literal(value)))
    return identifier


def _person_names(text):
    """Return the names that nltk.ne_chunk labels PERSON in `text`, in tree order."""
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    return [
        ' '.join(leaf[0] for leaf in subtree)
        for subtree in nltk.ne_chunk(tagged).subtrees()
        if subtree.label() == 'PERSON'
    ]


for inst in instances:
    outfile = f"{PMAT_OUTPUT_DIR}/{inst['pmid']}.ttl"
    h = initialize_abr_graph()
    uri = URIRef(inst['pmuri'])
    h.add((uri, RDF.type, bf.Instance))

    # rdfs:label -- "<type> for <instance label>". Either part can be None
    # (no bracketed type matched / instance without rdfs:label); fall back
    # to a generic label instead of failing on None + str.
    kind = inst['type'] or 'Promotional material'
    label = f"{kind} for {inst['instancelabel']}" if inst['instancelabel'] else kind
    h.add((uri, RDFS.label, Literal(label)))

    # bf:note -- one note per sentence of the original description
    for sentence in nltk.sent_tokenize(inst['description']):
        _add_note(h, uri, sentence)

    # bf:contributor -- Burroughs as author, plus every PERSON entity that
    # NLTK finds in the description as a generic contributor
    _add_person(h, uri, "William S. Burroughs", "author")
    for person_name in _person_names(inst['description']):
        _add_person(h, uri, person_name, "contributor")

    # bf:identifiedBy
    _add_identifier(h, uri, "Schottlaender v4.0", inst['pmid'])
    if inst['MaynardAndMiles'] is not None:
        _add_identifier(h, uri, "Maynard & Miles", inst['MaynardAndMiles'])

    # bf:provisionActivity -- only emit what was actually extracted, so a
    # missing value is never turned into a literal built from None
    provision = {
        bf.agent: inst['agent'],
        bf.date: inst['date'],
        bf.place: inst['place'],
    }
    known = {predicate: value for predicate, value in provision.items() if value is not None}
    if known:
        activity = _add_typed_bnode(
            h, uri, bf.provisionActivity, (bf.ProvisionActivity, bf.Publication)
        )
        for predicate, value in known.items():
            h.add((activity, predicate, Literal(value)))

    # bf:relatedTo -- link back to the instance the note was taken from
    h.add((uri, bf.relatedTo, URIRef(inst['instanceuri'])))

    # dcterms:hasPart -- binding with a descriptive note such as "Pamphlet."
    if inst['binding']:
        binding = _add_typed_bnode(h, uri, DCTERMS.hasPart, (arm.Binding,))
        _add_note(h, binding, f"{inst['binding']}.", extra_types=(arm.DescriptiveNote,))

    # Written once per material (the original serialized twice when a binding existed).
    h.serialize(outfile, format='turtle')

## Remove old note triples and link instances to their promotional materials

In [171]:
# For every promotional material, rewrite the parent instance's file in place:
# add a bf:relatedTo link to the new promotional-material URI and drop the note
# whose text was turned into that material.
# NOTE: this overwrites files under edited-ttl/instance/. Once it has run, the
# notes it removed are gone from those files.
for inst in instances:
    name = inst['instanceid']
    file = f"edited-ttl/instance/{name}.ttl"
    h = initialize_abr_graph()
    h.parse(file, format='n3')
    uri = URIRef(inst['instanceuri'])
    h.add((uri, bf.relatedTo, URIRef(inst['pmuri'])))
    # Snapshot the notes first: triples are removed inside the loop, and the
    # graph must not change underneath a live iterator.
    for note in list(h.objects(uri, bf.note)):
        text = h.value(note, RDF.value)
        # Notes without rdf:value cannot match a description; skip them
        # instead of failing on None.toPython().
        if text is not None and text.toPython() == inst['description']:
            h.remove((uri, bf.note, note))
            h.remove((note, None, None))
    h.serialize(file, format='turtle')