## Load packages, initialize sentence tokenization resource

In [1]:
import numpy as np, pandas as pd, rdflib, re, nltk, collections
from rdflib.namespace import RDF, RDFS
from rdflib import URIRef, BNode, Literal
from slugify import slugify
# Fetch the NLTK data packages needed by the sentence/word tokenizing,
# part-of-speech tagging and named-entity chunking calls made later in this
# notebook. Packages that are already present are left as they are.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger',
                      'maxent_ne_chunker', 'words'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## Load CSV file

In [2]:
# Source table for the whole notebook; the cells below read its `description`
# and `work` columns. The path is relative to the current working directory.
source_csv = 'csv/C.csv'
df = pd.read_csv(source_csv)

## Extract entity names and features

### Normalize all whitespace to spaces in notes

In [3]:
# Turn every whitespace character (tab, newline, ...) into a plain space so the
# `^...$` / `.`-based patterns in the following cells see one line per entry.
# `regex=True` must be explicit: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, which would leave the text as is.
# The raw string keeps "\s" from being read as a (invalid) Python string escape.
df['description'] = df['description'].str.replace(r'\s', ' ', regex=True)

### df:title

In [4]:
# Title = the description from its start up to and including the last ")" that
# is directly followed by "." (the match is greedy). Rows without such a ")."
# get NaN. Raw string: "\)" and "\." are regex escapes, not Python ones.
df['title'] = df['description'].str.extract(r'^(.*\))\.', expand=False)

### df:date

In [5]:
# Date = the first four-digit run in the title that starts with 19 or 20.
# The alternation is spelled out because a character class such as [12][90]
# would also accept 10xx and 29xx. Titles without a match get NaN.
# NOTE(review): there is no word boundary, so a year-like run inside a longer
# number would match as well - confirm this cannot occur in the data.
df['date'] = df['title'].str.extract(r'((?:19|20)\d\d)', expand=False)

### df:identifiedBy

In [6]:
# Identifier written in the description as "{M&M <id>}". `[^}]*` stops at the
# first closing brace, so a later "{...}" group in the same description cannot
# be swallowed into the identifier. Rows without the marker get NaN.
df['mnmid'] = df['description'].str.extract(r'\{M&M ([^}]*)\}', expand=False)

In [7]:
# `abrid` is a straight copy of the `work` column; it is later used both to
# name the generated resources and as the "Schottlaender v4.0" identifier value.
df = df.assign(abrid=df['work'])

### df:note

In [8]:
# Note = whatever follows the last "). " in the description (the leading `.*`
# is greedy), i.e. the text after the part captured as the title. Rows without
# a "). " get NaN. Raw string: "\)" and "\." are regex escapes, not Python ones.
df['note'] = df['description'].str.extract(r'^.*\)\. (.*)$', expand=False)

## Generate BIBFRAME Turtle files

### Initialize namespaces

In [9]:
# Namespaces used to build URIs in the graphs below: three under the shared
# "anything-but-routine" base (classification, instance, work) plus the
# BIBFRAME and ARM vocabularies.
ABR_BASE = "https://w3id.org/anything-but-routine/4.0/"
abrc = rdflib.Namespace(ABR_BASE + "classification/")
abri = rdflib.Namespace(ABR_BASE + "instance/")
abrw = rdflib.Namespace(ABR_BASE + "work/")
bf = rdflib.Namespace("http://id.loc.gov/ontologies/bibframe/")
arm = rdflib.Namespace("https://w3id.org/arm/core/ontology/0.1/")

### Define function to initialize graphs

In [10]:
def initialize_abr_graph():
    """Return a new, empty rdflib graph with this notebook's prefixes bound.

    The prefixes ``abrc``, ``abri``, ``abrw``, ``bf`` and ``arm`` are bound so
    that serializations of the graph can use them instead of full URIs.
    """
    prefix_to_uri = {
        "abrc": "https://w3id.org/anything-but-routine/4.0/classification/",
        "abri": "https://w3id.org/anything-but-routine/4.0/instance/",
        "abrw": "https://w3id.org/anything-but-routine/4.0/work/",
        "bf": "http://id.loc.gov/ontologies/bibframe/",
        "arm": "https://w3id.org/arm/core/ontology/0.1/",
    }
    graph = rdflib.Graph()
    for prefix, uri in prefix_to_uri.items():
        graph.bind(prefix, uri)
    return graph

### Generate instances

In [11]:
%%time
# One dict per CSV row. The frame is cast to object first so that missing
# values in every column, whatever its dtype, really become None: NaN is truthy
# and would otherwise slip through the `if` guards below and be written out.
instances = df.astype(object).where(df.notna(), None).to_dict('records')
# Work identifier -> names of the instances generated for that work. The
# "Generate works" cell reads this to emit the bf:hasInstance links.
work_to_instance_map = collections.defaultdict(list)


def _person_names(sentence):
    """Return the names that NLTK's named-entity chunker labels PERSON.

    The sentence is word-tokenized and part-of-speech tagged, then chunked;
    the tokens of each PERSON subtree are joined with single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    return [
        ' '.join(leaf[0] for leaf in subtree)
        for subtree in nltk.ne_chunk(tagged_tokens).subtrees()
        if subtree.label() == 'PERSON'
    ]


for record in instances:
    g = initialize_abr_graph()
    # The instance is named after its work identifier (`abrid` is a copy of
    # the `work` column), and that name is also the output file name.
    name = record['abrid']
    work_to_instance_map[record['abrid']].append(name)
    instance_uri = abri[name]
    g.add((instance_uri, RDF.type, bf.Instance))
    g.add((instance_uri, bf.instanceOf, abrw[record['work']]))
    # bf:classification
    g.add((instance_uri, bf.classification, abrc['C']))
    # bf:contributor - every instance is attributed to the same author
    wsb = BNode()
    g.add((instance_uri, bf.contributor, wsb))
    g.add((wsb, RDF.type, bf.Agent))
    g.add((wsb, RDF.type, bf.Person))
    g.add((wsb, bf.role, Literal("author")))
    g.add((wsb, RDFS.label, Literal("William S. Burroughs")))
    # rdfs:label and bf:title - only when the title regex matched; a missing
    # title would otherwise be written out as a literal built from None.
    title_text = record['title']
    if title_text:
        g.add((instance_uri, RDFS.label, Literal(title_text)))
        title_node = BNode()
        g.add((instance_uri, bf.title, title_node))
        g.add((title_node, RDF.type, bf.Title))
        g.add((title_node, RDFS.label, Literal(title_text)))
    # bf:identifiedBy
    schottlaender_id = BNode()
    g.add((instance_uri, bf.identifiedBy, schottlaender_id))
    g.add((schottlaender_id, RDF.type, bf.Identifier))
    g.add((schottlaender_id, bf.source, Literal("Schottlaender v4.0")))
    g.add((schottlaender_id, RDF.value, Literal(record['abrid'])))
    m_n_m = record['mnmid']
    if m_n_m:
        m_n_m_id = BNode()
        g.add((instance_uri, bf.identifiedBy, m_n_m_id))
        g.add((m_n_m_id, RDF.type, bf.Identifier))
        g.add((m_n_m_id, bf.source, Literal("Maynard & Miles")))
        g.add((m_n_m_id, RDF.value, Literal(m_n_m)))
    # bf:note - one note node per sentence of the note text
    note_text = record['note']
    if note_text:
        for sentence in nltk.sent_tokenize(note_text):
            note_node = BNode()
            g.add((instance_uri, bf.note, note_node))
            g.add((note_node, RDF.type, bf.Note))
            g.add((note_node, RDF.value, Literal(sentence)))
            # Contributors extracted from note
            for person_name in _person_names(sentence):
                person = BNode()
                g.add((instance_uri, bf.contributor, person))
                g.add((person, RDF.type, bf.Agent))
                g.add((person, RDF.type, bf.Person))
                g.add((person, bf.role, Literal("contributor")))
                g.add((person, RDFS.label, Literal(person_name)))
    # bf:provisionActivity
    date = record['date']
    if date:
        pa_lit = BNode()
        g.add((instance_uri, bf.provisionActivity, pa_lit))
        g.add((pa_lit, RDF.type, bf.ProvisionActivity))
        g.add((pa_lit, RDF.type, bf.Publication))
        g.add((pa_lit, bf.date, Literal(date)))
    # NOTE(review): rows sharing an `abrid` write to the same file, so the
    # last one wins - confirm identifiers are unique per row.
    g.serialize(f"ttl/instance/{name}.ttl", format='turtle')

CPU times: user 23.6 s, sys: 2.06 s, total: 25.6 s
Wall time: 1min 24s


### Generate works

In [14]:
# One record per distinct (title, work identifier) pair.
# NOTE(review): if one identifier occurs with several titles, each pair writes
# the same file and the last one wins - confirm this cannot happen in the data.
works = df[['title', 'abrid']].drop_duplicates().to_dict('records')

for work in works:
    g = initialize_abr_graph()
    name = work['abrid']
    work_uri = abrw[name]
    g.add((work_uri, RDF.type, bf.Work))
    g.add((work_uri, RDF.type, bf.Text))
    # Rows whose title could not be extracted hold a missing value here;
    # skip the label rather than emit a literal built from NaN.
    if pd.notna(work['title']):
        g.add((work_uri, RDFS.label, Literal(work['title'])))
    g.add((work_uri, bf.classification, abrc['C']))
    # bf:contributor - every work is attributed to the same author
    wsb = BNode()
    g.add((work_uri, bf.contributor, wsb))
    g.add((wsb, RDF.type, bf.Agent))
    g.add((wsb, RDF.type, bf.Person))
    g.add((wsb, bf.role, Literal("author")))
    g.add((wsb, RDFS.label, Literal("William S. Burroughs")))
    # Link the work to the instances recorded for it by the instance cell.
    for inst in work_to_instance_map[name]:
        g.add((work_uri, bf.hasInstance, abri[inst]))
    g.serialize(f"ttl/work/{name}.ttl", format='turtle')