## Load packages

In [1]:
import os
import re
import warnings

import numpy as np
import pandas as pd
import rdflib
from rdflib import URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS

## Initialize namespaces

In [2]:
# Project namespaces, all under the "anything-but-routine/4.0/" base URI:
# classification scheme, instances and works respectively.
abrc, abri, abrw = (
    rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/classification/"),
    rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/instance/"),
    rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/work/"),
)
# External vocabularies: BIBFRAME (id.loc.gov) and the ARM core ontology 0.1.
bf, arm = (
    rdflib.Namespace("http://id.loc.gov/ontologies/bibframe/"),
    rdflib.Namespace("https://w3id.org/arm/core/ontology/0.1/"),
)

## Define function to initialize graphs

In [3]:
def initialize_abr_graph():
    """Create an empty rdflib graph with the notebook's prefixes bound.

    The prefixes are bound to the module-level namespace objects (``abrc``,
    ``abri``, ``abrw``, ``bf``, ``arm``) instead of repeating their URIs here,
    so the prefix declarations in serialized output can never drift away from
    the namespaces actually used to build the triples.

    Returns:
        rdflib.Graph: a new graph containing no triples, with the five
        prefixes bound in the order abrc, abri, abrw, bf, arm.
    """
    g = rdflib.Graph()
    prefix_bindings = (
        ("abrc", abrc),
        ("abri", abri),
        ("abrw", abrw),
        ("bf", bf),
        ("arm", arm),
    )
    for prefix, namespace in prefix_bindings:
        g.bind(prefix, namespace)
    return g

## Define function to extract instance text into a list of record dictionaries

In [4]:
def process_text(filename, encoding=None):
    """Read a plain-text listing and split it into one record per entry.

    An entry is introduced by an identifier of the form ``C<digits>.``.
    Identifiers are collected from the raw text (only where the identifier
    starts a line that is preceded by a newline), while the entry bodies are
    obtained by splitting a cleaned copy of the text on the same identifier
    shape. The two lists are then paired positionally.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default, which is what this function has
            always used; pass e.g. ``"utf-8"`` for a platform-independent
            read.

    Returns:
        list[dict]: one ``{'id': ..., 'text': ...}`` dictionary per entry,
        in file order. ``text`` has runs of newlines and of spaces collapsed
        to a single character and U+F02D replaced by ``-``.

    Warns:
        UserWarning: if the number of identifiers and the number of entry
        bodies differ, because positional pairing may then attach text to
        the wrong identifier (surplus items are dropped, as before).
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r', encoding=encoding) as source:
        text = source.read()

    ids = [match.group(1) for match in re.finditer(r'\n(C\d+)\..*\n', text)]

    # Drop four digits separated by single whitespace characters
    # (presumably letter-spaced artefacts of the PDF extraction - TODO confirm).
    text2 = re.sub(r'\d\s\d\s\d\s\d', '', text)
    # Drop a 2-3 digit number that is followed by a blank line
    # (presumably page numbers - TODO confirm).
    text3 = re.sub(r'\s(\d{2,3})\n\n', '', text2)

    # Element 0 is whatever precedes the first identifier, so it is discarded.
    descriptions = re.split(r'\s*C\d+\.\s+', text3)[1:]

    if len(ids) != len(descriptions):
        warnings.warn(
            f"process_text: found {len(ids)} entry ids but "
            f"{len(descriptions)} entry texts in {filename!r}; "
            "records may be misaligned",
            stacklevel=2,
        )

    records = []
    for record_id, desc in zip(ids, descriptions):
        txt = re.sub(r'\n+', '\n', desc)
        txt = re.sub(r' +', ' ', txt)
        txt = txt.replace('\uf02d', '-')
        records.append({'id': record_id, 'text': txt})
    return records

## Define function to generate Turtle files from instance records

In [5]:
def _split_label_and_bullets(text):
    """Split a record's text into its heading label and its bullet lines.

    The label is everything before the first bullet glyph (U+F0D8, U+F0B7 or
    U+F0A7) with newlines flattened to spaces. The remainder is flattened the
    same way and then re-broken so that every space-preceded bullet glyph
    starts its own line. A text without any bullet glyph is treated as
    label-only instead of raising.
    """
    first_bullet = re.search('[\uf0d8\uf0b7\uf0a7]', text)
    break_idx = first_bullet.start() if first_bullet else len(text)
    label = text[:break_idx].replace('\n', ' ').strip()
    desc = text[break_idx:].replace('\n', ' ').strip()
    desc = re.sub(
        ' ([\uf0d8\uf0b7\uf0a7])',
        lambda found: '\n' + found.group(1),
        desc,
    )
    return label, desc.split('\n')


def _parse_label(label):
    """Return ``(title, date)`` from a label shaped like ``Title [date]`` or ``Title (date)``.

    When the label has no bracketed part the whole label is the title and the
    date is the empty string.
    """
    m = re.match(r'(.+) [\[\(](.*)[\]\)]', label)
    if m is None:
        return label, ''
    return m.group(1), m.group(2)


def process_record(i):
    """Build the RDF description of one record and write it as a Turtle file.

    Args:
        i: Record dictionary with keys ``'id'`` (used as the local name in
            the ``abri`` namespace and as the output file name) and
            ``'text'`` (label followed by bullet lines).

    Returns:
        The instance URI (``abri[i['id']]``).

    Side effects:
        Writes ``ttl/instance/<id>.ttl`` relative to the working directory,
        creating the directory if it does not exist yet.

    Raises:
        KeyError: if ``i`` lacks ``'id'`` or ``'text'``.
    """
    g = initialize_abr_graph()
    name = i['id']
    instance = abri[name]
    label, bullets = _split_label_and_bullets(i['text'])
    title_str, date_str = _parse_label(label)

    g.add((instance, RDF.type, bf.Instance))
    g.add((instance, RDFS.label, Literal(title_str)))
    # bf:classification
    g.add((instance, bf.classification, abrc['C']))
    # bf:contributor
    wsb = BNode()
    g.add((instance, bf.contributor, wsb))
    g.add((wsb, RDF.type, bf.Agent))
    g.add((wsb, RDF.type, bf.Person))
    g.add((wsb, bf.role, Literal("author")))
    g.add((wsb, RDFS.label, Literal("William S. Burroughs")))
    # bf:title
    title = BNode()
    g.add((instance, bf.title, title))
    g.add((title, RDF.type, bf.Title))
    g.add((title, RDFS.label, Literal(title_str)))
    # bf:identifiedBy
    schottlaender_id = BNode()
    g.add((instance, bf.identifiedBy, schottlaender_id))
    g.add((schottlaender_id, RDF.type, bf.Identifier))
    g.add((schottlaender_id, bf.source, Literal("Schottlaender v4.0")))
    g.add((schottlaender_id, RDF.value, Literal(name)))
    # Optional second identifier written in the label as "{M&M <value>}".
    m_n_m = re.match(r'.*\{M&M (.+)\}', label)
    if m_n_m:
        m_n_m_id = BNode()
        g.add((instance, bf.identifiedBy, m_n_m_id))
        g.add((m_n_m_id, RDF.type, bf.Identifier))
        g.add((m_n_m_id, bf.source, Literal("Maynard & Miles")))
        g.add((m_n_m_id, RDF.value, Literal(m_n_m.group(1))))
    # bf:provisionActivity
    if date_str:
        pa_lit = BNode()
        g.add((instance, bf.provisionActivity, pa_lit))
        g.add((pa_lit, RDF.type, bf.ProvisionActivity))
        g.add((pa_lit, RDF.type, bf.Publication))
        g.add((pa_lit, bf.date, Literal(date_str)))
    # bf:note / bf:hasPart
    # U+F0D8 bullets are notes: they attach to the instance until the first
    # part bullet has been seen, and to the most recent part afterwards.
    current_part = None
    for bullet in bullets:
        if bullet.startswith('\uf0d8'):
            note = BNode()
            g.add((note, RDF.type, bf.Note))
            g.add((note, RDF.value, Literal(bullet[1:].strip())))
            g.add((instance if current_part is None else current_part, bf.note, note))
        elif bullet.startswith(('\uf0b7', '\uf0a7')):
            # U+F0A7 is handled exactly like U+F0B7 (the earlier code rewrote
            # it to U+F0B7, but missed it when it was the very first bullet).
            current_part = BNode()
            g.add((current_part, RDF.type, bf.Text))
            g.add((current_part, RDFS.label, Literal(bullet[1:].strip())))
            g.add((instance, bf.hasPart, current_part))
        # Any other line carries no bullet glyph and is ignored.

    out_path = f"ttl/instance/{name}.ttl"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    g.serialize(out_path, format='turtle')
    return instance

## Process text file into Turtle files

In [6]:
# Source text for this section; also read again by the diagnostic cell below.
filename = 'pdf/C.txt'
# Parse once, then emit one Turtle file per record.
records = process_text(filename)
for record in records:
    s = process_record(record)

In [8]:
# Diagnostic: print the id of every record in which two bullet glyphs are
# separated only by whitespace, i.e. a bullet with no text of its own.
# The pattern is a raw string so that `\s` reaches the regex engine without
# being an invalid Python string escape; `re` itself resolves the `\uXXXX` escapes.
for record in process_text(filename):
    m = re.findall(r'[\uf0d8\uf0b7\uf0a7]\s+[\uf0d8\uf0b7\uf0a7]', record['text'])
    if m:
        print(record['id'])

C56
C57
C65
C97
C109
C111
C118
C128
C130
C162
C291
C325
C348
C451
C456
C458
C505
C507
C508
C515
C521
C524
C525
C528
C532
C536
C542
C545
C548
C553
C572
C576
C578
C579
C584
C602
C610
C612
C641
C662
