## Load packages

In [1]:
import numpy as np, pandas as pd, rdflib, re
from rdflib.namespace import RDF, RDFS
from rdflib import URIRef, BNode, Literal

## Initialize namespaces

In [2]:
# Project namespaces: classification, instance and work URIs all live under the
# same "anything-but-routine/4.0/" base.
ABRC = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/classification/")
ABRI = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/instance/")
ABRW = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/work/")
# BIBFRAME ontology vocabulary hosted at id.loc.gov.
BF = rdflib.Namespace("http://id.loc.gov/ontologies/bibframe/")
# ARM core ontology, version 0.1.
ARM = rdflib.Namespace("https://w3id.org/arm/core/ontology/0.1/")

## Define function to initialize graphs

In [3]:
def initialize_abr_graph():
    """Create an empty rdflib graph with the notebook's prefixes bound.

    The prefixes are bound to the module-level ``Namespace`` objects rather
    than to re-typed URI strings, so the prefix declarations in serialized
    output cannot drift away from the namespaces used to mint terms.

    Returns:
        rdflib.Graph: a new graph with ``abrc``, ``abri``, ``abrw``, ``bf``
        and ``arm`` bound.
    """
    g = rdflib.Graph()
    prefix_map = (
        ("abrc", ABRC),
        ("abri", ABRI),
        ("abrw", ABRW),
        ("bf", BF),
        ("arm", ARM),
    )
    for prefix, namespace in prefix_map:
        g.bind(prefix, namespace)
    return g

## Define functions to extract instance text into record dictionaries

In [4]:
def process_text(filename):
    """Read a catalogue text file and parse it into a list of record dicts.

    Args:
        filename: path of the text file to read.

    Returns:
        list[dict]: one dict per record, as produced by
        ``process_text_record``, in file order.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as source:
        text = source.read()

    # Record ids: "C<digits>." at the start of a line in the raw text.
    ids = [match.group(1) for match in re.finditer(r'\n(C\d+)\..*\n', text)]

    # Strip four digits separated by single whitespace characters, then a
    # 2-3 digit number followed by a blank line.
    # NOTE(review): presumably page furniture left by the PDF extraction - confirm.
    text2 = re.sub(r'\d\s\d\s\d\s\d', '', text)
    text3 = re.sub(r'\s(\d{2,3})\n\n', '', text2)

    # Everything before the first "C<n>." marker is discarded.
    descriptions = re.split(r'\s*C\d+\.\s+', text3)[1:]

    # NOTE(review): ids come from the raw text (line-anchored) while the
    # descriptions come from the cleaned text (marker matched anywhere), and
    # zip() silently truncates to the shorter list. Nothing here verifies that
    # the two sequences stay aligned.
    return [
        process_text_record(record_id, desc)
        for record_id, desc in zip(ids, descriptions)
    ]

def process_text_record(id, desc):
    """Split one record's raw description into label, title, date, notes and parts.

    The text before the first bullet glyph is the label; a trailing
    bracketed or parenthesised group in the label is taken as the date and
    what precedes it as the title. After the label, ``\\uf0d8`` bullets are
    notes and ``\\uf0b7`` / ``\\uf0a7`` bullets start a new part. Notes seen
    before any part belong to the record; later notes belong to the most
    recent part.

    Args:
        id: record identifier, stored unchanged under ``'id'``.
        desc: raw description text for the record.

    Returns:
        dict with keys ``id``, ``text`` (flattened description), ``desc``
        (bullet section, one bullet per line), ``raw_desc``, ``title``,
        ``label``, ``date``, ``notes`` (list of str) and ``parts`` (list of
        ``{'title': str, 'notes': [str]}``). When the label has no
        bracketed group, ``title`` is the whole label and ``date`` is ``''``.
    """
    # Newlines are removed outright (not replaced by a space); runs of spaces
    # are then collapsed to one.
    txt = re.sub(r'\n+', '', desc)
    txt = re.sub(r' +', ' ', txt)
    # Two private-use dash glyphs become an ASCII hyphen.
    txt = txt.replace('\uf02d', '-').replace('\uf0be', '-')

    # A record without any bullet is all label; previously this raised
    # AttributeError on the failed search.
    first_bullet = re.search(r'[\uf0d8\uf0b7\uf0a7]', txt)
    break_idx = first_bullet.start() if first_bullet else len(txt)

    label = txt[:break_idx].strip()
    m = re.match(r'(.+) [\[\(](.*)[\]\)]', label)
    if m:
        title, date = m.group(1), m.group(2)
    else:
        # No bracketed group: keep the label as the title, leave date empty.
        title, date = label, ''

    # Put each bullet on its own line; \uf0a7 is rewritten to \uf0b7 so both
    # glyphs are handled as part bullets.
    tmp = txt[break_idx:].strip()
    tmp = tmp.replace(' \uf0d8', '\n\uf0d8')
    tmp = tmp.replace(' \uf0b7', '\n\uf0b7')
    tmp = tmp.replace(' \uf0a7', '\n\uf0b7')

    in_part = False
    notes = []
    parts = []
    for bullet in tmp.split('\n'):
        content = bullet[1:].strip()
        if bullet.startswith('\uf0d8'):
            if in_part:
                parts[-1]['notes'].append(content)
            else:
                notes.append(content)
        # \uf0a7 is also accepted here: a \uf0a7 with no preceding space (e.g.
        # the very first bullet) is not rewritten above and used to be dropped.
        elif bullet.startswith(('\uf0b7', '\uf0a7')):
            parts.append({'title': content, 'notes': []})
            in_part = True

    return {'id': id, 'text': txt, 'desc': tmp, 'raw_desc': desc, 'title': title,
            'label': label, 'date': date, 'notes': notes, 'parts': parts}


## Define functions for creating entities in graphs

In [5]:
def person(graph, agent="William S. Burroughs", role="author"):
    """Add a blank node describing a person to ``graph`` and return it.

    The node is typed as both ``bf:Agent`` and ``bf:Person`` and carries
    ``role`` as a ``bf:role`` literal and ``agent`` as its ``rdfs:label``.
    """
    node = BNode()
    statements = (
        (RDF.type, BF.Agent),
        (RDF.type, BF.Person),
        (BF.role, Literal(role)),
        (RDFS.label, Literal(agent)),
    )
    for predicate, value in statements:
        graph.add((node, predicate, value))
    return node

def title(graph, title):
    """Add a ``bf:Title`` blank node labelled with ``title``; return the node."""
    title_node = BNode()
    for predicate, value in ((RDF.type, BF.Title), (RDFS.label, Literal(title))):
        graph.add((title_node, predicate, value))
    return title_node
    
def identifier(graph, identifier, source="Schottlaender 4.0"):
    """Add a ``bf:Identifier`` blank node to ``graph`` and return it.

    ``source`` is stored as a ``bf:source`` literal and ``identifier`` as the
    node's ``rdf:value`` literal.
    """
    id_node = BNode()
    statements = (
        (RDF.type, BF.Identifier),
        (BF.source, Literal(source)),
        (RDF.value, Literal(identifier)),
    )
    for predicate, value in statements:
        graph.add((id_node, predicate, value))
    return id_node
    
def publication(graph, date, agent=None, place=None):
    """Add a publication provision-activity blank node and return it.

    The node is typed ``bf:ProvisionActivity`` and ``bf:Publication`` and
    always gets a ``bf:date`` literal. ``bf:agent`` and ``bf:place`` literals
    are added only when the corresponding argument is truthy.
    """
    activity = BNode()
    statements = [
        (RDF.type, BF.ProvisionActivity),
        (RDF.type, BF.Publication),
        (BF.date, Literal(date)),
    ]
    # Optional details are appended in the same order the triples are emitted.
    for predicate, raw in ((BF.agent, agent), (BF.place, place)):
        if raw:
            statements.append((predicate, Literal(raw)))
    for predicate, value in statements:
        graph.add((activity, predicate, value))
    return activity

def note(graph, note):
    """Add a ``bf:Note`` blank node whose ``rdf:value`` is ``note``; return it."""
    note_node = BNode()
    for predicate, value in ((RDF.type, BF.Note), (RDF.value, Literal(note))):
        graph.add((note_node, predicate, value))
    return note_node

def text(graph, title):
    """Add a blank node for a text contained in an instance and return it.

    The node is typed ``bf:Instance`` and ``bf:Text`` and labelled with
    ``title`` via ``rdfs:label``.
    """
    a = BNode()
    # Use the BF namespace defined for this notebook; the lowercase name
    # ``bf`` is not defined and raised NameError on a fresh kernel.
    graph.add((a, RDF.type, BF.Instance))
    graph.add((a, RDF.type, BF.Text))
    graph.add((a, RDFS.label, Literal(title)))
    return a

## Define function to generate Turtle files from instance records

In [6]:
def process_record(i):
    """Build the RDF graph for one parsed record and write it out as Turtle.

    Args:
        i: record dict with keys ``id``, ``title``, ``label``, ``date``,
           ``notes`` and ``parts`` (each part a dict with ``title`` and
           ``notes``).

    Returns:
        The URI of the instance, minted in the ``ABRI`` namespace from the
        record id.

    Side effects:
        Serializes the graph to ``ttl/instance/<id>.ttl``.
    """
    g = initialize_abr_graph()
    # URI identifier. The namespaces are the uppercase module constants;
    # lowercase ``abri`` / ``abrc`` are not defined and raised NameError.
    name = i['id']
    instance = ABRI[name]
    g.add((instance, RDF.type, BF.Instance))
    g.add((instance, RDF.type, BF.Text))
    # rdfs:label
    g.add((instance, RDFS.label, Literal(i['title'])))
    # bf:classification
    g.add((instance, BF.classification, ABRC['C']))
    # bf:contributor
    g.add((instance, BF.contributor, person(g, "William S. Burroughs", "author")))
    # bf:title
    g.add((instance, BF.title, title(g, i['title'])))
    # bf:identifiedBy
    g.add((instance, BF.identifiedBy, identifier(g, name, "Schottlaender v4.0")))
    # A "{M&M ...}" tag in the label supplies a second identifier.
    m_n_m = re.match(r'.*\{M\&M (.+)\}', i['label'])
    if m_n_m:
        g.add((instance, BF.identifiedBy, identifier(g, m_n_m.group(1), "Maynard & Miles")))
    # bf:provisionActivity (skipped when the date is empty or missing)
    if i['date']:
        g.add((instance, BF.provisionActivity, publication(g, i['date'])))
    # bf:note
    for n in i['notes']:
        g.add((instance, BF.note, note(g, n)))
    # bf:hasPart
    for part in i['parts']:
        p = text(g, part['title'])
        g.add((instance, BF.hasPart, p))
        for n in part['notes']:
            # Part notes are modelled as bf:Note nodes, the same way as the
            # instance-level notes above, instead of bare literals.
            g.add((p, BF.note, note(g, n)))
    g.serialize(f"ttl/instance/{name}.ttl", format='turtle')
    return instance

## Process text file into Turtle files

In [224]:
# Source text file to parse; later cells reuse this name.
filename = 'pdf/C.txt'
# Write one Turtle file per parsed record (process_record serializes as a
# side effect; its return value is not needed here).
for record in process_text(filename):
    process_record(record)

In [252]:
# Diagnostic: print records whose flattened text contains two bullet glyphs
# separated only by whitespace, together with the text found between the
# ")." / "]." that closes the date and the first bullet.
# Raw string: the pattern's backslash escapes are passed to `re` unchanged
# instead of relying on Python leaving invalid string escapes alone.
for record in process_text(filename):
    m = re.findall(r'[\)\]]\. ([^\uf0d8\uf0b7\uf0a7]+).+([\uf0d8\uf0b7\uf0a7])\s+[\uf0d8\uf0b7\uf0a7]', record['text'])
    # Alternative pattern anchored on the closing brace of the {M&M ...} tag:
    #m = re.findall(r'\} ([^\uf0d8\uf0b7\uf0a7]+) ([\uf0d8\uf0b7\uf0a7])\s+[\uf0d8\uf0b7\uf0a7]', record['text'])
    if m:
        print(record['id'], m)

C56 [('{M&M C59} cover: “Summer 1963.”', '\uf0d8')]
C57 [('{M&M C93} cover: “An Odour-Fill [sic] Periodical.”', '\uf0d8')]
C65 [('{M&M C84-86} 1st state has red 2s./6p. price label [BeatBooks 36]; 2nd state was 3s [PBA 198]. ', '\uf0d8')]
C97 [('{M&M C116} ', '\uf0d8')]
C109 [('{M&M C120} ', '\uf0d8')]
C111 [('{M&M C109} ', '\uf0d8')]
C118 [('{M&M C129} Includes “Hommage to William Seward Burroughs” by Philip Whalen.', '\uf0d8')]
C128 [('Includes “Myth-Maker of the 20th Century,” an article about Burroughs by J. G. Ballard.', '\uf0d8')]
C130 [('{M&M C150} Includes “Homage to William Seward Burroughs” by Philip Whalen.', '\uf0d8')]
C162 [('{M&M C174} ', '\uf0d8')]
C325 [('Includes 40 copies with a numbered, signed serigraph by contributor Gil J. Wolman.', '\uf0d8')]
C348 [('{M&M C350–C352} Issue title: “Cut Up.”', '\uf0d8')]
C451 [('Includes “Note sur Cobble Stone Gardens” by J.-F. Chevrier and Philippe Roussin.', '\uf0d8')]
C456 [('Issue title: “Schizo-Culture 1”', '\uf0d8')]
C458 [('I

In [212]:
# Parse the file again and keep the full record list for interactive inspection.
records = process_text(filename)

In [261]:
# Pick a single parsed record (list position 111) to look at more closely.
r = records[111]

In [262]:
# Show the record's flattened text (newlines removed, runs of spaces collapsed).
r['text']

'The Moving Times, [No. 5 (1965)]. {M&M C109} \uf0d8 Entire issue “presented as a single-sided poster sheet.” [M&M] \uf0d8 \uf0d8 “... a variant of the full-size poster designed for display on the advertising walls of London’s underground Issued, folded in eighths, as Sigma Portfolio [No. 1] (1965), below. stations.” [BeatBooks 42] \uf0b7 “Martin’s Folly.”'

In [150]:
# Sample fragment with two consecutive bullets, kept for regex experiments.
# Bound to `sample_text` rather than `str` so the builtin `str` type is not
# shadowed for the rest of the kernel session.
sample_text = '\ncover: “An Odour-Fill [sic] Periodical.” \n\uf0d8 \n\uf0d8'

In [176]:
# Single-whitespace pattern; written as a raw string so `\s` reaches the regex
# engine as-is instead of being an invalid Python string escape.
p = re.compile(r'\s')

In [177]:
# match() only tests at the start of the string, so this checks whether the
# selected record's text begins with a whitespace character.
p.match(r['text'])