## Load packages and set pandas option

In [1]:
import numpy as np, pandas as pd, rdflib, re
from rdflib.namespace import RDF, RDFS
from rdflib import URIRef, BNode, Literal

## Initialize namespaces

In [2]:
abrc = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/classification/")
abri = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/instance/")
abrw = rdflib.Namespace("https://wsburroughs.link/anything-but-routine/4.0/work/")
bf = rdflib.Namespace("http://id.loc.gov/ontologies/bibframe/")
arm = rdflib.Namespace("https://w3id.org/arm/core/ontology/0.1/")

## Define function to initialize graphs

In [3]:
def initialize_abr_graph():
    g = rdflib.Graph()
    g.bind("abrc", "https://wsburroughs.link/anything-but-routine/4.0/classification/")
    g.bind("abri", "https://wsburroughs.link/anything-but-routine/4.0/instance/")
    g.bind("abrw", "https://wsburroughs.link/anything-but-routine/4.0/work/")
    g.bind("bf", "http://id.loc.gov/ontologies/bibframe/")
    g.bind("arm", "https://w3id.org/arm/core/ontology/0.1/")
    return g

## Define function to extract instance text into dictionary

In [95]:
def process_text(filename):
    file = open(filename,'r')
    text = file.read()
    ids = [ match.group(1) for match in re.finditer('\n(B\d+)\..*\n', text) ]
    text2 = re.sub(r'\d\s\d\s\d\s\d', '', text)
    text3 = re.sub(r'\s(\d{2,3})\n\n', '', text2)
    descriptions = re.split(r'\s*B\d+\.\s+', text3)[1:]
    records = []
    for id, desc in zip(ids, descriptions):
        txt = re.sub('\n+', '\n', desc)
        txt = re.sub(r' +', ' ', txt)
        txt = re.sub('\uf02d', '-', txt)
        records.append({ 'id': id, 'text': txt })
    return records

## Define function to generate Turtle files from instance records

In [142]:
def process_record(i):
    g = initialize_abr_graph()
    name = i['id']
    id = abri[name]
    s0 = re.search(r'[\uf0d8\uf0b7\uf0a7]', i['text'])
    if s0:
        break_idx = s0.start()
        label = re.sub('\n', ' ', i['text'][:break_idx]).strip()
        desc = re.sub('\n', ' ', i['text'][break_idx:]).strip()
        desc = re.sub(' \uf0d8', '\n\uf0d8', desc)
        desc = re.sub(' \uf0b7', '\n\uf0b7', desc)
        desc = re.sub(' \uf0a7', '\n\uf0b7', desc)
    else:
        label = re.sub('\n', ' ', i['text']).strip()
        desc = ""
    title_str = None
    pub_str = None
    binding_str = None
    m1 = re.match('(.+)\. (.+)\. (.+)\.', label) or re.match('(.+)\. (.+; .*)\. (.+)\.', label) 
    if m1:
        title_str = m1.group(1)
        pub_str = m1.group(2)
        binding_str = m1.group(3)
    g.add((id, RDF.type, bf.Instance))
    g.add((id, RDFS.label, Literal(title_str)))
    # bf:classification
    g.add((id, bf.classification, abrc['B']))
    # bf.contributor
    wsb = BNode()
    g.add((id, bf.contributor, wsb))
    g.add((wsb, RDF.type, bf.Agent))
    g.add((wsb, RDF.type, bf.Person))
    g.add((wsb, bf.role, Literal("author")))
    g.add((wsb, RDFS.label, Literal("William S. Burroughs")))
    # bf:title
    title = BNode()
    g.add((id, bf.title, title))
    g.add((title, RDF.type, bf.Title))
    g.add((title, RDFS.label, Literal(title_str)))
    # bf:identifiedBy
    schottlaender_id = BNode()
    g.add((id, bf.identifiedBy, schottlaender_id))
    g.add((schottlaender_id, RDF.type, bf.Identifier))
    g.add((schottlaender_id, bf.source, Literal("Schottlaender v4.0")))
    g.add((schottlaender_id, RDF.value, Literal(name)))
    m_n_m = re.match('.*\{M\&M (.+)\}', label)
    if m_n_m:
        m_n_m_id = BNode()
        g.add((id, bf.identifiedBy, m_n_m_id))
        g.add((m_n_m_id, RDF.type, bf.Identifier))
        g.add((m_n_m_id, bf.source, Literal("Maynard & Miles")))
        g.add((m_n_m_id, RDF.value, Literal(m_n_m.group(1))))
    # bf:provisionActivity
    if pub_str:
        m2 = re.match('(.+): (.+), (.+)', pub_str)
        if m2:
            place_str = m2.group(1)
            agent_str = m2.group(2)
            date_str = m2.group(3)
            pa_lit = BNode()
            g.add((id, bf.provisionActivity, pa_lit))
            g.add((pa_lit, RDF.type, bf.ProvisionActivity))
            g.add((pa_lit, RDF.type, bf.Publication))
            g.add((pa_lit, bf.agent, Literal(agent_str)))
            g.add((pa_lit, bf.date, Literal(date_str)))
            g.add((pa_lit, bf.place, Literal(place_str)))
    # bf:note
    state = 0
    for bullet in desc.split('\n'):
        if bullet.startswith('\uf0d8'):
            n = BNode()
            g.add((n, RDF.type, bf.Note))
            g.add((n, RDF.value, Literal(bullet[1:].strip())))
            if state == 0:
                # Periodical issue note
                g.add((id, bf.note, n))
            else:
                # Article note
                g.add((a, bf.note, n))
        elif bullet.startswith('\uf0b7'):
            a = BNode()
            g.add((id, RDF.type, bf.Instance))
            g.add((a, RDF.type, bf.Text))
            g.add((a, RDFS.label, Literal(bullet[1:].strip())))
            g.add((id, bf.hasPart, a))
            state = 1
        else:
            pass
    # bf:hasPart :binding
    if binding_str:
        binding_lit = BNode()
        binding_note = BNode()
        g.add((id, bf.hasPart, binding_lit))
        g.add((binding_lit, RDF.type, arm.Binding))
        g.add((binding_lit, bf.note, binding_note))
        g.add((binding_note, RDF.type, bf.Note))
        g.add((binding_note, RDF.type, bf.DescriptiveNote))
        g.add((binding_note, RDF.value, Literal(binding_str)))
    g.serialize(f"ttl/instance/{name}.ttl", format='turtle')
    return id

## Process text file into Turtle files

In [143]:
filename = 'pdf/B.txt'
for record in process_text(filename):
    print(record['id'])
    s = process_record(record)

B1
B2
B3
B4
B5
B6
B7
B8
B9
B10
B11
B12
B13
B14
B15
B16
B17
B18
B19
B20
B21
B22
B23
B24
B25
B26
B27
B28
B29
B30
B31


In [8]:
for i in process_text(filename):
    m = re.findall('[\uf0d8\uf0b7\uf0a7]\s+[\uf0d8\uf0b7\uf0a7]', i['text'])
    if m:
        print(i['id'])

In [60]:
for i in records:
    m = re.findall('rdf\:value \"\"', i['text'])
    if m:
        print(i['id'])

In [90]:
records = process_text(filename)

In [105]:
record = records[4]
record

{'id': 'B5',
 'text': 'William S. Burroughs. London: The October Gallery, 1988. \nA. Single sheet of cardstock, folded to make 8 panels (4 on each side). \n\uf0d8 Brochure accompanying one-man exhibition at October Gallery in London, [“Paintings and Drawings, \n\uf0d8 \n\uf0d8 \n1 June–2nd July 1988”]. \nIncludes “Excerpts from the essay ‘On Burroughs’ Art’ by James Grauerholz, April 28, 1988.” \nIncludes reproductions of three paintings by Burroughs: Through a Fish Eye, Burn Unit, and Fear Death by \nWater. \nB. [Press Release for and list of works from exhibition] \n\uf0d8 \nIncludes untitled offprint of “On Burroughs’ Art” by James Grauerholz and photocopy of 1988 British \nnewspaper article about Burroughs. \nC. [Promotional Postcard]. London: The October Gallery, 1988. 6 x 4⅛ in. postcard. \n\uf0d8 B&W photograph of Burroughs by John Minihan on recto. \n \n \n \n\x0c25'}

In [106]:
break_idx = re.search(r'[\uf0d8\uf0b7\uf0a7]', record['text']).start()
label = re.sub('\n', ' ', record['text'][:break_idx]).strip()
desc = re.sub('\n', ' ', record['text'][break_idx:]).strip()
desc = re.sub(' \uf0d8', '\n\uf0d8', desc)
desc = re.sub(' \uf0b7', '\n\uf0b7', desc)
desc = re.sub(' \uf0a7', '\n\uf0b7', desc)

In [107]:
record['text'][:break_idx]

'William S. Burroughs. London: The October Gallery, 1988. \nA. Single sheet of cardstock, folded to make 8 panels (4 on each side). \n'

In [108]:
label

'William S. Burroughs. London: The October Gallery, 1988.  A. Single sheet of cardstock, folded to make 8 panels (4 on each side).'

In [109]:
m = re.match('(.+)\. (.+)\. (.+)\.', label)

In [110]:
desc = m.group(0)

In [111]:
desc

'William S. Burroughs. London: The October Gallery, 1988.  A. Single sheet of cardstock, folded to make 8 panels (4 on each side).'

In [112]:
re.sub(' +', ' ', desc)

'William S. Burroughs. London: The October Gallery, 1988. A. Single sheet of cardstock, folded to make 8 panels (4 on each side).'

In [113]:
title = m.group(1)

In [114]:
publication = m.group(2).strip()

In [115]:
binding = m.group(3)

In [116]:
title

'William S. Burroughs. London: The October Gallery, 1988'

In [117]:
publication

'A'

In [118]:
binding

'Single sheet of cardstock, folded to make 8 panels (4 on each side)'

In [119]:
m2 = re.match('(.+): (.+), (.+)', publication)

In [120]:
pub_place = m2.group(1)

AttributeError: 'NoneType' object has no attribute 'group'

In [77]:
pub_agent = m2.group(2)

In [78]:
pub_date = m2.group(3)

In [79]:
pub_place

'Paris'

In [80]:
pub_agent

'Galerie Stadler'

In [81]:
pub_date

'1964'

In [87]:
m_n_m = re.match('.*\{M\&M (.+)\}', label)

In [88]:
m_n_m.group(1)

'F11'

In [152]:
def work(record):
    s0 = re.search(r'[\uf0d8\uf0b7\uf0a7]', record['text'])
    if s0:
        break_idx = s0.start()
        w = re.sub('\n', ' ', record['text'][:break_idx]).strip()
        instances = re.sub('\n', ' ', record['text'][break_idx:]).strip()
        instances = re.sub(' \uf0d8', '\n\uf0d8', instances)
        instances = re.sub(' \uf0b7', '\n\uf0b7', instances)
        instances = re.sub(' \uf0a7', '\n\uf0b7', instances)
    else:
        w = re.sub('\n', ' ', i['text']).strip()
        instances = ""
    return w, instances


In [153]:
filename = 'pdf/B.txt'
for record in process_text(filename):
    w, instances = work(record)
    print(f'{record["id"]}: "{w}", "{instances}"')

B1: "Peinture, Poésie, Musique: David Budd Recontree William Burroughs et Earl Brown Chez Rodolphe Stadler.  Paris: Galerie Stadler, 1964. Single sheet, folded to make 12 panels (6 on each side).  {M&M F11}", " “A two-column piece by Burroughs printed alternately in red and orange ink occupies half of the catalogue,  both in English and in a French translation.” [BeatBooks 39]"
B2: "Ruby Editions Portfolio, One. With Cozette de Charmoy and Henri Chopin. Designed by Henri  Chopin. London: Wallrich Books, 1974. 3 sheets and vellum title page in printed cardstock  folder.", " cover title: Ruby Editions Portfolio 1. 
 Portfolio containing three prints, one each by Burroughs, de Charmoy, and Chopin. 
 “This edition consists of One Hundred numbered copies, and Thirty [numbered] copies Hors Commerce; each  print signed by the [respective] artist.”"
B3: "William Burroughs: Painting. Amsterdam: Suzanne Biederberg Gallery; London: October Gallery,  [1988]. Softbound (no hardbound issued)", "

NameError: name 'i' is not defined