## Load packages, initialize sentence tokenization resource

In [1]:
import numpy as np, pandas as pd, rdflib, re, nltk, collections
from rdflib.namespace import RDF, RDFS
from rdflib import URIRef, BNode, Literal
from slugify import slugify
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load CSV file for ABR A list

In [2]:
df = pd.read_csv('Schottlaender v4.0 - A.csv')

## Extract entity names and features

### Normalize all whitespace to spaces in notes

In [3]:
df['notes'] = df['notes'].str.replace('\s', ' ')

### Places

In [4]:
df['place'] = df['publication'].str.extract('(.+)\:', expand=False)

### Publishers

In [5]:
df['publisher'] = df['publication'].str.extract('\: (.+)', expand=False)

### Bindings

In [6]:
df['binding'] = df['notes'].str.extract('(\S+bound[^.]*)\.', expand=False)
df['binding'] = df['binding'].str.strip()

In [7]:
df['binding'].value_counts().sort_index()

Hardbound                                                                                                1
Hardbound (issued without dustjacket)                                                                    2
Hardbound (issued without dustjacket), and softbound                                                     4
Hardbound in dustjacket                                                                                 17
Hardbound in dustjacket (no softbound issued)                                                            7
Hardbound in dustjacket, and softbound                                                                  19
Hardbound in dustjacket, issued with compact disc bound in rear                                          1
Hardbound in tissue dustjacket, and softbound                                                            1
Hardbound with hand-made orange tissue guards inserted                                                   1
Hardbound, and softbound             

### Copyright dates

In [8]:
df['copyrightDate'] = df['notes'].str.extract('©(\w+)', expand=False)

### M&M instance classification

In [9]:
df['M&M'] = df['notes'].str.extract('\{M\&M(.*)\}', expand=False)

### Promotional materials

In [10]:
promotionalMaterials = pd.DataFrame(df[df['notes'].str.contains(' 1. \[', na = False)])

In [11]:
promotionalMaterials

Unnamed: 0,work,workTitle,instance,instanceTitle,date,publication,notes,place,publisher,binding,copyrightDate,M&M
12,A2,Naked Lunch,B,Naked Lunch,[1962],New York: Grove Press,©1959. Hardbound in dustjacket. {M&M A2b} “Th...,New York,Grove Press,Hardbound in dustjacket,1959.0,A2b} “The text in this edition differs quite...
95,A24,Electronic Revolution 1970-71,A,Electronic Revolution 1970-71,1971,Cambridge: Blackmoor Head Press,In English and French; French translation by J...,Cambridge,Blackmoor Head Press,Softbound (no hardbound issued),,A21a
96,A24,Electronic Revolution 1970-71,B,Die elektronische Revolution = Electronic Revo...,1972,Göttingen: Expanded Media Editions,In German and English. German translation by C...,Göttingen,Expanded Media Editions,Softbound in dustjacket (no hardbound issued),,
140,A42,Doctor Benway: A [Variant] Passage from The Na...,A,Doctor Benway: A [Variant] Passage from The Na...,1979,"Santa Barbara, Calif.: Bradford Morrow",With a new introduction by Burroughs. Hardboun...,"Santa Barbara, Calif.",Bradford Morrow,"Hardbound in dustjacket, and softbound",,
160,A54,The Burroughs File,A,The Burroughs File,1984,San Francisco: City Lights Books,"Hardbound, and softbound. “Including complete ...",San Francisco,City Lights Books,"Hardbound, and softbound",,
162,A56,The Place of Dead Roads,A,The Place of Dead Roads,1984,"New York: Holt, Rinehart, and Winston",©1983. Hardbound in dustjacket. “The original ...,New York,"Holt, Rinehart, and Winston",Hardbound in dustjacket,1983.0,
174,A59,Queer,A,Queer,1985,New York: Viking/Penguin,Written in 1953. Hardbound in dustjacket. “Wit...,New York,Viking/Penguin,Hardbound in dustjacket,,
190,A65,Tornado Alley,A,Tornado Alley,1989,"Cherry Valley, N.Y.: Cherry Valley Editions",Tornado Alley. Illustrations by S. Clay Wilson...,"Cherry Valley, N.Y.",Cherry Valley Editions,"Hardbound (issued without dustjacket), and sof...",,
191,A66,Ghost of Chance,A,Ghost of Chance,1991,New York: Library Fellows of the Whitney Museu...,[Edited by James Grauerholz.] Illustrated by G...,New York,Library Fellows of the Whitney Museum of Ameri...,Hardbound; handbound (issued without dustjacke...,,
192,A66,Ghost of Chance,B,Ghost of Chance,1995,New York: Serpent’s Tail/High Risk Books,"Hardbound, issued without dustjacket. “First p...",New York,Serpent’s Tail/High Risk Books,"Hardbound, issued without dustjacket",,


In [12]:
df[df['notes'].str.contains("[Pp]amphlet", na = False)]

Unnamed: 0,work,workTitle,instance,instanceTitle,date,publication,notes,place,publisher,binding,copyrightDate,M&M
12,A2,Naked Lunch,B,Naked Lunch,[1962],New York: Grove Press,©1959. Hardbound in dustjacket. {M&M A2b} “Th...,New York,Grove Press,Hardbound in dustjacket,1959.0,A2b} “The text in this edition differs quite...
78,A16,Valentines Day Reading,A,Valentines Day Reading,1965,New York: American Theatre for Poets,Staplebound. {M&M F12} A mimeographed pamphlet...,New York,American Theatre for Poets,Staplebound,,F12
79,A17,So Who Owns Death TV?,A,So Who Owns Death TV?,1967,"San Francisco: Beach Books, Texts, & Documents",With Claude Pélieu and Carl Weissner. (A Black...,San Francisco,"Beach Books, Texts, & Documents",Staplebound (no hardbound issued),,A13a
80,A17,So Who Owns Death TV?,B,So Who Owns Death TV?,1967,"San Francisco: Beach Books, Texts, & Documents",[2nd expanded ed.] (A Black Bag Pamphlet) Stap...,San Francisco,"Beach Books, Texts, & Documents",Staplebound (no hardbound issued),,A13b
84,A19,Scientology Revisited,A,Scientology Revisited,1968?,[n.p.],Staplebound pamphlet. Four-color glossy wraps...,,,Staplebound pamphlet,,
85,A19,Scientology Revisited,B,Scientology Revisited,1969?,[n.p.],Staplebound pamphlet. B&W matte wraps; 1 stapl...,,,Staplebound pamphlet,,
87,A21,The Dead Star,A,The Dead Star,1969,San Francisco: Nova Broadcast Press,"(Nova Broadcast, No. 5) Distributed by City Li...",San Francisco,Nova Broadcast Press,Staplebound (no hardbound issued),1969.0,A14a
96,A24,Electronic Revolution 1970-71,B,Die elektronische Revolution = Electronic Revo...,1972,Göttingen: Expanded Media Editions,In German and English. German translation by C...,Göttingen,Expanded Media Editions,Softbound in dustjacket (no hardbound issued),,
186,A63,Clause 27 Is Proposition 6 Is the Whole Tamale,A,Clause 27 Is Proposition 6 Is the Whole Tamale,[1989?],[n.p.]: The Horse Press,Pamphlet. Issued with two different cover colo...,[n.p.],The Horse Press,,,
191,A66,Ghost of Chance,A,Ghost of Chance,1991,New York: Library Fellows of the Whitney Museu...,[Edited by James Grauerholz.] Illustrated by G...,New York,Library Fellows of the Whitney Museum of Ameri...,Hardbound; handbound (issued without dustjacke...,,


In [13]:
df[df['notes'].str.contains("paperbound ed", na = False)]

Unnamed: 0,work,workTitle,instance,instanceTitle,date,publication,notes,place,publisher,binding,copyrightDate,M&M
90,A22,The Last Words of Dutch Schultz,C,The Last Words of Dutch Schultz: A Fiction in ...,1981,New York: Seaver Books,1st paperbound ed. ©1975; distributed by Grove...,New York,Seaver Books,paperbound ed,1975,


## Generate BIBFRAME Turtle files

### Initialize namespaces

In [14]:
abri = rdflib.Namespace("https://w3id.org/schottlaender/4.0/instance/")
abrw = rdflib.Namespace("https://w3id.org/schottlaender/4.0/work/")
abra = rdflib.Namespace("https://w3id.org/schottlaender/4.0/agent/")
abrp = rdflib.Namespace("https://w3id.org/schottlaender/4.0/place/")
bf = rdflib.Namespace("http://id.loc.gov/ontologies/bibframe/")
arm = rdflib.Namespace("https://w3id.org/arm/")

### Define function to initialize graphs

In [15]:
def initialize_abr_graph():
    g = rdflib.Graph()
    g.bind("abri", "https://w3id.org/schottlaender/4.0/instance/")
    g.bind("abrw", "https://w3id.org/schottlaender/4.0/work/")
    g.bind("abra", "https://w3id.org/schottlaender/4.0/agent/")
    g.bind("abrp", "https://w3id.org/schottlaender/4.0/place/")
    g.bind("bf", "http://id.loc.gov/ontologies/bibframe/")
    g.bind("arm", "https://w3id.org/arm/")
    return g

### Define dict to map strings to relative URIs

In [16]:
label_to_relative_uri_map = {}

### Generate publishers

In [17]:
publishers = df['publisher'].fillna(value=' [n.p.]').unique().tolist()

for name in publishers:
    g = initialize_abr_graph()
    slug = slugify(name)
    label_to_relative_uri_map[name] = slug
    id = abra[slug]
    g.add((id, RDF.type, bf.Agent))
    g.add((id, RDF.type, bf.Organization))
    g.add((id, RDFS.label, Literal(name)))
    g.serialize(f"ttl/agent/{slug}.ttl", format='turtle')

### Generate places

In [18]:
places = df['place'].fillna(value=' [n.p.]').unique().tolist()

for name in places:
    g = initialize_abr_graph()
    slug = slugify(name)
    label_to_relative_uri_map[name] = slug
    id = abrp[slug]
    g.add((id, RDF.type, bf.Place))
    g.add((id, RDFS.label, Literal(name)))
    g.serialize(f"ttl/place/{slug}.ttl", format='turtle')

### Generate instances

In [19]:
instances = df.where((pd.notnull(df)), None).to_dict('records')
work_to_instance_map = collections.defaultdict(list)

for i in instances:
    g = initialize_abr_graph()
    name = i['work'] + i['instance'].lower()
    work_to_instance_map[i['work']].append(name)
    id = abri[name]
    g.add((id, RDF.type, bf.Instance))
    g.add((id, RDFS.label, Literal(i['workTitle'])))
    g.add((id, bf.instanceOf, abrw[i['work']]))
    g.add((id, bf.contributor, abra['burroughs-william-s']))
    # bf:title
    title = BNode()
    g.add((id, bf.title, title))
    g.add((title, RDF.type, bf.Title))
    g.add((title, RDFS.label, Literal(i['instanceTitle'])))
    # bf:note
    notes = i['notes']
    if notes:
        notes_text = nltk.sent_tokenize(notes)
        for sentence in notes_text:
            note = BNode()
            g.add((id, bf.note, note))
            g.add((note, RDF.type, bf.Note))
            g.add((note, RDF.value, Literal(sentence)))
    # bf:provisionActivity
    publisher = i['publisher']
    date = i['date']
    place = i['place']
    if publisher and date and place:
        pa_lit = BNode()
        g.add((id, bf.provisionActivity, pa_lit))
        g.add((pa_lit, RDF.type, bf.ProvisionActivity))
        g.add((pa_lit, RDF.type, bf.Publication))
        g.add((pa_lit, bf.agent, Literal(publisher)))
        g.add((pa_lit, bf.date, Literal(date)))
        g.add((pa_lit, bf.place, Literal(place)))
        pa_uri = BNode()
        g.add((id, bf.provisionActivity, pa_uri))
        g.add((pa_uri, RDF.type, bf.ProvisionActivity))
        g.add((pa_uri, RDF.type, bf.Publication))
        g.add((pa_uri, bf.agent, abra[label_to_relative_uri_map[publisher]]))
        g.add((pa_uri, bf.date, Literal(date)))
        g.add((pa_uri, bf.place, abrp[label_to_relative_uri_map[place]]))
    g.serialize(f"ttl/instance/{name}.ttl", format='turtle')

### Generate works

In [20]:
works = pd.DataFrame(df[['workTitle', 'work']]).drop_duplicates().to_dict('records')

for work in works:
    g = initialize_abr_graph()
    name = work['work']
    id = abrw[name]
    g.add((id, RDF.type, bf.Work))
    g.add((id, RDF.type, bf.Text))
    g.add((id, RDFS.label, Literal(work['workTitle'])))
    g.add((id, bf.contributor, abra['burroughs-william-s']))
    for inst in work_to_instance_map[name]:
        g.add((id, bf.hasInstance, abri[inst]))
    g.serialize(f"ttl/work/{name}.ttl", format='turtle')