## Load packages, download NLTK tokenization, tagging and named-entity resources

In [1]:
import collections
import os
import re

import nltk
import numpy as np
import pandas as pd
import rdflib
from rdflib import URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS
from slugify import slugify
# Fetch the NLTK data packages used further down (sent_tokenize, word_tokenize,
# pos_tag and ne_chunk). The value of the last call is what the cell displays.
# NOTE(review): recent NLTK releases (3.9+) appear to look for differently
# named resources (e.g. 'punkt_tab') — confirm against the installed version.
nltk.download('punkt')  # sentence / word tokenizer models
nltk.download('averaged_perceptron_tagger')  # part-of-speech tagger model
nltk.download('maxent_ne_chunker')  # named-entity chunker model
nltk.download('words')  # word list; presumably required by the NE chunker — verify

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/bradleyallen/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## Load CSV file for ABR A list

In [2]:
# Rows of the ABR "A" list. Later cells read the notes, publication, work,
# instance, workTitle, instanceTitle and date columns of this frame.
a_list_csv = 'edited-csv/A.csv'
df = pd.read_csv(a_list_csv)

## Extract entity names and features

### Normalize all whitespace to spaces in notes

In [3]:
# Replace every whitespace character (newline, tab, ...) in the notes with a
# plain space. `regex=True` must be explicit: pandas 2.0 changed the default of
# Series.str.replace to literal matching, under which the pattern would only
# match a backslash followed by "s" and the notes would be left untouched.
# The raw string also avoids the invalid "\s" escape warning.
df['notes'] = df['notes'].str.replace(r'\s', ' ', regex=True)

### Places

In [4]:
# Place of publication: everything before the last colon of the publication
# statement (the group is greedy). Written as a raw string so "\:" is not
# treated as an invalid string escape; the regex itself is unchanged.
df['place'] = df['publication'].str.extract(r'(.+)\:', expand=False)

### Publishers

In [5]:
# Publisher: everything after the first ": " of the publication statement.
# Written as a raw string so "\:" is not treated as an invalid string escape;
# the regex itself is unchanged.
df['publisher'] = df['publication'].str.extract(r'\: (.+)', expand=False)

### Copyright dates

In [6]:
# Copyright date: the run of word characters directly following the first "©"
# in the notes. Raw string so "\w" is not treated as an invalid string escape;
# the regex itself is unchanged.
df['copyrightDate'] = df['notes'].str.extract(r'©(\w+)', expand=False)

### Identifiers

In [7]:
# Maynard & Miles identifier: the non-space token found between "{M&M " and
# "}" in the notes. Raw string so the backslash escapes reach the regex engine
# without invalid-escape warnings; the regex itself is unchanged.
df['MaynardAndMiles'] = df['notes'].str.extract(r'\{M\&M (\S*)\}', expand=False)

In [8]:
# Schottlaender identifier: the work code followed by the lower-cased instance
# letter (values such as "A2b" show up in the promotional-material listing).
instance_suffix = df['instance'].str.lower()
df['Schottlaender'] = df['work'] + instance_suffix

### Bindings (in progress)

In [9]:
# Binding statement: text that starts with a capitalised token ending in
# "bound" (e.g. "Hardbound") and runs up to, but not including, the next
# full stop; surrounding whitespace is stripped. Raw string so "\S" and "\."
# are not treated as invalid string escapes; the regex itself is unchanged.
df['binding'] = (
    df['notes']
    .str.extract(r'([A-Z]\S+bound[^.]*)\.', expand=False)
    .str.strip()
)

In [10]:
# Frequency of each distinct binding statement, listed alphabetically.
binding_counts = df['binding'].value_counts()
binding_counts.sort_index()

Hardbound                                                                                                1
Hardbound (issued without dustjacket)                                                                    2
Hardbound (issued without dustjacket), and softbound                                                     4
Hardbound in dustjacket                                                                                 17
Hardbound in dustjacket (no softbound issued)                                                            7
Hardbound in dustjacket, and softbound                                                                  19
Hardbound in dustjacket, issued with compact disc bound in rear                                          1
Hardbound in tissue dustjacket, and softbound                                                            1
Hardbound with hand-made orange tissue guards inserted                                                   1
Hardbound, and softbound             

### Promotional materials (in progress)

In [11]:
# Promotional material: from the first "[" that follows " 1" + one character
# + " " through to the end of the notes. Raw string so "\[" is not treated as
# an invalid string escape; the regex itself is unchanged.
# NOTE(review): the "." after "1" is unescaped, so it matches any character,
# not just a full stop — confirm against the data before tightening to "1\.".
df['promotionalMaterial'] = df['notes'].str.extract(r' 1. (\[.+$)', expand=False)

In [12]:
# Show identifier and extracted text for the rows where promotional material
# was found.
has_promo = df['promotionalMaterial'].notnull()
df.loc[has_promo, ['Schottlaender', 'promotionalMaterial']]

Unnamed: 0,Schottlaender,promotionalMaterial
12,A2b,"[Prospectus]. New York: Grove Press, [1962]. P..."
95,A24a,[Prospectus]. In English and French. Ingatesto...
96,A24b,[Prospectus]. Göttingen: Expanded Media Editio...
140,A42a,"[Prospectus]. Santa Barbara, Calif.: Bradford ..."
160,A54a,[Promotional Postcard]. San Francisco: City Li...
162,A56a,"[Press Kit]. New York: Holt, Rinehart and Wins..."
174,A59a,"[Press Release]. New York: Viking Penguin, [19..."
190,A65a,"[Promotional Postcard] a. Berkeley, Calif.: Sm..."
191,A66a,[Prospectus]. New York: Whitney Museum of Amer...
192,A66b,[Promotional Poster]. [New York]: High Risk Bo...


### Squeeze even more data out of notes using more nuanced NLP (in progress)

In [13]:
# Scaffold for deeper extraction (in progress): every note is split into
# sentences, and each sentence is tokenized and part-of-speech tagged.
# Nothing is stored from the tagging yet.
instances = df.where(df.notnull(), None).to_dict('records')
for record in instances:
    note_text = record['notes']
    if not note_text:
        continue  # rows without notes have nothing to analyse
    for sentence in nltk.sent_tokenize(note_text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

## Generate BIBFRAME Turtle files

### Initialize namespaces

In [14]:
# Namespace objects used to mint URIs: abri/abrw for this bibliography's
# instances and works, bf for BIBFRAME terms, arm for the ARM core ontology.
abri, abrw, bf, arm = (
    rdflib.Namespace(base_uri)
    for base_uri in (
        "https://w3id.org/anything-but-routine/4.0/instance/",
        "https://w3id.org/anything-but-routine/4.0/work/",
        "http://id.loc.gov/ontologies/bibframe/",
        "https://w3id.org/arm/core/ontology/0.1/",
    )
)

### Define function to initialize graphs

In [15]:
def initialize_abr_graph():
    """Return a new, empty rdflib Graph with the abri, abrw, bf and arm prefixes bound."""
    prefix_uris = (
        ("abri", "https://w3id.org/anything-but-routine/4.0/instance/"),
        ("abrw", "https://w3id.org/anything-but-routine/4.0/work/"),
        ("bf", "http://id.loc.gov/ontologies/bibframe/"),
        ("arm", "https://w3id.org/arm/core/ontology/0.1/"),
    )
    graph = rdflib.Graph()
    for prefix, uri in prefix_uris:
        graph.bind(prefix, uri)
    return graph

### Generate instances

In [16]:
%%time
def add_person_contributor(g, subject, label, role):
    """Link `subject` to a new blank-node person via bf:contributor.

    The blank node is typed bf:Agent and bf:Person, carries `role` as a
    bf:role literal and `label` as its rdfs:label.
    """
    person = BNode()
    g.add((subject, bf.contributor, person))
    g.add((person, RDF.type, bf.Agent))
    g.add((person, RDF.type, bf.Person))
    g.add((person, bf.role, Literal(role)))
    g.add((person, RDFS.label, Literal(label)))


def add_identifier(g, subject, source, value):
    """Link `subject` to a new blank-node bf:Identifier via bf:identifiedBy.

    `source` is stored as the bf:source literal and `value` as the rdf:value
    literal of the identifier node.
    """
    identifier = BNode()
    g.add((subject, bf.identifiedBy, identifier))
    g.add((identifier, RDF.type, bf.Identifier))
    g.add((identifier, bf.source, Literal(source)))
    g.add((identifier, RDF.value, Literal(value)))


# Missing cells become None so the truthiness checks below skip them. The cast
# to object comes first because, on recent pandas versions, `where(..., None)`
# leaves NaN in numeric columns, and NaN is truthy.
instances = df.astype(object).where(df.notna(), None).to_dict('records')
# work code -> Schottlaender ids of its instances; read by the works cell.
work_to_instance_map = collections.defaultdict(list)

# One Turtle file per instance is written here; make sure the folder exists.
os.makedirs("ttl/instance", exist_ok=True)

for record in instances:
    g = initialize_abr_graph()
    name = record['Schottlaender']
    work_to_instance_map[record['work']].append(name)
    instance_uri = abri[name]  # named so the builtin `id` is not shadowed
    g.add((instance_uri, RDF.type, bf.Instance))
    g.add((instance_uri, RDFS.label, Literal(record['workTitle'])))
    g.add((instance_uri, bf.instanceOf, abrw[record['work']]))
    # bf:contributor
    add_person_contributor(g, instance_uri, "William S. Burroughs", "author")
    # bf:title
    title = BNode()
    g.add((instance_uri, bf.title, title))
    g.add((title, RDF.type, bf.Title))
    g.add((title, RDFS.label, Literal(record['instanceTitle'])))
    # bf:identifiedBy
    add_identifier(g, instance_uri, "Schottlaender v4.0", name)
    m_n_m = record['MaynardAndMiles']
    if m_n_m:
        add_identifier(g, instance_uri, "Maynard & Miles", m_n_m)
    # bf:note — one bf:Note per sentence of the notes field
    notes = record['notes']
    if notes:
        for sentence in nltk.sent_tokenize(notes):
            note = BNode()
            g.add((instance_uri, bf.note, note))
            g.add((note, RDF.type, bf.Note))
            g.add((note, RDF.value, Literal(sentence)))
            # Contributors extracted from note: every PERSON chunk found by
            # the NE chunker becomes a contributor on the instance.
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            for subtree in nltk.ne_chunk(tagged).subtrees():
                if subtree.label() == 'PERSON':
                    person_name = ' '.join(leaf[0] for leaf in subtree)
                    add_person_contributor(g, instance_uri, person_name, "contributor")
    # bf:provisionActivity — only emitted when publisher, date and place are all present
    publisher = record['publisher']
    date = record['date']
    place = record['place']
    if publisher and date and place:
        pa_lit = BNode()
        g.add((instance_uri, bf.provisionActivity, pa_lit))
        g.add((pa_lit, RDF.type, bf.ProvisionActivity))
        g.add((pa_lit, RDF.type, bf.Publication))
        g.add((pa_lit, bf.agent, Literal(publisher)))
        g.add((pa_lit, bf.date, Literal(date)))
        g.add((pa_lit, bf.place, Literal(place)))
    # bf:copyrightDate
    copyrightDate = record['copyrightDate']
    if copyrightDate:
        g.add((instance_uri, bf.copyrightDate, Literal(copyrightDate)))
    g.serialize(f"ttl/instance/{name}.ttl", format='turtle')

CPU times: user 18.1 s, sys: 705 ms, total: 18.8 s
Wall time: 46.2 s


### Generate works

In [17]:
# One record per distinct (workTitle, work) pair; the column selection is
# already a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
works = df[['workTitle', 'work']].drop_duplicates().to_dict('records')

# One Turtle file per work is written here; make sure the folder exists.
os.makedirs("ttl/work", exist_ok=True)

for work in works:
    g = initialize_abr_graph()
    name = work['work']
    work_uri = abrw[name]  # named so the builtin `id` is not shadowed
    g.add((work_uri, RDF.type, bf.Work))
    g.add((work_uri, RDF.type, bf.Text))
    g.add((work_uri, RDFS.label, Literal(work['workTitle'])))
    # bf:contributor — the author, as a blank node
    wsb = BNode()
    g.add((work_uri, bf.contributor, wsb))
    g.add((wsb, RDF.type, bf.Agent))
    g.add((wsb, RDF.type, bf.Person))
    g.add((wsb, bf.role, Literal("author")))
    g.add((wsb, RDFS.label, Literal("William S. Burroughs")))
    # bf:hasInstance — relies on work_to_instance_map filled by the
    # instance-generation cell, which therefore has to run first.
    for inst in work_to_instance_map[name]:
        g.add((work_uri, bf.hasInstance, abri[inst]))
    g.serialize(f"ttl/work/{name}.ttl", format='turtle')