In [1]:
import PyPDF2, re, pandas as pd, numpy as np

## Transform the Schottlaender v4.0 PDF to a Pandas dataframe

In [2]:
filename = 'eScholarship UC item 0xj4d6bm.pdf'
pdfFileObj = open(filename,'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
num_pages = pdfReader.numPages
count = 0
text = ""
while count < num_pages:
    pageObj = pdfReader.getPage(count)
    count +=1
    text += pageObj.extractText()

In [3]:
text2 = re.split('\s*([A|B|C|D|E|F|G|H|I]\d+)\.\s+', text)

In [4]:
text3 = [ re.sub('\n', '', x).strip() for x in text2[1:] ]

In [5]:
d = {}
for i in range(0, len(text3)-1, 2):
    d[text3[i]] = text3[i+1].strip()

In [6]:
df = pd.DataFrame({ 'work': list(d.keys()), 'description': list(d.values()) })

## Extract and enhance data by FRBR expression type (A through I)

### A. Publications in book, broadside, and pamphlet form

In [8]:
a_list = pd.DataFrame(df[df['work'].str.startswith('A')])

In [9]:
year_ptrn = '(?P<year>\d\d\d\d)'

In [10]:
publisher_ptrn = '(?P<publisher>\w[\s\w\,\]\[+: [\s\w]+), '

In [11]:
title_ptrn = '(?P<title>^[A-Za-z0-9 \:\-\?]+)'

In [12]:
a_list['title'] = a_list['description'].str.extract(title_ptrn, expand=False)
a_list['year'] = a_list['description'].str.extract(year_ptrn, expand=False)
a_list['publisher'] = a_list['description'].str.extract(publisher_ptrn, expand=False)

In [13]:
a_list[['work', 'year', 'title', 'publisher', 'description']].tail()

Unnamed: 0,work,year,title,publisher,description
77,A78,2001,Parler pour Joe,"Rouen, France: Derrière la Salle de Bains","Parler pour Joe. Rouen, France: Derrière la Sa..."
78,A79,2001,Words of Advice for Young People,FreeThought Flyer,Words of Advice for Young People. (FreeThought...
79,A80,2008,Everything Lost: The Latin American Notebook o...,General editors,Everything Lost: The Latin American Notebook o...
80,A81,2008,And the Hippos Were Boiled in Their Tanks,New York: Grove Press,And the Hippos Were Boiled in Their Tanks. Wit...
81,A82,2015,The Travel Agency Is on Fire,Colan,The Travel Agency Is on Fire. Alex Wermer-Cola...


In [14]:
a_list[['work', 'year', 'title', 'publisher', 'description']].to_csv('A.csv')

### B. Art books and catalogs of exhibitions

In [15]:
b_list = pd.DataFrame(df[df['work'].str.startswith('B')])

In [16]:
b_title_ptrn = '(.+)[\.]\ [a-zA-Z0-9\ ]+\:'

In [17]:
b_list['title'] = b_list['description'].str.extract(b_title_ptrn, expand=False)

In [18]:
b_list['publisher'] = b_list['description'].str.extract(publisher_ptrn, expand=False)

In [19]:
b_list['year'] = b_list['description'].str.extract(year_ptrn, expand=False)

In [20]:
b_list[['work', 'year', 'title', 'publisher', 'description']]

Unnamed: 0,work,year,title,publisher,description
82,B1,1964,"Peinture, Poésie, Musique: David Budd Recontre...","Peinture, Poésie","Peinture, Poésie, Musique: David Budd Recontre..."
83,B2,1974,"Ruby Editions Portfolio, One. With Cozette de ...",Ruby Editions Portfolio,"Ruby Editions Portfolio, One. With Cozette de ..."
84,B3,1988,William Burroughs: Painting,London: October Gallery,William Burroughs: Painting. Amsterdam: Suzann...
85,B4,1988,,Santa Fe,"William S. Burroughs. Santa Fe, N.M.: Gallery ..."
86,B5,1988,William S. Burroughs. London: The October Gall...,London: The October Gallery,William S. Burroughs. London: The October Gall...
87,B6,1988,William S. Burroughs. New York: Tony Shafrazi ...,New York: Tony Shafrazi Gallery,William S. Burroughs. New York: Tony Shafrazi ...
88,B7,1988,"William S. Burroughs, October 21November 26, 1988","Burroughs, October 21November 26","William S. Burroughs, October 21November 26, 1..."
89,B8,1988,William S. Burroughs: Recent Paintings,Vancouver: Front Gallery,William S. Burroughs: Recent Paintings. Vancou...
90,B9,1989,Clignett/Burroughs,Basel: Carzaniga + Ueker,"Clignett/Burroughs. Basel: Carzaniga + Ueker, ..."
91,B10,1989,William S. Burroughs. Introduction by Achille ...,Rome: Cleto Polcina Artemoderna,William S. Burroughs. Introduction by Achille ...


In [21]:
b_list[['work', 'year', 'title', 'publisher', 'description']].to_csv('B.csv')

### C. Contributions to periodicals

In [22]:
c_list = pd.DataFrame(df[df['work'].str.startswith('C')])

In [23]:
c_title_ptrn = '(.+[\)\]])\.\s*'

In [24]:
c_list['title'] = c_list['description'].str.extract(c_title_ptrn, expand=False)

In [25]:
c_list['year'] = c_list['description'].str.extract(year_ptrn, expand=False)

In [26]:
c_list[['work', 'year', 'title', 'description']].tail()

Unnamed: 0,work,year,title,description
775,C663,2004,"Weirdly Supernatural, No. 2 (2004)","Weirdly Supernatural, No. 2 (2004). 2 0 0 5"
776,C664,2005,"Purple Fashion, No. 4 (2005)","Purple Fashion, No. 4 (2005). Interzone 91 ..."
777,C665,2007,"Inkblot, [No.] 11 (2007). Includes a facsimil...","Inkblot, [No.] 11 (2007). Includes a facsimil..."
778,C666,2012,"Sensitive Skin, No. 8 (2012)","Sensitive Skin, No. 8 (2012). the many hour..."
779,C667,2014,"Beat Scene, No. 71a (Winter 2014). (by Gre...","Beat Scene, No. 71a (Winter 2014). (by Gre..."


In [27]:
c_list.to_csv('C.csv')

### D. Foreign translations

In [28]:
d_list = pd.DataFrame(df[df['work'].str.startswith('D')])

In [29]:
d_list.to_csv('D.csv')

### E. Sound recordings

In [30]:
e_list = pd.DataFrame(df[df['work'].str.startswith('E')])

In [31]:
e_list.to_csv('E.csv')

### F. Film and video recordings

In [32]:
f_list = pd.DataFrame(df[df['work'].str.startswith('F')])

In [33]:
f_list.to_csv('F.csv')

### G. Miscellaneous items

In [34]:
g_list = pd.DataFrame(df[df['work'].str.startswith('G')])

In [35]:
g_list.to_csv('G.csv')

### H. Biographies of, interviews with, and letters by Burroughs

In [36]:
h_list = pd.DataFrame(df[df['work'].str.startswith('H')])

In [37]:
h_list.to_csv('H.csv')

### I. Bibliographies and criticism

In [38]:
i_list = pd.DataFrame(df[df['work'].str.startswith('I')])

In [39]:
i_list.to_csv('I.csv')