## Load packages and set pandas option

In [1]:
import PyPDF2, re, pandas as pd, numpy as np
# Lift pandas' row-display limit so displayed frames are not truncated.
pd.options.display.max_rows = None

## Extract text from the Anything But Routine v4.0 PDF

In [2]:
filename = 'pdf/eScholarship UC item 0xj4d6bm.pdf'

# Read every page while the file is open; the context manager closes the
# handle afterwards, including when a page fails to parse.
# NOTE(review): PdfFileReader / numPages / getPage / extractText appear to be
# the legacy PyPDF2 names; confirm the installed PyPDF2 version still has them.
with open(filename, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num_pages = pdfReader.numPages
    page_texts = []
    for count in range(num_pages):
        page_texts.append(pdfReader.getPage(count).extractText())

# Concatenate the pages in order, with no separator, into one string.
text = "".join(page_texts)



## Split text by works

In [3]:
# Cut the text at every work identifier: a category letter A-I followed by
# digits, a period and whitespace (e.g. "A12. "). Because the identifier is a
# capture group, re.split keeps it, giving
# [preamble, id, body, id, body, ...].
# The class is written as a range; the previous "[A|B|...]" spelling also
# accepted a literal "|" as a category letter.
text2 = re.split(r'\s*([A-I]\d+)\.\s+', text)

In [4]:
# Skip the preamble (element 0), then delete newlines and trim each piece.
text3 = [chunk.replace('\n', '').strip() for chunk in text2[1:]]

In [5]:
# text3 alternates identifier, body, identifier, body, ...; pair each
# even-indexed identifier with the body that follows it. A repeated
# identifier keeps its last body.
d = {
    work_id: body.strip()
    for work_id, body in zip(text3[0::2], text3[1::2])
}

## Load text into dataframe

In [6]:
# One row per work: identifier in 'work', its text in 'description'.
df = pd.DataFrame({'work': [*d.keys()], 'description': [*d.values()]})

## Partition and enhance data by work category (A through I)

### A. Publications in book, broadside, and pamphlet form

In [7]:
# Rows whose identifier starts with 'A', as an independent frame so new
# columns can be added to it.
is_a_work = df['work'].str.startswith('A')
a_list_works = df.loc[is_a_work].copy()

In [8]:
# Title = the leading run of word characters, spaces and the listed
# punctuation, ending at the first one-to-three periods followed by a space.
# Rows that do not match get NaN. The pattern is a raw string so the regex
# escapes are not read as (invalid) Python string escapes.
a_list_works['workTitle'] = a_list_works['description'].str.extract(r'^([\w \:\?\,\;\&\!\[\]\-\<\>\=\/]+)[\.]{1,3} ', expand=False)
# Keep the full description available as free-text notes.
a_list_works['notes'] = a_list_works['description']

In [9]:
# Replace missing values with None so each record carries plain Python values.
works = a_list_works.where((a_list_works.notnull()), None).to_dict('records')
# Instance letters 'A' through 'Z'.
inst_chars = [ chr(i) for i in range(65, 65+26) ]


def _split_instances(description):
    """Split one work description into [instance letter, notes] pairs.

    The description is cut at every "<capital letter>. " marker. When the
    first marker is "A. ", the text is treated as a lettered list: markers
    are consumed in alphabetical order (A, B, C, ...) until the next letter
    is absent, and each instance's notes are everything between its marker
    and the next one. Other capital-letter markers inside an instance (for
    example a middle initial) are joined back into its notes. Text before
    the "A. " marker is not included in any instance.

    When the first marker is not "A. " (or there is no marker), the whole
    description is returned as a single instance labelled 'A'.
    """
    parts = re.split(r'([A-Z]\. )', description)
    if len(parts) < 2 or parts[1] != 'A. ':
        # Joining the split pieces reproduces the original description.
        return [['A', ''.join(parts)]]

    # Drop the text before "A. " so every segment starts with its own marker.
    # This also covers a work that has "A. " but no "B. ", which used to take
    # its label from the first character of the leading text.
    parts = parts[1:]
    segments = []
    for next_char in inst_chars[1:]:
        marker = next_char + '. '
        if marker not in parts:
            break
        cut = parts.index(marker)
        segments.append(parts[:cut])
        parts = parts[cut:]
    # Whatever is left belongs to the last instance found.
    segments.append(parts)
    # segment[0] is the marker (e.g. 'B. '); its first character is the label.
    return [[segment[0][0], ''.join(segment[1:])] for segment in segments]


# One record per (work, instance); the instance title repeats the work title.
instances = [
    {
        'work': work['work'],
        'workTitle': work['workTitle'],
        'instance': label,
        'instanceTitle': work['workTitle'],
        'notes': notes,
    }
    for work in works
    for label, notes in _split_instances(work['description'])
]
a_list_instances = pd.DataFrame(instances)

In [10]:
# Captures "<place>: <publisher>" (either part optionally in square brackets)
# when it is followed by a comma or period, a space, and one more character.
# Raw string, so the regex escapes are not read as Python string escapes.
# The trailing class is unchanged in meaning: "\0" is the NUL character, so
# [\0-9] is the range NUL..'9' (space and much ASCII punctuation as well as
# digits), not just digits.
# NOTE(review): [0-9] may have been intended; confirm against the data before
# narrowing it.
publisher_ptrn = r'(\[?[\s\w\-\,]+\]?: \[?[\s\w\-\.]+\]?)[\,\.] [\0-9]'
# First match per row; rows without a match get NaN.
a_list_instances['publication'] = a_list_instances['notes'].str.extract(publisher_ptrn, expand=False)

In [11]:
# First run of four digits in the notes, keeping an optional surrounding
# "[", "?" and "]" (e.g. "1953", "[1964]", "[1957?]"). Raw string, so the
# regex escapes are not read as Python string escapes.
date_ptrn = r'(\[?\d{4}\??\]?)'
# Rows without a four-digit run get NaN.
a_list_instances['date'] = a_list_instances['notes'].str.extract(date_ptrn, expand=False)

In [12]:
# Show the instance table with its columns in export order.
a_list_instances.loc[:, ['work', 'workTitle', 'instance', 'instanceTitle', 'date', 'publication', 'notes']]

Unnamed: 0,work,workTitle,instance,instanceTitle,date,publication,notes
0,A1,Junkie,A,Junkie,1953,New York: Ace Books,". New York: Ace Books, 1953. Softbound (no har..."
1,A1,Junkie,B,Junkie,[1957?],,Junkie: Confessions of an Unredeemed Drug Addi...
2,A1,Junkie,C,Junkie,[1964],,"Junkie. Foreword by Carl Solomon. (Ace Star, K..."
3,A1,Junkie,D,Junkie,1966,London: New English Library,. (The Olympia Press Travellers Companion Seri...
4,A1,Junkie,E,Junkie,1973,,. Foreword by Carl Solomon. Preface by Burroug...
5,A1,Junkie,F,Junkie,1977,New York: Penguin Books,Junky. With an introduction by Allen Ginsberg....
6,A1,Junkie,G,Junkie,1999,London: Penguin-UK,". London: Penguin-UK, 1999."
7,A1,Junkie,H,Junkie,2002,London: Penguin,". Introduction by Will Self. London: Penguin, ..."
8,A1,Junkie,I,Junkie,2003,New York: Penguin,Edited and with an Introduction by Oliver Har...
9,A1,Junkie,J,Junkie,2012,,. Edited and with an Introduction by Oliver Ha...


In [13]:
# Write the A instances to csv/A.csv in a fixed column order, without the index.
a_export_columns = ['work', 'workTitle', 'instance', 'instanceTitle', 'date', 'publication', 'notes']
a_export = a_list_instances[a_export_columns]
a_export.to_csv('csv/A.csv', index=False)

### B. Art books and catalogs of exhibitions (in progress)

In [14]:
# Rows whose identifier starts with 'B', as an independent frame.
is_b_work = df['work'].str.startswith('B')
b_list = df.loc[is_b_work].copy()

In [15]:
# Write the B rows to csv/B.csv without the index column.
b_csv_path = 'csv/B.csv'
b_list.to_csv(b_csv_path, index=False)

### C. Contributions to periodicals (in progress)

In [16]:
# Rows whose identifier starts with 'C', as an independent frame.
is_c_work = df['work'].str.startswith('C')
c_list = df.loc[is_c_work].copy()

In [17]:
# Write the C rows to csv/C.csv without the index column.
c_csv_path = 'csv/C.csv'
c_list.to_csv(c_csv_path, index=False)

### D. Foreign translations (in progress)

In [18]:
# Rows whose identifier starts with 'D', as an independent frame.
is_d_work = df['work'].str.startswith('D')
d_list = df.loc[is_d_work].copy()

In [19]:
# Write the D rows to csv/D.csv without the index column.
d_csv_path = 'csv/D.csv'
d_list.to_csv(d_csv_path, index=False)

### E. Sound recordings (in progress)

In [20]:
# Rows whose identifier starts with 'E', as an independent frame.
is_e_work = df['work'].str.startswith('E')
e_list = df.loc[is_e_work].copy()

In [21]:
# Write the E rows to csv/E.csv without the index column.
e_csv_path = 'csv/E.csv'
e_list.to_csv(e_csv_path, index=False)

### F. Film and video recordings (in progress)

In [22]:
# Rows whose identifier starts with 'F', as an independent frame.
is_f_work = df['work'].str.startswith('F')
f_list = df.loc[is_f_work].copy()

In [23]:
# Write the F rows to csv/F.csv without the index column.
f_csv_path = 'csv/F.csv'
f_list.to_csv(f_csv_path, index=False)

### G. Miscellaneous items (in progress)

In [24]:
# Rows whose identifier starts with 'G', as an independent frame.
is_g_work = df['work'].str.startswith('G')
g_list = df.loc[is_g_work].copy()

In [25]:
# Write the G rows to csv/G.csv without the index column.
g_csv_path = 'csv/G.csv'
g_list.to_csv(g_csv_path, index=False)

### H. Biographies of, interviews with, and letters by Burroughs (in progress)

In [26]:
# Rows whose identifier starts with 'H', as an independent frame.
is_h_work = df['work'].str.startswith('H')
h_list = df.loc[is_h_work].copy()

In [27]:
# Write the H rows to csv/H.csv without the index column.
h_csv_path = 'csv/H.csv'
h_list.to_csv(h_csv_path, index=False)

### I. Bibliographies and criticism (in progress)

In [28]:
# Rows whose identifier starts with 'I', as an independent frame.
is_i_work = df['work'].str.startswith('I')
i_list = df.loc[is_i_work].copy()

In [29]:
# Write the I rows to csv/I.csv without the index column.
i_csv_path = 'csv/I.csv'
i_list.to_csv(i_csv_path, index=False)