In [56]:
from docx import Document
import glob, csv

In [21]:
# Open a single submission form; this object is what get_fields() is tried on below.
document = Document('REStud papers/23034 Draca/draca_RESTUD Manuscript Submission Form_2017_11_19.docx')

In [96]:
def get_text_fields(doc):
    """Return the text of every ``w:t`` element in ``doc`` as a flat list.

    Args:
        doc: an object exposing ``element.xpath`` (here, a python-docx
            ``Document``).

    Returns:
        list[str]: one entry per ``w:t`` element, in the order ``xpath``
        yields them, with leading/trailing spaces and colons removed.
        Elements without any text contribute an empty string.
    """
    # ``text`` is None for an element that has no content; coerce it to ''
    # so one empty element does not abort the whole extraction.
    return [(node.text or '').strip(' :') for node in doc.element.xpath('//w:t')]

def get_value(doc, key):
    """Return the text cell that immediately follows the label ``key``.

    ``key`` is normalised the same way ``get_text_fields`` normalises cells
    (surrounding spaces and colons removed), so ``'Number'`` and
    ``'Number: '`` locate the same label.

    Args:
        doc: document object accepted by ``get_text_fields``.
        key: label text to search for.

    Returns:
        str: the cell after the first occurrence of the label, or ``''``
        when the label is absent or is the very last cell.
    """
    cells = get_text_fields(doc)
    # Normalise once and use the same value for both the search and the
    # lookup; cells never carry surrounding ' :' so a raw key would miss.
    label = key.strip(' :')
    try:
        position = cells.index(label)
    except ValueError:
        return ''
    # A label at the very end of the document has no value after it.
    if position + 1 >= len(cells):
        return ''
    return cells[position + 1]

def get_between(doc, start, end):
    """Return the cells found between the labels ``start`` and ``end``.

    Both labels are normalised like the cells themselves (surrounding spaces
    and colons removed). Only the first occurrence of ``start`` is used, and
    ``end`` is searched for strictly after it.

    Args:
        doc: document object accepted by ``get_text_fields``.
        start: label that precedes the wanted cells.
        end: label that follows the wanted cells.

    Returns:
        str: the cells between the two labels joined with ``'\\n'``.
        ``''`` when ``start`` is absent. When ``end`` does not occur after
        ``start``, only the single cell following ``start`` is returned
        (``''`` if there is none).
    """
    cells = get_text_fields(doc)
    start_label = start.strip(' :')
    end_label = end.strip(' :')
    try:
        first = cells.index(start_label) + 1
    except ValueError:
        return ''
    try:
        # Search from ``first`` so an earlier occurrence of the end label
        # (before the start label) is ignored.
        stop = cells.index(end_label, first)
    except ValueError:
        # No closing label: fall back to the one cell after ``start``,
        # guarding against ``start`` being the last cell.
        return cells[first] if first < len(cells) else ''
    return '\n'.join(cells[first:stop])

# Extraction recipe for each output column: the extractor function followed
# by the label argument(s) it is called with after the document.
FIELDS = {
    'ms_number': (get_value, 'Number'),
    'title': (get_between, 'Article Title', 'Manuscript'),
    'author': (get_value, 'Last Name'),
    'email': (get_value, 'E-mail Address'),
    'editor': (get_value, 'Accepting Editor'),
    'accepted_date': (get_value, 'Accepted Date'),
}

def get_fields(doc, fields):
    """Apply every extraction recipe in ``fields`` to ``doc``.

    Args:
        doc: document object handed to each extractor as first argument.
        fields: mapping of output name -> sequence whose first item is the
            extractor callable and whose remaining items are its extra
            positional arguments.

    Returns:
        dict: output name -> value returned by that name's extractor.
    """
    extracted = {}
    for name in fields:
        extractor, extra_args = fields[name][0], fields[name][1:]
        extracted[name] = extractor(doc, *extra_args)
    return extracted

In [97]:
# Run every FIELDS recipe against the single form loaded earlier and display the result.
get_fields(document, FIELDS)

{'ms_number': '23034',
 'title': 'The Changing Returns to Crime: Do Criminals Respond to Prices?',
 'author': 'Draca',
 'email': 'm.draca@warwick.ac.uk',
 'editor': 'Jerome Adda',
 'accepted_date': '2017-06-20'}

In [98]:
# Every .docx inside a folder named '<five characters> <anything>'.
# glob yields matches in arbitrary order, so sort them to keep the progress
# numbering and the CSV row order reproducible between runs.
forms = sorted(glob.glob('REStud papers/????? */*.docx'))

In [99]:
# Number of paths matched by the glob pattern.
len(forms)

124

In [104]:
# newline='' is what the csv module requires of the file object; without it,
# fields containing line breaks (get_between joins cells with '\n') can be
# written incorrectly on some platforms.  An explicit encoding keeps
# non-ASCII names from depending on the machine's locale.
# The handle is kept in csv_file so it can be flushed and closed with
# csv_file.close() once the last row has been written.
csv_file = open('manuscripts.csv', 'w', newline='', encoding='utf-8')
writer = csv.DictWriter(csv_file, fieldnames=list(FIELDS.keys()))
writer.writeheader()

In [105]:
# Extract the fields of every form and append them to the CSV, printing a
# running counter so the form that breaks the run is easy to spot.
for i, form in enumerate(forms, start=1):
    print(i, form)
    try:
        writer.writerow(get_fields(Document(form), FIELDS))
    except Exception:
        # Name the offending file, then re-raise the original error with its
        # traceback instead of parsing the document a second time just to
        # reproduce it.  Catching Exception (not a bare except) leaves
        # KeyboardInterrupt free to stop the loop.
        print('{} failed.'.format(form))
        raise


1 REStud papers/16267 Ripoll/RESTUD Manuscript Submission Form[1].docx
2 REStud papers/16414 Xu/RESTUD Manuscript Submission Form.docx
3 REStud papers/17444 Li/RESTUD Manuscript Submission Form.docx
4 REStud papers/17762 Satchi/RESTUD Manuscript Submission Form.docx
5 REStud papers/17847 Di Giorgi/RESTUD Manuscript Submission Form.docx
6 REStud papers/18075 Rosenzweig/RESTUD Manuscript Submission Form.docx
7 REStud papers/18080 Duffy/RESTUD Manuscript Submission Form.docx
8 REStud papers/18332 Lessem/RESTUD Manuscript Submission Form.docx
9 REStud papers/18592 MeyerTerVehn/RESTUD Manuscript Submission Form.docx
10 REStud papers/18706 Duranton/RESTUD Manuscript Submission Form.docx
11 REStud papers/18771 Malenko/RESTUD Manuscript Submission Form.docx
12 REStud papers/18931 Stavrakeva/RESTUD Manuscript Submission Form.docx
13 REStud papers/19013 Kim/RESTUD Manuscript Submission Form.docx
14 REStud papers/19019 Laczo/MS19019 RESTUD Manuscript Submission Form.docx
15 REStud papers/19047 Ll

In [102]:
# Display the extracted fields for one specific form (17444 Li).
get_fields(Document('REStud papers/17444 Li/RESTUD Manuscript Submission Form.docx'), FIELDS)

{'ms_number': '17444',
 'title': 'Growth Through Inter-\nsectoral\nKnowledge Linkages',
 'author': 'Li',
 'email': 'Nanli1@gmail.com',
 'editor': 'Francesco Caselli',
 'accepted_date': ''}

In [103]:
# Display the raw list of text cells for the same form, i.e. what get_value/get_between search through.
get_text_fields(Document('REStud papers/17444 Li/RESTUD Manuscript Submission Form.docx'))

['Manuscript',
 'Submission',
 'Form',
 'The template belo',
 'w is to be used when submitting files for Accepted',
 'Restud',
 'articles',
 'to O',
 'xford',
 'U',
 'niversity',
 'P',
 'ress',
 '. You',
 'do not',
 'need fill in all categories if they are not relevant but please provide as much key information for the typesetting team as possible.',
 'MANUSCRIPT INFORMATION',
 'Article Title',
 'Growth Through Inter-',
 'sectoral',
 'Knowledge Linkages',
 'Manuscript',
 'Number',
 '17444',
 'All Authors',
 'and Affiliations',
 '',
 '[list the names of',
 'all of',
 'the authors',
 'and their affiliations',
 'as they are to appear in the paper here]',
 'Jie Cai, Shanghai University of Finance and Economics',
 'Nan Li, International Monetary Fund',
 'Original Submission Date',
 '2012-11-05',
 'Editorial Decision',
 'Date',
 '2018-09-12',
 'Date submitted to OUP',
 '2018-10-09',
 'Total no of figures',
 '',
 '8',
 'Total no of tables',
 '7',
 'Accepting Editor',
 'Francesco Caselli',
 'C