In [74]:
import csv
import glob
import pickle
import re
from collections import defaultdict
from itertools import chain

In [56]:
# glob returns matches in filesystem-dependent order; sorting keeps the order in
# which files are read (and everything derived from it) reproducible across runs.
SOURCE_FILES = sorted(glob.glob('source/*.csv'))

In [66]:
# Header row that read_csv requires as the first row of each source file,
# in exactly this order.
EXPECTED_HEADERS = [
    'url',
    'title',
    'authors',
    'abstract',
    'subject',
]

def parse_authors(authors_string):
    """Split an author string into a list of names.

    Names are separated by a comma followed by at least one whitespace
    character; each resulting piece has surrounding whitespace removed.

    Args:
        authors_string: The raw author field as a single string.

    Returns:
        A list of author name strings, in their original order.
    """
    pieces = re.split(r',\s+', authors_string)
    return list(map(str.strip, pieces))

def subject_code(subject):
    """Extract the code written in parentheses at the end of a subject label.

    Args:
        subject: A subject label that ends with a parenthesised code,
            i.e. a string shaped like ``"Some Name (some-code)"``.

    Returns:
        The text between the opening parenthesis found by the pattern and the
        closing parenthesis at the very end of the label.

    Raises:
        ValueError: If the label does not end with a parenthesised code.
    """
    m = re.search(r'\((.+)\)$', subject)

    if m is None:
        # The exception must be instantiated; raising a (class, message) tuple
        # is not a valid way to raise ValueError with a message.
        raise ValueError("Invalid subject format: `%s`" % (subject,))

    return m.group(1)

def parse_subjects(subject_string):
    """Convert a ``'; '``-separated list of subject labels into their codes.

    Args:
        subject_string: Subject labels joined by a semicolon and a single space.

    Returns:
        A list with the result of ``subject_code`` for each label, in order.
    """
    codes = []
    for label in subject_string.split('; '):
        codes.append(subject_code(label))
    return codes

def read_csv(filename):
    """Load one semicolon-delimited source file into a list of entry dicts.

    The first row must equal ``EXPECTED_HEADERS``. Every following non-blank
    row becomes a dict with the keys ``'title'``, ``'authors'``, ``'abstract'``
    and ``'subject'``; the leading ``url`` column is checked in the header but
    not kept in the result.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of dicts, one per data row, in file order. ``'authors'`` holds
        the result of ``parse_authors`` and ``'subject'`` the result of
        ``parse_subjects``. A file containing no rows yields an empty list.

    Raises:
        ValueError: If the first row is not ``EXPECTED_HEADERS``.
    """
    rows = []

    # newline='' leaves line-ending handling to the csv module, which is what
    # it needs to parse quoted fields that contain embedded newlines correctly.
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')

        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to validate, nothing to load.
            return rows

        # Explicit check instead of `assert`, so it still runs under `python -O`
        # and the error names the offending file.
        if header != EXPECTED_HEADERS:
            raise ValueError(
                "Unexpected headers in `%s`: %r (expected %r)"
                % (filename, header, EXPECTED_HEADERS)
            )

        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line; skip it
                # rather than failing on the column lookups below.
                continue

            rows.append({
                'title': row[1],
                'authors': parse_authors(row[2]),
                'abstract': row[3],
                'subject': parse_subjects(row[4]),
            })

    return rows

In [67]:
# Read every source file and flatten the per-file row lists into a single list.
all_entries = [entry for source in SOURCE_FILES for entry in read_csv(source)]
print("%d articles from %d files" % (len(all_entries), len(SOURCE_FILES)))

7742 articles from 15 files


In [68]:
# Index the entries by subject code: an entry that lists several codes is
# appended under each one of them.
by_subject = defaultdict(list)

for entry in all_entries:
    for subject in entry['subject']:
        # The same entry dict object is appended to every matching list; no copy is made.
        by_subject[subject].append(entry)

In [72]:
# Print the subject codes from most to fewest entries, with the code
# right-aligned in a 60-character column.
for k, entries in sorted(by_subject.items(), key=lambda item: len(item[1]), reverse=True):
    print("%60s: %d" % (k, len(entries)))

                                                    quant-ph: 817
                                                     math-ph: 692
                                                     math.AP: 686
                                                     nucl-th: 650
                                                     nucl-ex: 647
                                                     math.DG: 600
                                                 astro-ph.SR: 592
                                          cond-mat.stat-mech: 589
                                          cond-mat.quant-gas: 558
                                                 astro-ph.EP: 550
                                              physics.optics: 542
                                                     math.CO: 537
                                                     math.AG: 535
                                                       cs.AI: 525
                                                       cs.IT: 522
          

In [76]:
# Persist the subject -> entries mapping, using the highest pickle protocol
# the running interpreter supports.
with open('preprocessed_arxiv.pickle', 'wb') as f:
    pickle.dump(by_subject, f, protocol=pickle.HIGHEST_PROTOCOL)