import sys
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
"""Some statistics on the German educational system.
Extracted from PDF files from the DAAD website.
"""
#PDF text extractor
def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file using pdfminer.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The contents of the in-memory buffer that the ``TextConverter``
        wrote to while all pages were processed (the converter is
        configured with the ``utf-8`` codec).

    Raises:
        IOError/OSError: If ``path`` cannot be opened for reading.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0      # 0 is falsy, so get_pages applies no page limit
    caching = True
    pagenos = set()   # empty set: do not restrict to specific page numbers

    try:
        # The context manager guarantees the PDF handle is released even
        # when pdfminer raises while parsing a page.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before it is closed in the ``finally`` clause.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
# Germany-wide totals, in output order.  Each figure is used as an anchor:
# the token printed immediately before it in the cleaned-up PDF text is
# taken as the corresponding value for the federal state.
_NATIONAL_TOTALS = (
    ('inhabitants', '80523800'),
    ('students', '2399409'),
    ('freshmen', '495088'),
    ('degrees', '413338'),
    ('employees', '353690'),
    ('professors', '43862'),
    ('foreign_students', '204644'),
    ('foreign_freshmen', '79537'),
    ('foreign_degrees', '30806'),
    ('foreign_employees', '35345'),
    ('foreign_professors', '2778'),
)

# Placeholder emitted when an anchor cannot be found in the text.
_MISSING = 'NA'


def _token_before(anchor, text):
    """Return the token that directly precedes *anchor* in *text*.

    The token is a run of word characters, dots and hyphens separated from
    the anchor by whitespace.  ``_MISSING`` is returned when there is no
    such occurrence, instead of failing on a ``None`` match object.
    """
    match = re.search(r'([\w.-]+)\s+' + re.escape(anchor), text)
    return match.group(1) if match else _MISSING


def _normalise_numbers(text):
    """Reduce *text* to plain integer figures that can be matched reliably."""
    # remove letters
    text = re.sub(r'[a-zA-Z]', r'', text)
    # remove percent symbols
    text = re.sub(r'%', r'', text)
    # Drop figures written with a decimal comma altogether (presumably the
    # percentage columns), then strip the '.' thousands separators so that
    # "1.234.567" becomes "1234567".
    text = re.sub(r'[0-9]+,[0-9]+', r'', text)
    return re.sub(r'\.', r'', text)


def main():
    """Summarise the statistics found in the PDF files named on the command line.

    For every file the name of the federal state and eleven quantities are
    extracted.  Each record is printed to stdout as one space-separated
    line and also appended as a line to ``summary.txt`` (which is
    overwritten on every run).

    Raises:
        SystemExit: With exit status 1 when no file names are given.
    """
    args = sys.argv[1:]
    if not args:
        print('usage: python file [file ...]')
        sys.exit(1)

    # ``with`` makes sure the summary is flushed and closed even if one of
    # the PDF files cannot be processed.
    with open('summary.txt', 'w') as summary:
        # For each filename get the stats
        for filename in args:
            # grab text from PDF file
            text_org = convert_pdf_to_txt(filename)

            # grab name of Federal State from PDF header
            state = _token_before('Deutschland', text_org)
            print(state)

            # extract the quantities
            text = _normalise_numbers(text_org)
            values = [_token_before(total, text)
                      for _name, total in _NATIONAL_TOTALS]

            # write to stdout and file
            line = ' '.join([state] + values)
            print(line)
            summary.write(line + '\n')
if __name__ == '__main__':