Skip to content
This repository has been archived by the owner on Dec 11, 2021. It is now read-only.

Commit

Permalink
Simplify pdfminer by upgrading to pdfminer.six.
Browse files: browse the repository at this point in the history
  • Loading branch information
Andrew Ferrier committed Aug 19, 2020
1 parent 835c636 commit bfe3e4b
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 17 deletions.
2 changes: 1 addition & 1 deletion requirements_hacking.txt
@@ -1,6 +1,6 @@
flake8
freezegun
nose
pdfminer3k
pdfminer.six
reportlab
requests
20 changes: 4 additions & 16 deletions tests/BaseTestClasses.py
Expand Up @@ -8,10 +8,6 @@
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formatdate
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.pdftypes import PSException
from reportlab.pdfgen import canvas
from requests.exceptions import RequestException
from subprocess import Popen, PIPE
Expand All @@ -22,6 +18,7 @@
import inspect
import os
import os.path
import pdfminer.high_level
import requests
import shutil
import sys
Expand Down Expand Up @@ -370,18 +367,9 @@ def getMetadataField(self, pdf_filename, field_name):
return None

def getPDFText(self, filename):
try:
with io.StringIO() as retstr:
with open(filename, 'rb') as filehandle:
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
pagenos = set()
process_pdf(rsrcmgr, device, filehandle, pagenos, maxpages=0, password="", caching=True, check_extractable=True)
device.close()
string = retstr.getvalue()
return string
except PSException:
return None
text = pdfminer.high_level.extract_text(filename)
text = text.replace("\t", " ")
return text

def touch(self, fname):
open(fname, 'w').close()
Expand Down

0 comments on commit bfe3e4b

Please sign in to comment.