Skip to content

Commit

Permalink
Fixing unittests
Browse files Browse the repository at this point in the history
  • Loading branch information
andresriancho committed Jan 5, 2015
1 parent 87f0976 commit b0b544e
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 6 deletions.
19 changes: 14 additions & 5 deletions w3af/core/data/parsers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@
"""
import StringIO

from pdfminer.converter import TextConverter, HTMLConverter
from pdfminer.converter import HTMLConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFSyntaxError

from w3af.core.data.parsers.baseparser import BaseParser
from w3af.core.data.url.HTTPResponse import ANY_TAG_MATCH
from w3af.core.data.parsers.utils.re_extract import ReExtract


Expand Down Expand Up @@ -112,8 +112,10 @@ def pdf_to_text(pdf_string):

# According to https://github.com/euske/pdfminer/issues/61 it is a good idea
# to set laparams to None, which will speed up parsing
device = HTMLConverter(rsrcmgr, output, codec='utf-8', layoutmode='normal',
laparams=None, imagewriter=None)
device = NoPageHTMLConverter(rsrcmgr, output, codec='utf-8',
layoutmode='normal',
laparams=None, imagewriter=None,
showpageno=False)

document_io = StringIO.StringIO(pdf_string)
pagenos = set()
Expand All @@ -128,4 +130,11 @@ def pdf_to_text(pdf_string):

device.close()
output.seek(0)
return output.read().decode('utf-8')
output_str = output.read().decode('utf-8')
return ANY_TAG_MATCH.sub('', output_str)


class NoPageHTMLConverter(HTMLConverter):
    """
    HTMLConverter whose footer consists only of the closing HTML tags.

    NOTE(review): presumably the stock pdfminer footer also writes page
    number information, which would end up in the extracted text --
    confirm against pdfminer's HTMLConverter.write_footer.
    """

    def write_footer(self):
        """Write the closing ``</body></html>`` tags and nothing else."""
        self.write('</body></html>\n')
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_extract_pdf(self):
pdf_inst = pdf()

words = pdf_inst._get_pdf_content(file(fname).read())

EXPECTED_RESULT = ['Testing,', 'testing,', '123.', 'Text', 'in',
'page', 'number', 'two.']
self.assertEqual(EXPECTED_RESULT, words)

0 comments on commit b0b544e

Please sign in to comment.