Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Removed the leading <p> and trailing </p> from the cleaned text

  • Loading branch information...
commit 073f40d95cc00e3eb1514360c20bbb828f5187e9 1 parent 97e5514
Ari Hershowitz authored
6 fileupload/utils/fileconvert.py
... ... @@ -1,6 +1,6 @@
1 1 import os
2 2 from lxml.html.clean import clean_html
3   -import subprocess
  3 +import subprocess, re
4 4
5 5 def convertpdf2html(pdffilepath):
6 6 subprocess.call(['pdftotext', '-layout', pdffilepath])
@@ -9,10 +9,12 @@ def convertpdf2html(pdffilepath):
9 9 txt = filename.read()
10 10 #txt = unicode(txt, "utf-8", errors = 'ignore')
11 11 txt = clean_html(txt)
  12 + txt = re.sub('^<p>','',txt)
  13 + txt = re.sub('</p>$','',txt)
12 14
13 15 with open(txtfilepath, 'wb') as filename:
14 16 filename.write(txt)
15   - p = subprocess.Popen(['txt2html', '--explicitheadings', 'indentparbreak', 'make_anchors', '--tables', '-8', '--xhtml', txtfilepath ], bufsize=-1, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr= subprocess.PIPE, close_fds = True)
  17 + p = subprocess.Popen(['txt2html', '--explicit_headings', 'indent_par_break', 'make_anchors', '--tables', '-8', '--xhtml', txtfilepath ], bufsize=-1, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr= subprocess.PIPE, close_fds = True)
16 18 htmltext, stderr_text = p.communicate()
17 19 return htmltext
18 20
10 utils/pdf2html.py
... ... @@ -1,10 +0,0 @@
1   -import os
2   -import subprocess
3   -
4   -def convert(pdffilepath):
5   - subprocess.call(['pdftotext', '-layout', pdffilepath])
6   - p = subprocess.Popen(['txt2html', '-tables', '--xhtml', pdffilepath.rstrip('.pdf')+'.txt'], bufsize=-1, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr= subprocess.PIPE, close_fds = True)
7   - htmltext, stderr_text = p.communicate()
8   - return htmltext
9   -
10   -#TODO: Create hash for url, store html with hash ID, return url to user, display html at url
BIN  utils/pdf2html.pyc
Binary file not shown

0 comments on commit 073f40d

Please sign in to comment.
Something went wrong with that request. Please try again.