Permalink
Browse files

Removed the leading <p> and trailing </p> tags from the cleaned text

  • Loading branch information...
aih committed Feb 22, 2012
1 parent 97e5514 commit 073f40d95cc00e3eb1514360c20bbb828f5187e9
Showing with 4 additions and 12 deletions.
  1. +4 −2 fileupload/utils/fileconvert.py
  2. +0 −10 utils/pdf2html.py
  3. BIN utils/pdf2html.pyc
@@ -1,6 +1,6 @@
import os
from lxml.html.clean import clean_html
-import subprocess
+import subprocess, re
def convertpdf2html(pdffilepath):
subprocess.call(['pdftotext', '-layout', pdffilepath])
@@ -9,10 +9,12 @@ def convertpdf2html(pdffilepath):
txt = filename.read()
#txt = unicode(txt, "utf-8", errors = 'ignore')
txt = clean_html(txt)
+ txt = re.sub('^<p>','',txt)
+ txt = re.sub('</p>$','',txt)
with open(txtfilepath, 'wb') as filename:
filename.write(txt)
- p = subprocess.Popen(['txt2html', '--explicitheadings', 'indentparbreak', 'make_anchors', '--tables', '-8', '--xhtml', txtfilepath ], bufsize=-1, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr= subprocess.PIPE, close_fds = True)
+ p = subprocess.Popen(['txt2html', '--explicit_headings', 'indent_par_break', 'make_anchors', '--tables', '-8', '--xhtml', txtfilepath ], bufsize=-1, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr= subprocess.PIPE, close_fds = True)
htmltext, stderr_text = p.communicate()
return htmltext
View
@@ -1,10 +0,0 @@
-import os
-import subprocess
-
-def convert(pdffilepath):
- subprocess.call(['pdftotext', '-layout', pdffilepath])
- p = subprocess.Popen(['txt2html', '-tables', '--xhtml', pdffilepath.rstrip('.pdf')+'.txt'], bufsize=-1, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr= subprocess.PIPE, close_fds = True)
- htmltext, stderr_text = p.communicate()
- return htmltext
-
-#TODO: Create hash for url, store html with hash ID, return url to user, display html at url
View
Binary file not shown.

0 comments on commit 073f40d

Please sign in to comment.