Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

exceptions with bad parsing #9

Closed
mitechie opened this issue Aug 23, 2012 · 3 comments
Closed

exceptions with bad parsing #9

mitechie opened this issue Aug 23, 2012 · 3 comments

Comments

@mitechie
Copy link
Member

Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 504, in run
self.__target(*self.__args, **self.__kwargs)
File "scripts/readability/existing.py", line 65, in fetch_content
read = ReadUrl.parse(url)
File "/home/rharding/src/bookie/bookie/lib/readable.py", line 171, in parse
if not document.readable:
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 426, in readable
return tounicode(self._readable)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 431, in _readable
if self.candidates:
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 419, in candidates
doc = self.doc
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 409, in doc
doc = self.orig.html
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 93, in html
return self._parse(self.orig_html)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 80, in _parse
doc = build_doc(html)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 54, in build_doc
page_unicode = page.decode(enc, 'replace')
TypeError: decode() argument 1 must be string, not None

@mitechie
Copy link
Member Author

File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 431, in _readable
if self.candidates:
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 419, in candidates
doc = self.doc
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 409, in doc
doc = self.orig.html
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 93, in html
return self._parse(self.orig_html)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 80, in _parse
doc = build_doc(html)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 57, in build_doc
parser=utf8_parser)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/lxml/html/__init__.py", line 532, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2743, in lxml.etree.fromstring (src/lxml/lxml.etree.c:52665)
File "parser.pxi", line 1573, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:79932)
File "parser.pxi", line 1452, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:78774)
File "parser.pxi", line 960, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:75389)
File "parser.pxi", line 564, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:71739)
File "parser.pxi", line 645, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:72614)
File "parser.pxi", line 594, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:72087)
XMLSyntaxError: line 563: Tag figure invalid

@mitechie
Copy link
Member Author

[D 120826 11:32:46 existing:67] Q0 getting content for 4c3edf3a8229cd http://www.dafont.com/

Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python2.6/threading.py", line 532, in __bootstrap_inner
self.run()
File "/usr/lib/python2.6/threading.py", line 484, in run
self.__target(*self.__args, **self.__kwargs)
File "scripts/readability/existing.py", line 68, in fetch_content
read = ReadUrl.parse(url)
File "/home/bmark.us/0.5/bookie/lib/readable.py", line 176, in parse
if not document.readable:
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 426, in readable
return tounicode(self._readable)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 431, in _readable
if self.candidates:
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 419, in candidates
doc = self.doc
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 409, in doc
doc = self.orig.html
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 95, in html
return self._parse(self.orig_html)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 82, in _parse
doc = build_doc(html)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 59, in build_doc
parser=utf8_parser)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/lxml/html/__init__.py", line 534, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2743, in lxml.etree.fromstring (src/lxml/lxml.etree.c:52665)
File "parser.pxi", line 1573, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:79932)
File "parser.pxi", line 1452, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:78774)
File "parser.pxi", line 960, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:75389)
File "parser.pxi", line 564, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:71739)
File "parser.pxi", line 645, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:72614)
File "parser.pxi", line 596, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:72123)
XMLSyntaxError: None

@mitechie
Copy link
Member Author

[D 120827 11:13:41 existing:67] Q0 getting content for 4c3edf3a8229cd http://www.dafont.com/
[D 120827 11:13:41 existing:67] Q1 getting content for 9feafedb1e468b http://www.redbullmusicacademy.com/
[D 120827 11:13:41 existing:67] Q2 getting content for 1ca8c1b6cb8e08 http://cameratoss.blogspot.com/
Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python2.6/threading.py", line 532, in __bootstrap_inner
self.run()
File "/usr/lib/python2.6/threading.py", line 484, in run
self.__target(*self.__args, **self.__kwargs)
File "scripts/readability/existing.py", line 68, in fetch_content
read = ReadUrl.parse(url)
File "/home/bmark.us/0.5/bookie/lib/readable.py", line 176, in parse
if not document.readable:
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 426, in readable
return tounicode(self._readable)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 431, in _readable
if self.candidates:
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 419, in candidates
doc = self.doc
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 409, in doc
doc = self.orig.html
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 95, in html
return self._parse(self.orig_html)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 82, in _parse
doc = build_doc(html)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 59, in build_doc
parser=utf8_parser)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/lxml/html/__init__.py", line 534, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2743, in lxml.etree.fromstring (src/lxml/lxml.etree.c:52665)
File "parser.pxi", line 1573, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:79932)
File "parser.pxi", line 1452, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:78774)
File "parser.pxi", line 960, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:75389)
File "parser.pxi", line 564, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:71739)
File "parser.pxi", line 645, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:72614)
File "parser.pxi", line 594, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:72087)
XMLSyntaxError: line 1887: htmlParseEntityRef: expecting ';'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant