Browse files

Now parsing documents using HTML5, which can deal better with tag soup

  • Loading branch information...
1 parent 40ec3e8 commit 778e0d899a32ffb0da4d8766a13ccb2822a6ac17 @assaf committed Dec 28, 2010
Showing with 106 additions and 17 deletions.
  1. +6 −1 CHANGELOG.md
  2. +0 −9 TODO.md
  3. +1 −1 package.json
  4. +19 −3 spec/browser-spec.coffee
  5. +1 −1 spec/forms-spec.coffee
  6. +1 −0 spec/helpers.coffee
  7. +8 −2 src/zombie/history.coffee
  8. +70 −0 src/zombie/jsdom_patches.coffee
View
7 CHANGELOG.md
@@ -1,13 +1,18 @@
zombie.js changelog(1) -- Changelog
===================================
-## Version 0.7.4 2010-12-27
+## Version 0.7.4 2010-12-28
+
+Now parsing documents using HTML5, which can deal better with tag soup.
Added troubleshooting guide.
Fixed naming issue: browser.last_request is now lastRequest, same for
lastResponse and lastError.
+ 189 Tests
+ 2.3 sec to complete
+
## Version 0.7.3 2010-12-27
View
9 TODO.md
@@ -1,15 +1,6 @@
zombie.js todo(1) -- Wishlist
=============================
-* Use HTML5 parser (see https://github.com/aredridel/html5)
-
- HTML5 parser can deal with many more documents (e.g. missing html/body
- elements) than html-parser, and obviously new HTML5 elements.
-
- Unfortunately, it adds script elements to the DOM before adding their text
- content; JSDOM listens to the DOMNodeInsertedIntoDocument event, which is
- fired on empty script element.
-
* Navigation: Browser.open/close should work as a pair; look into supporting
window.open; fire unload event when navigating away from page.
View
2 package.json
@@ -23,7 +23,7 @@
"node": "*"
},
"dependencies": {
- "htmlparser": ">= 1.7.3",
+ "html5": ">= 0.2.5",
"jsdom": ">= 0.1.21"
},
"repository": {
View
22 spec/browser-spec.coffee
@@ -65,10 +65,16 @@ brains.get "/dead", (req, res)-> res.send """
</html>
"""
+brains.get "/soup", (req, res)-> res.send """
+ <h1>Tag soup</h1>
+ <p>One paragraph
+ <p>And another
+ """
+
brains.get "/script/write", (req, res)-> res.send """
<html>
<head>
- <script>document.write(unescape(\'%3Cscript src="/jquery.js"%3E%3C/script%3E\')</script>
+ <script>document.write(unescape(\'%3Cscript src="/jquery.js"%3E%3C/script%3E\'));</script>
</head>
<body>
<script>
@@ -100,7 +106,7 @@ vows.describe("Browser").addBatch(
"open page":
zombie.wants "http://localhost:3003/scripted"
"should create HTML document": (browser)-> assert.instanceOf browser.document, jsdom.dom.level3.html.HTMLDocument
- "should load document from server": (browser)-> assert.match browser.html(), /<body>Hello World<\/body>/
+ "should load document from server": (browser)-> assert.match browser.html(), /<body>Hello World/
"should load external scripts": (browser)->
assert.ok jQuery = browser.window.jQuery, "window.jQuery not available"
assert.typeOf jQuery.ajax, "function"
@@ -139,7 +145,6 @@ vows.describe("Browser").addBatch(
browser.wants "http://localhost:3003/"
"should fire done event": (browser)-> assert.ok browser.visit
-
"content selection":
zombie.wants "http://localhost:3003/living"
"query text":
@@ -170,12 +175,23 @@ vows.describe("Browser").addBatch(
"should change location": (browser)-> assert.equal browser.location, "http://localhost:3003/living#/"
"should process event": (browser)-> assert.equal browser.document.title, "Signed up"
+ "tag soup":
+ zombie.wants "http://localhost:3003/soup"
+ "should parse to complete HTML": (browser)->
+ assert.ok browser.querySelector("html head")
+ assert.equal browser.text("html body h1"), "Tag soup"
+ "should close tags": (browser)->
+ paras = browser.querySelectorAll("body p").toArray().map((e)-> e.textContent.trim())
+ assert.deepEqual paras, ["One paragraph", "And another"]
+
"adding script using document.write":
zombie.wants "http://localhost:3003/script/write"
"should run script": (browser)-> assert.equal browser.document.title, "Script document.write"
"adding script using appendChild":
zombie.wants "http://localhost:3003/script/append"
+ "should change html": (browser)->
+ console.log browser.html()
"should run script": (browser)-> assert.equal browser.document.title, "Script appendChild"
).export(module)
View
2 spec/forms-spec.coffee
@@ -24,7 +24,7 @@ brains.get "/form", (req, res)-> res.send """
</select>
</label>
<label>Scary <input name="scary" type="radio" value="yes" id="field-scary"></label>
- <label>Not scary <input name="scary" type="radio" value="no" id="field-notscary" checked></label>
+ <label>Not scary <input name="scary" type="radio" value="no" id="field-notscary" checked="checked"></label>
<select name="state" id="field-state">
<option>alive</option>
View
1 spec/helpers.coffee
@@ -76,6 +76,7 @@ zombie.wants = (url, context)->
zombie.Browser.prototype.wants = (url, callback)->
brains.ready =>
+ #@debug true
@visit url, (err, browser)=>
callback err, this if callback
return
View
10 src/zombie/history.coffee
@@ -1,5 +1,6 @@
# Window history and location.
jsdom = require("jsdom")
+html = jsdom.dom.level3.html
http = require("http")
URL = require("url")
qs = require("querystring")
@@ -21,6 +22,7 @@ class Entry
@title = options.title
@pop = !!options.pop
+
# ## window.history
#
# Represents window.history.
@@ -53,9 +55,10 @@ class History
# resources, etc) and associate it with current document. From this
# point on the browser sees a new document, client register event
# handler for DOMContentLoaded/error.
- aug = jsdom.browserAugmentation(jsdom.dom.level3.html)
+ aug = jsdom.browserAugmentation(html)
document = new aug.HTMLDocument(url: URL.format(url), deferClose: false)
jsdom.applyDocumentFeatures document
+ document.write = html.HTMLDocument.prototype._write
window.document = document
# Make the actual request: called again when dealing with a redirect.
@@ -101,6 +104,9 @@ class History
process.nextTick -> makeRequest redirect, "GET"
else
error = "Could not load document at #{URL.format(url)}, got #{response.statusCode}"
+ document.open()
+ document.write error
+ document.close()
# onerror is the only reliable way we have to notify the
# application.
if error
@@ -234,7 +240,7 @@ class Location
# ## document.location => Location
#
# document.location is same as window.location
-jsdom.dom.level3.core.HTMLDocument.prototype.__defineGetter__ "location", -> @parentWindow.location
+html.HTMLDocument.prototype.__defineGetter__ "location", -> @parentWindow.location
exports.use = (browser)->
View
70 src/zombie/jsdom_patches.coffee
@@ -3,6 +3,7 @@ core = require("jsdom").dom.level3.core
URL = require("url")
vm = process.binding("evals")
http = require("http")
+html5 = require("html5").HTML5
# Event Handling
@@ -84,3 +85,72 @@ core.resourceLoader.download = (url, callback)->
callback null, data
request.on "error", (error)-> callback error
request.end()
+
+
+# Scripts
+# -------
+
+# Here we deal with four JSDOM issues:
+# - JSDOM assumes a SCRIPT element has exactly one text node, but it may
+#   have more, and in the second case (below) it initially has none.
+# - HTML5 creates the SCRIPT element first, then adds the script
+# contents to the element. We handle that by catching the
+# DOMCharacterDataModified event.
+# - Scripts can be added using document.write, so we need to patch
+# document.write so it adds the script instead of erasing the
+# document.
+# - ResourceQueue checks whether this.data is something, if this.data is
+# an empty string it does nothing when check() is called, and so never
+# completes loading when there are empty scripts.
+
+# Wraps clazz.prototype[method] with after-advice. The wrapper calls the
+# original method with the caller's arguments, then calls advice with the
+# original's return value prepended to those arguments. The wrapper returns
+# the advice's result when it is truthy, otherwise the original's result.
+advise = (clazz, method, advice)->
+  target = clazz.prototype
+  original = target[method]
+  target[method] = (params...)->
+    result = original.apply(this, params)
+    advice.apply(this, [result].concat(params)) || result
+# Fire a DOMCharacterDataModified mutation event whenever text is appended
+# to a Text node, carrying the node's previous and new values in the event.
+#
+# advise passes the wrapped method's return value first, followed by the
+# original arguments, so the appended text is the second parameter.
+advise core.Text, "appendData", (ret, value)->
+  current = this.nodeValue
+  # The appended text is a suffix of the new value; slice it off rather
+  # than using replace(), which would strip the first occurrence instead.
+  previous = current.slice(0, Math.max(0, current.length - String(value).length))
+  ev = this.ownerDocument.createEvent("MutationEvents")
+  ev.initMutationEvent("DOMCharacterDataModified", true, false, this, previous, current, null, null)
+  this.dispatchEvent ev
+
+# Add support for DOMCharacterDataModified, so we can execute a script
+# when its text content is changed. Safari and Firefox support that.
+# The builder creates the SCRIPT element, evaluates its code whenever the
+# event delivers non-blank text, and patches the element's text property.
+core.Document.prototype._elementBuilders["script"] = (doc, s)->
+  script = new core.HTMLScriptElement(doc, s)
+  script.addEventListener "DOMCharacterDataModified", (event)->
+    code = event.newValue.trim()
+    if code.length > 0
+      loc = this.sourceLocation
+      filename = (loc && loc.file) || this.ownerDocument.URL
+      # Only append line/column when a source location was recorded;
+      # otherwise the name would end in ":undefined:undefined".
+      if loc
+        filename += ':' + loc.line + ':' + loc.col
+      filename += '<script>'
+      core.resourceLoader.enqueue(this, this._eval, filename)(null, code)
+  # Fix text property so it doesn't fail on empty contents
+  script.__defineGetter__ "text", ->
+    # Handle script with no child elements, but also force script
+    # content to never be empty (see bug in ResourceQueue)
+    (item.value for item in this.children).join("") + " "
+  return script
+
+# Implementation behind document.write, using the HTML5 parser.
+# - While the document is still loading and a parser exists, the markup is
+#   parsed as a fragment into the parser's innermost open element.
+# - Otherwise the document is emptied and the markup is parsed as a whole
+#   new document, and the parser is remembered for later writes.
+# Returns the markup it was given.
+core.HTMLDocument.prototype._write = (html)->
+  if @readyState == "loading" && @_parser
+    # During page loading, document.write appends to the current element
+    open = @_parser.tree.open_elements.last()
+    parser = new html5.Parser(document: this)
+    parser.parse_fragment(html, open)
+  else
+    # First write of a page load, or any write after loading completed:
+    # empty the document and parse from scratch. Iterate over a snapshot,
+    # since removing nodes while walking a collection that may be live
+    # would skip nodes.
+    @removeChild child for child in Array.prototype.slice.call(@children)
+    @_parser = new html5.Parser(document: this)
+    @_parser.parse(html)
+  html
+# document.writeln: writes the markup followed by a newline via document.write.
+core.HTMLDocument.prototype.writeln = (html)->
+  line = html + "\n"
+  @write line

0 comments on commit 778e0d8

Please sign in to comment.