/
parse.js
134 lines (116 loc) · 5.48 KB
/
parse.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
const {Cc, Ci} = require("chrome");
var request = require("request");
/**
 * Parse a soup of HTML from a string by loading it into a hidden,
 * locked-down iframe inside the most recent browser window. Code adapted
 * from /mozilla/toolkit/components/places/src/nsMicrosummaryService.js#1696
 *
 * @param {string} html - HTML text to parse.
 * @param {nsIURI} uri - URI set on the channel that feeds the text in.
 * @param {function} callback - Called with the iframe's contentDocument once
 *     DOMContentLoaded fires.
 * @returns {boolean|undefined} false when no "navigator:browser" window is
 *     available (the callback is then never called); otherwise nothing.
 */
function loadHTMLText(html, uri, callback) {
    var windowMediator = Cc['@mozilla.org/appshell/window-mediator;1'].
        getService(Ci.nsIWindowMediator);
    var window = windowMediator.getMostRecentWindow("navigator:browser");
    if (!window)
        return false;

    var document = window.document;
    var rootElement = document.documentElement;

    // Create an iframe, make it hidden, and secure it against untrusted
    // content.
    var iframe = document.createElement('iframe');
    iframe.setAttribute("collapsed", true);
    iframe.setAttribute("type", "content");

    // Insert the iframe into the window, creating the doc shell.
    // NOTE(review): the iframe is never detached again, so every call leaves
    // one hidden iframe in the chrome window. It is not removed here because
    // callers may still be using the document after the callback returns;
    // confirm and add cleanup where the document's lifetime is known.
    rootElement.appendChild(iframe);

    // When we insert the iframe into the window, it immediately starts
    // loading about:blank, which we don't need and could even hurt us (for
    // example by triggering bugs like bug 344305), so cancel that load.
    var webNav = iframe.docShell.QueryInterface(Ci.nsIWebNavigation);
    webNav.stop(Ci.nsIWebNavigation.STOP_NETWORK);

    // Turn off JavaScript and auth dialogs for security and other things
    // to reduce network load.
    // XXX We should also turn off CSS.
    iframe.docShell.allowJavascript = false;
    iframe.docShell.allowAuth = false;
    iframe.docShell.allowPlugins = false;
    iframe.docShell.allowMetaRedirects = false;
    iframe.docShell.allowSubframes = false;
    iframe.docShell.allowImages = false;
    iframe.docShell.allowDNSPrefetch = false;

    // Convert the HTML text into an input stream.
    var converter = Cc["@mozilla.org/intl/scriptableunicodeconverter"].
        createInstance(Ci.nsIScriptableUnicodeConverter);
    converter.charset = "UTF-8";
    var stream = converter.convertToInputStream(html);

    // Set up a channel to load the input stream.
    var channel = Cc["@mozilla.org/network/input-stream-channel;1"].
        createInstance(Ci.nsIInputStreamChannel);
    channel.setURI(uri);
    channel.contentStream = stream;

    // Load in the background so we don't trigger web progress listeners.
    // (Named channelRequest so it does not shadow the module-level
    // `request` import.)
    var channelRequest = channel.QueryInterface(Ci.nsIRequest);
    channelRequest.loadFlags |= Ci.nsIRequest.LOAD_BACKGROUND;

    // Specify the content type since we're not loading content from a
    // server, so it won't get specified for us, and if we don't specify it
    // ourselves, then Firefox will prompt the user to download content of
    // "unknown type".
    var baseChannel = channel.QueryInterface(Ci.nsIChannel);
    baseChannel.contentType = "text/html";

    // Load as UTF-8, which it'll always be, because XMLHttpRequest converts
    // the text (i.e. XMLHTTPRequest.responseText) from its original charset
    // to UTF-16, then the string input stream component converts it to UTF-8.
    baseChannel.contentCharset = "UTF-8";

    var parseHandler = {
        handleEvent: function _handleEvent(event) {
            // Unregister from the same target and with the same capture
            // flag used at registration; a mismatch on either leaves the
            // listener attached.
            iframe.removeEventListener("DOMContentLoaded", parseHandler, true);
            callback(iframe.contentDocument);
        }
    };

    // Register the parse handler as a load event listener and start the
    // load. Listen for "DOMContentLoaded" instead of "load" because
    // background loads don't fire "load" events.
    iframe.addEventListener("DOMContentLoaded", parseHandler, true);
    var uriLoader = Cc["@mozilla.org/uriloader;1"].getService(Ci.nsIURILoader);
    uriLoader.openURI(channel, true, iframe.docShell);
}
exports.loadHTML = function(url, values, callback) {
request.Request({
url: url,
content: values,
onComplete: function(response) {
// Only way to parse HTML is by loading it in a hidden iframe
// See https://developer.mozilla.org/en/Parsing_HTML_From_Chrome
var ios = Cc["@mozilla.org/network/io-service;1"].
getService(Ci.nsIIOService);
loadHTMLText(response.text, ios.newURI(url, null, null), callback);
}
}).post();
};
exports.parseBookmarks = function(document) {
var top = document.getElementsByTagName("dl")[0];
// Parsing this can be tricky. <dt> contains the actual bookmark itself
// but sometimes it is followed by a <dd> which is a description for it.
// If we encounter a <dt> but no <dd> immediately following it, it means
// the user did not enter any 'notes' for the bookmark. Also note that
// the HTML is not well-formed (no closing tags) so we are relying on
// Gecko's tolerance for malformed HTML. Yay!
var bmks = [];
var nodes = top.childNodes;
for (var i = 0; i < nodes.length; i++) {
var cur = nodes[i];
if (cur.nodeName == "DT") {
var link = cur.getElementsByTagName('a')[0];
bmks.push({
'href': link.getAttribute('HREF'),
'date': link.getAttribute('ADD_DATE'),
'name': link.innerHTML
});
if (link.hasAttribute('TAGS')) {
bmks[bmks.length-1]['tags'] =
link.getAttribute('TAGS');
}
if (link.hasAttribute('LAST_MODIFIED')) {
bmks[bmks.length-1]['modified'] =
link.getAttribute('LAST_MODIFIED');
}
} else if (cur.nodeName == "DD") {
if (bmks[bmks.length-1])
bmks[bmks.length-1]['desc'] = cur.innerHTML;
}
}
return bmks;
};