diff --git a/Cargo.lock b/Cargo.lock
index 88aaeff7..ed763110 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -41,6 +41,12 @@ dependencies = [
"alloc-no-stdlib",
]
+[[package]]
+name = "allocator-api2"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
+
[[package]]
name = "anstream"
version = "0.6.20"
@@ -790,12 +796,6 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
-[[package]]
-name = "ego-tree"
-version = "0.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
-
[[package]]
name = "either"
version = "1.15.0"
@@ -958,6 +958,12 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
[[package]]
name = "foreign-types"
version = "0.3.2"
@@ -988,16 +994,6 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
-[[package]]
-name = "futf"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
-dependencies = [
- "mac",
- "new_debug_unreachable",
-]
-
[[package]]
name = "futures"
version = "0.3.31"
@@ -1100,15 +1096,6 @@ dependencies = [
"slab",
]
-[[package]]
-name = "getopts"
-version = "0.2.24"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
-dependencies = [
- "unicode-width",
-]
-
[[package]]
name = "getrandom"
version = "0.2.16"
@@ -1212,6 +1199,17 @@ version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash",
+]
+
[[package]]
name = "heck"
version = "0.5.0"
@@ -1277,16 +1275,6 @@ dependencies = [
"windows-sys 0.59.0",
]
-[[package]]
-name = "html5ever"
-version = "0.36.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6452c4751a24e1b99c3260d505eaeee76a050573e61f30ac2c924ddc7236f01e"
-dependencies = [
- "log",
- "markup5ever",
-]
-
[[package]]
name = "http"
version = "1.3.1"
@@ -1508,11 +1496,11 @@ dependencies = [
"hyper",
"hyper-util",
"log",
+ "lol_html",
"mime",
"reqwest",
"rustls",
"rustls-platform-verifier",
- "scraper",
"thiserror 2.0.16",
"tokio",
"url",
@@ -1574,7 +1562,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661"
dependencies = [
"equivalent",
- "hashbrown",
+ "hashbrown 0.15.5",
]
[[package]]
@@ -1748,27 +1736,29 @@ dependencies = [
]
[[package]]
-name = "lru-slab"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
-
-[[package]]
-name = "mac"
-version = "0.1.1"
+name = "lol_html"
+version = "2.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
+checksum = "5ff94cb6aef6ee52afd2c69331e9109906d855e82bd241f3110dfdf6185899ab"
+dependencies = [
+ "bitflags",
+ "cfg-if",
+ "cssparser",
+ "encoding_rs",
+ "foldhash",
+ "hashbrown 0.16.1",
+ "memchr",
+ "mime",
+ "precomputed-hash",
+ "selectors",
+ "thiserror 2.0.16",
+]
[[package]]
-name = "markup5ever"
-version = "0.36.1"
+name = "lru-slab"
+version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c3294c4d74d0742910f8c7b466f44dda9eb2d5742c1e430138df290a1e8451c"
-dependencies = [
- "log",
- "tendril",
- "web_atoms",
-]
+checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "memchr"
@@ -2640,21 +2630,6 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
-[[package]]
-name = "scraper"
-version = "0.25.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93cecd86d6259499c844440546d02f55f3e17bd286e529e48d1f9f67e92315cb"
-dependencies = [
- "cssparser",
- "ego-tree",
- "getopts",
- "html5ever",
- "precomputed-hash",
- "selectors",
- "tendril",
-]
-
[[package]]
name = "security-framework"
version = "3.5.1"
@@ -2799,31 +2774,6 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
-[[package]]
-name = "string_cache"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901"
-dependencies = [
- "new_debug_unreachable",
- "parking_lot",
- "phf_shared",
- "precomputed-hash",
- "serde",
-]
-
-[[package]]
-name = "string_cache_codegen"
-version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69"
-dependencies = [
- "phf_generator",
- "phf_shared",
- "proc-macro2",
- "quote",
-]
-
[[package]]
name = "strsim"
version = "0.11.1"
@@ -2894,17 +2844,6 @@ version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba"
-[[package]]
-name = "tendril"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
-dependencies = [
- "futf",
- "mac",
- "utf-8",
-]
-
[[package]]
name = "thiserror"
version = "1.0.69"
@@ -3152,12 +3091,6 @@ version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
-[[package]]
-name = "unicode-width"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c"
-
[[package]]
name = "untrusted"
version = "0.7.1"
@@ -3187,12 +3120,6 @@ version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
-[[package]]
-name = "utf-8"
-version = "0.7.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
-
[[package]]
name = "utf8_iter"
version = "1.0.4"
@@ -3361,18 +3288,6 @@ dependencies = [
"wasm-bindgen",
]
-[[package]]
-name = "web_atoms"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acd0c322f146d0f8aad130ce6c187953889359584497dac6561204c8e17bb43d"
-dependencies = [
- "phf",
- "phf_codegen",
- "string_cache",
- "string_cache_codegen",
-]
-
[[package]]
name = "webpki-root-certs"
version = "1.0.5"
diff --git a/impit-node/package.json b/impit-node/package.json
index a81480ae..8255bd30 100644
--- a/impit-node/package.json
+++ b/impit-node/package.json
@@ -54,13 +54,13 @@
"packageManager": "yarn@4.12.0",
"description": "Impit for JavaScript",
"optionalDependencies": {
- "impit-darwin-x64": "0.10.1",
"impit-darwin-arm64": "0.10.1",
- "impit-win32-x64-msvc": "0.10.1",
- "impit-win32-arm64-msvc": "0.10.1",
+ "impit-darwin-x64": "0.10.1",
+ "impit-linux-arm64-gnu": "0.10.1",
+ "impit-linux-arm64-musl": "0.10.1",
"impit-linux-x64-gnu": "0.10.1",
"impit-linux-x64-musl": "0.10.1",
- "impit-linux-arm64-gnu": "0.10.1",
- "impit-linux-arm64-musl": "0.10.1"
+ "impit-win32-arm64-msvc": "0.10.1",
+ "impit-win32-x64-msvc": "0.10.1"
}
-}
\ No newline at end of file
+}
diff --git a/impit-node/test/basics.test.ts b/impit-node/test/basics.test.ts
index 57e58ebe..fbcb65ae 100644
--- a/impit-node/test/basics.test.ts
+++ b/impit-node/test/basics.test.ts
@@ -480,13 +480,27 @@ describe.each([
t.expect(text).toContain('Herman Melville');
});
- test('.text() method works with decoding', async (t) => {
+ test('.text() decodes using Content-Type header charset param', async (t) => {
const response = await impit.fetch(new URL(routes.charset.path, "http://127.0.0.1:3001").href);
const text: string = await response.text();
t.expect(text).toContain(routes.charset.bodyString);
});
+ test('.text() decodes using prescan (meta charset)', async (t) => {
+ const response = await impit.fetch(new URL(routes.charsetMetaCharset.path, "http://127.0.0.1:3001").href);
+ const text: string = await response.text();
+
+ t.expect(text).toContain(routes.charsetMetaCharset.bodyString);
+ });
+
+ test('.text() decodes using prescan (meta http-equiv)', async (t) => {
+ const response = await impit.fetch(new URL(routes.charsetMetaHttpEquiv.path, "http://127.0.0.1:3001").href);
+ const text: string = await response.text();
+
+ t.expect(text).toContain(routes.charsetMetaHttpEquiv.bodyString);
+ });
+
test('.json() method works', async (t) => {
const response = await impit.fetch(getHttpBinUrl('/json'));
const json = await response.json();
diff --git a/impit-node/test/mock.server.ts b/impit-node/test/mock.server.ts
index 4a2fdbe0..9faa17e7 100644
--- a/impit-node/test/mock.server.ts
+++ b/impit-node/test/mock.server.ts
@@ -2,11 +2,23 @@ import express from 'express';
import { Server } from 'http';
import { Server as ProxyServer } from 'proxy-chain';
+// "Příliš žluťoučký kůň úpěl ďábelské ódy" encoded in Windows-1250
+const WIN1250_BODY = Buffer.from([0x50, 0xf8, 0xed, 0x6c, 0x69, 0x9a, 0x20, 0x9e, 0x6c, 0x75, 0x9d, 0x6f, 0x75, 0xe8, 0x6b, 0xfd, 0x20, 0x6b, 0xf9, 0xf2, 0x20, 0xfa, 0x70, 0xec, 0x6c, 0x20, 0xef, 0xe1, 0x62, 0x65, 0x6c, 0x73, 0x6b, 0xe9, 0x20, 0xf3, 0x64, 0x79]);
+const WIN1250_STRING = 'Příliš žluťoučký kůň úpěl ďábelské ódy';
+
export const routes = {
charset: {
path: '/charset',
- bodyBuffer: Buffer.from([0x50, 0xf8, 0xed, 0x6c, 0x69, 0x9a, 0x20, 0x9e, 0x6c, 0x75, 0x9d, 0x6f, 0x75, 0xe8, 0x6b, 0xfd, 0x20, 0x6b, 0xf9, 0xf2, 0x20, 0xfa, 0x70, 0xec, 0x6c, 0x20, 0xef, 0xe1, 0x62, 0x65, 0x6c, 0x73, 0x6b, 0xe9, 0x20, 0xf3, 0x64, 0x79]),
- bodyString: 'Příliš žluťoučký kůň úpěl ďábelské ódy'
+ bodyBuffer: WIN1250_BODY,
+ bodyString: WIN1250_STRING,
+ },
+ charsetMetaCharset: {
+ path: '/charset/meta-charset',
+ bodyString: WIN1250_STRING,
+ },
+ charsetMetaHttpEquiv: {
+ path: '/charset/meta-http-equiv',
+ bodyString: WIN1250_STRING,
},
}
@@ -18,6 +30,26 @@ export async function runServer(port: number): Promise {
res.send(routes.charset.bodyBuffer);
});
+ app.get(routes.charsetMetaCharset.path, (req, res) => {
+ const html = Buffer.concat([
+ Buffer.from('<!DOCTYPE html><html><head><meta charset="windows-1250"></head><body>'), // ASCII-only markup; the only encoding hint (header has no charset)
+ WIN1250_BODY,
+ Buffer.from('</body></html>'),
+ ]);
+ res.writeHead(200, { 'Content-Type': 'text/html' });
+ res.end(html);
+ });
+
+ app.get(routes.charsetMetaHttpEquiv.path, (req, res) => {
+ const html = Buffer.concat([
+ Buffer.from('<!DOCTYPE html><html><head><meta http-equiv="Content-Type" content="text/html; charset=windows-1250"></head><body>'), // ASCII-only markup; the only encoding hint (header has no charset)
+ WIN1250_BODY,
+ Buffer.from('</body></html>'),
+ ]);
+ res.writeHead(200, { 'Content-Type': 'text/html' });
+ res.end(html);
+ });
+
app.get('/socket', (req, res) => {
const socket = req.socket;
const clientAddress = socket.remoteAddress;
diff --git a/impit-node/yarn.lock b/impit-node/yarn.lock
index 2dc578b8..e1c3b0d0 100644
--- a/impit-node/yarn.lock
+++ b/impit-node/yarn.lock
@@ -2406,58 +2406,58 @@ __metadata:
languageName: node
linkType: hard
-"impit-darwin-arm64@npm:0.10.0":
- version: 0.10.0
- resolution: "impit-darwin-arm64@npm:0.10.0"
+"impit-darwin-arm64@npm:0.10.1":
+ version: 0.10.1
+ resolution: "impit-darwin-arm64@npm:0.10.1"
conditions: os=darwin & cpu=arm64
languageName: node
linkType: hard
-"impit-darwin-x64@npm:0.10.0":
- version: 0.10.0
- resolution: "impit-darwin-x64@npm:0.10.0"
+"impit-darwin-x64@npm:0.10.1":
+ version: 0.10.1
+ resolution: "impit-darwin-x64@npm:0.10.1"
conditions: os=darwin & cpu=x64
languageName: node
linkType: hard
-"impit-linux-arm64-gnu@npm:0.10.0":
- version: 0.10.0
- resolution: "impit-linux-arm64-gnu@npm:0.10.0"
+"impit-linux-arm64-gnu@npm:0.10.1":
+ version: 0.10.1
+ resolution: "impit-linux-arm64-gnu@npm:0.10.1"
conditions: os=linux & cpu=arm64 & libc=glibc
languageName: node
linkType: hard
-"impit-linux-arm64-musl@npm:0.10.0":
- version: 0.10.0
- resolution: "impit-linux-arm64-musl@npm:0.10.0"
+"impit-linux-arm64-musl@npm:0.10.1":
+ version: 0.10.1
+ resolution: "impit-linux-arm64-musl@npm:0.10.1"
conditions: os=linux & cpu=arm64 & libc=musl
languageName: node
linkType: hard
-"impit-linux-x64-gnu@npm:0.10.0":
- version: 0.10.0
- resolution: "impit-linux-x64-gnu@npm:0.10.0"
+"impit-linux-x64-gnu@npm:0.10.1":
+ version: 0.10.1
+ resolution: "impit-linux-x64-gnu@npm:0.10.1"
conditions: os=linux & cpu=x64 & libc=glibc
languageName: node
linkType: hard
-"impit-linux-x64-musl@npm:0.10.0":
- version: 0.10.0
- resolution: "impit-linux-x64-musl@npm:0.10.0"
+"impit-linux-x64-musl@npm:0.10.1":
+ version: 0.10.1
+ resolution: "impit-linux-x64-musl@npm:0.10.1"
conditions: os=linux & cpu=x64 & libc=musl
languageName: node
linkType: hard
-"impit-win32-arm64-msvc@npm:0.10.0":
- version: 0.10.0
- resolution: "impit-win32-arm64-msvc@npm:0.10.0"
+"impit-win32-arm64-msvc@npm:0.10.1":
+ version: 0.10.1
+ resolution: "impit-win32-arm64-msvc@npm:0.10.1"
conditions: os=win32 & cpu=arm64
languageName: node
linkType: hard
-"impit-win32-x64-msvc@npm:0.10.0":
- version: 0.10.0
- resolution: "impit-win32-x64-msvc@npm:0.10.0"
+"impit-win32-x64-msvc@npm:0.10.1":
+ version: 0.10.1
+ resolution: "impit-win32-x64-msvc@npm:0.10.1"
conditions: os=win32 & cpu=x64
languageName: node
linkType: hard
@@ -2470,14 +2470,14 @@ __metadata:
"@types/express": "npm:^5.0.0"
"@types/node": "npm:^24.0.0"
express: "npm:^5.0.0"
- impit-darwin-arm64: "npm:0.10.0"
- impit-darwin-x64: "npm:0.10.0"
- impit-linux-arm64-gnu: "npm:0.10.0"
- impit-linux-arm64-musl: "npm:0.10.0"
- impit-linux-x64-gnu: "npm:0.10.0"
- impit-linux-x64-musl: "npm:0.10.0"
- impit-win32-arm64-msvc: "npm:0.10.0"
- impit-win32-x64-msvc: "npm:0.10.0"
+ impit-darwin-arm64: "npm:0.10.1"
+ impit-darwin-x64: "npm:0.10.1"
+ impit-linux-arm64-gnu: "npm:0.10.1"
+ impit-linux-arm64-musl: "npm:0.10.1"
+ impit-linux-x64-gnu: "npm:0.10.1"
+ impit-linux-x64-musl: "npm:0.10.1"
+ impit-win32-arm64-msvc: "npm:0.10.1"
+ impit-win32-x64-msvc: "npm:0.10.1"
proxy-chain: "npm:^2.5.9"
socks-server-lib: "npm:^0.0.3"
tough-cookie: "npm:^6.0.0"
diff --git a/impit/Cargo.toml b/impit/Cargo.toml
index f2328e3c..e5cb51fc 100644
--- a/impit/Cargo.toml
+++ b/impit/Cargo.toml
@@ -13,7 +13,7 @@ log = "0.4.22"
mime = "0.3.17"
reqwest = { version="0.13.1", features = ["json", "gzip", "brotli", "zstd", "deflate", "http3", "cookies", "stream", "socks"] }
rustls = { version="0.23.36", features=["impit"] }
-scraper = "0.25.0"
+lol_html = "2.7.2"
thiserror = "2.0.12"
tokio = { version="1.40.0", features = ["full"] }
url = "2.5.2"
diff --git a/impit/src/response_parsing/mod.rs b/impit/src/response_parsing/mod.rs
index 60d9dcb4..9874e1de 100644
--- a/impit/src/response_parsing/mod.rs
+++ b/impit/src/response_parsing/mod.rs
@@ -25,7 +25,7 @@ fn bom_sniffing(bytes: &[u8]) -> Option {
None
}
-/// A lazy implementation of the BOM sniffing algorithm, using `scraper` to parse the HTML and extract the encoding.
+/// A lazy implementation of the prescan algorithm, using `lol_html` to parse the HTML and extract the encoding.
///
/// See more details at https://html.spec.whatwg.org/#prescan-a-byte-stream-to-determine-its-encoding
fn prescan_bytestream(bytes: &[u8]) -> Option {
@@ -38,34 +38,50 @@ fn prescan_bytestream(bytes: &[u8]) -> Option {
let ascii_body = encoding::all::ASCII
.decode(&bytes[0..limit], encoding::DecoderTrap::Replace)
.unwrap();
- let dom = scraper::Html::parse_document(&ascii_body);
- let meta = dom
- .select(&scraper::Selector::parse("meta[charset]").unwrap())
- .next();
-
- if let Some(meta) = meta {
- if let Some(charset) = meta.value().attr("charset") {
- return encoding::label::encoding_from_whatwg_label(charset);
- }
- }
-
- let meta = dom
- .select(&scraper::Selector::parse("meta[http-equiv=content-type]").unwrap())
- .next();
-
- if let Some(meta) = meta {
- if let Some(content) = meta.value().attr("content") {
- let content_type = ContentType::from(content);
-
- return match content_type {
- Ok(content_type) => content_type.into(),
- Err(_) => None,
- };
- }
- }
-
- None
+ let found = std::rc::Rc::new(std::cell::RefCell::new(None::));
+ let found_charset = std::rc::Rc::clone(&found);
+ let found_http_equiv = std::rc::Rc::clone(&found);
+
+ let mut rewriter = lol_html::HtmlRewriter::new(
+ lol_html::Settings {
+ element_content_handlers: vec![
+ lol_html::element!("meta[charset]", move |el| {
+ if found_charset.borrow().is_none() {
+ if let Some(charset) = el.get_attribute("charset") {
+ *found_charset.borrow_mut() =
+ encoding::label::encoding_from_whatwg_label(&charset);
+ }
+ }
+ Ok(())
+ }),
+ lol_html::element!("meta[http-equiv]", move |el| {
+ if found_http_equiv.borrow().is_none() {
+ let is_content_type = el
+ .get_attribute("http-equiv")
+ .map(|v| v.eq_ignore_ascii_case("content-type"))
+ .unwrap_or(false);
+ if is_content_type {
+ if let Some(content) = el.get_attribute("content") {
+ if let Ok(ct) = ContentType::from(&content) {
+ *found_http_equiv.borrow_mut() = ct.into();
+ }
+ }
+ }
+ }
+ Ok(())
+ }),
+ ],
+ ..lol_html::Settings::default()
+ },
+ |_: &[u8]| {},
+ );
+
+ rewriter.write(ascii_body.as_bytes()).ok()?;
+ rewriter.end().ok()?;
+
+ let result = *found.borrow();
+ result
}
/// Converts a vector of bytes to a [`String`] using the provided encoding.