Skip to content

Commit

Permalink
Merge pull request #21 from ririsoft/web-types
Browse files Browse the repository at this point in the history
Web types
  • Loading branch information
paolobarbolini committed Jul 24, 2020
2 parents 22069a1 + 8e2d43e commit c43a7bd
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 2 deletions.
13 changes: 11 additions & 2 deletions src/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pub enum MatcherType {
DOC,
FONT,
IMAGE,
TEXT,
VIDEO,
CUSTOM,
}
Expand All @@ -26,7 +27,7 @@ macro_rules! matcher_map {
};
}

// Order: Application, Image, Video, Audio, Font, Document, Archive.
// Order: Application, Image, Video, Audio, Font, Document, Archive, Text.
// The above order should be preserved when adding new types since
// it may affect the match result and/or performance.
matcher_map!(
Expand Down Expand Up @@ -450,5 +451,13 @@ matcher_map!(
"application/zstd",
"zst",
matchers::archive::is_zst
)
),
// Text
(
MatcherType::TEXT,
"text/html",
"html",
matchers::text::is_html
),
(MatcherType::TEXT, "text/xml", "xml", matchers::text::is_xml)
);
1 change: 1 addition & 0 deletions src/matchers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ pub mod audio;
pub mod doc;
pub mod font;
pub mod image;
pub mod text;
pub mod video;
91 changes: 91 additions & 0 deletions src/matchers/text.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/// Returns whether a buffer is html data.
///
/// Follows the HTML signature table of the
/// [whatwg](https://mimesniff.spec.whatwg.org/) MIME sniffing
/// specification: leading whitespace bytes are skipped, then the buffer
/// must start with either
///
/// * the comment opener `<!--` (no further byte is required), or
/// * one of the known tag signatures, compared ASCII case-insensitively
///   and immediately followed by a tag-terminating byte
///   (`0x20` space or `0x3E` `>`).
pub fn is_html(buf: &[u8]) -> bool {
    // Signatures that only count when followed by a tag-terminating byte.
    const TAG_SIGNATURES: &[&[u8]] = &[
        b"<!DOCTYPE HTML",
        b"<HTML",
        b"<HEAD",
        b"<SCRIPT",
        b"<IFRAME",
        b"<H1",
        b"<DIV",
        b"<FONT",
        b"<TABLE",
        b"<A",
        b"<STYLE",
        b"<TITLE",
        b"<B",
        b"<BODY",
        b"<BR",
        b"<P",
    ];
    // The specification lists the comment opener without a tag-terminating
    // byte, so `<!--comment-->` and even a bare `<!--` are html.
    const COMMENT_OPENER: &[u8] = b"<!--";

    let buf = trim_start_whitespaces(buf);

    if buf.starts_with(COMMENT_OPENER) {
        return true;
    }

    TAG_SIGNATURES.iter().any(|signature| {
        starts_with_ignore_ascii_case(buf, signature)
            && match buf.get(signature.len()) {
                // tag-terminating byte
                Some(&0x20) | Some(&0x3E) => true,
                // Either a different byte or the buffer ends right after
                // the signature: not a match for this entry.
                _ => false,
            }
    })
}

/// Returns whether a buffer is xml data.
///
/// Based on the [whatwg](https://mimesniff.spec.whatwg.org/)
/// specification: after skipping leading whitespace bytes, the buffer
/// has to begin with the `<?xml` declaration opener. The comparison is
/// ASCII case-insensitive.
pub fn is_xml(buf: &[u8]) -> bool {
    const XML_DECLARATION: &[u8] = b"<?xml";
    starts_with_ignore_ascii_case(trim_start_whitespaces(buf), XML_DECLARATION)
}

/// Returns `buf` without its leading whitespace bytes.
///
/// A whitespace byte is one of `0x09` (TAB), `0x0A` (LF), `0x0C` (FF),
/// `0x0D` (CR) or `0x20` (SP), as defined by
/// https://mimesniff.spec.whatwg.org. A buffer made only of such bytes
/// yields an empty slice.
fn trim_start_whitespaces(buf: &[u8]) -> &[u8] {
    // Index of the first byte that must be kept; the buffer length when
    // every byte is whitespace (or the buffer is empty).
    let first_kept = buf
        .iter()
        .position(|byte| match *byte {
            0x09 | 0x0A | 0x0C | 0x0D | 0x20 => false,
            _ => true,
        })
        .unwrap_or(buf.len());

    &buf[first_kept..]
}

/// Returns whether `buf` begins with `needle`, comparing bytes without
/// regard to ASCII case.
///
/// A `buf` shorter than `needle` never matches; an empty `needle`
/// always matches.
fn starts_with_ignore_ascii_case(buf: &[u8], needle: &[u8]) -> bool {
    match buf.get(..needle.len()) {
        Some(prefix) => prefix.eq_ignore_ascii_case(needle),
        // `buf` is shorter than `needle`.
        None => false,
    }
}

#[cfg(test)]
mod tests {
    use super::{is_html, trim_start_whitespaces};

    /// Leading whitespace bytes are removed; other input is returned as is.
    #[test]
    fn trim_whitespaces() {
        // TAB, LF, FF, CR and SP ahead of the payload.
        let trimmed = trim_start_whitespaces(b"\t\n\x0C\r ABC");
        assert_eq!(trimmed, b"ABC");

        let untouched = trim_start_whitespaces(b"abc");
        assert_eq!(untouched, b"abc");

        let empty = trim_start_whitespaces(b"");
        assert!(empty.is_empty());
    }

    /// A tag signature only counts when a tag-terminating byte follows it.
    #[test]
    fn html() {
        assert!(!is_html(b"<"));
        assert!(!is_html(b"<HTML"));
        assert!(is_html(b"<HTML "));
        assert!(is_html(b" <BODY>"));
    }
}
13 changes: 13 additions & 0 deletions testdata/sample.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!doctype html>
<html lang="en">

<head>
<meta charset="utf-8">
<title>Hello World</title>
</head>

<body>
<p>Hello World!</p>
</body>

</html>
5 changes: 5 additions & 0 deletions testdata/sample.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<note>
<to>World</to>
<body>Hello !</body>
</note>
20 changes: 20 additions & 0 deletions tests/text.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use infer::{MatcherType, Type};
mod common;

// HTML detection test(s) driven by the `sample.html` fixture.
// Arguments, in order: matcher type, MIME type, file extension, two
// identifiers (`test_html`, `test_html_embed`) and the fixture file name.
// NOTE(review): `test_format!` is defined outside this file (presumably in
// the `common` module); the two identifiers look like the names of the
// generated test functions — confirm against the macro definition.
test_format!(
MatcherType::TEXT,
"text/html",
"html",
test_html,
test_html_embed,
"sample.html"
);

// XML detection test(s) driven by the `sample.xml` fixture.
// Arguments, in order: matcher type, MIME type, file extension, two
// identifiers (`test_xml`, `test_xml_embed`) and the fixture file name.
// NOTE(review): `test_format!` is defined outside this file (presumably in
// the `common` module); the two identifiers look like the names of the
// generated test functions — confirm against the macro definition.
test_format!(
MatcherType::TEXT,
"text/xml",
"xml",
test_xml,
test_xml_embed,
"sample.xml"
);

0 comments on commit c43a7bd

Please sign in to comment.