diff --git a/src/error.rs b/src/error.rs index 20d5f2f..9775bba 100644 --- a/src/error.rs +++ b/src/error.rs @@ -19,9 +19,9 @@ impl std::error::Error for Error {} impl Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Error::IOError(e) => e.fmt(f), - Error::ParseError(e) => e.fmt(f), - Error::Utf8Error(e) => e.fmt(f), + Error::IOError(e) => write!(f, "IOError: {e}"), + Error::ParseError(e) => write!(f, "ParseError: {e}"), + Error::Utf8Error(e) => write!(f, "Utf8Error: {e}"), Error::Other(e) => e.fmt(f), } } diff --git a/src/lib.rs b/src/lib.rs index 03ea8c4..1bf77bb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,232 +33,44 @@ //! - `a` //! - `img` -use std::{cell::RefCell, io::Cursor}; +use std::io::Cursor; -use comrak::{ - nodes::{Ast, AstNode, LineColumn, ListType, NodeHeading, NodeLink, NodeList, NodeValue}, - Arena, ComrakOptions, -}; +use comrak::{nodes::AstNode, Arena, ComrakOptions}; use html5ever::{ tendril::StrTendril, - tokenizer::{ - BufferQueue, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, - }, - Attribute, + tokenizer::{BufferQueue, Tokenizer, TokenizerOpts}, }; mod attributes; mod error; +mod sink; -use attributes::AttributeList; pub use error::Error; +use sink::Sink; -struct Sink<'a> { - error: Option, - arena: &'a Arena>, - stack: Vec<&'a AstNode<'a>>, -} - -impl<'a> Sink<'a> { - #[inline] - fn cur(&mut self) -> Result<&mut &'a AstNode<'a>, Error> { - self.stack - .last_mut() - .ok_or_else(move || "stack is empty".into()) - } +/// Options for rendering +pub struct RenderOptions { + /// When disabled, ParseErrors will be ignored + pub strict: bool, } -impl<'a> Sink<'a> { - pub fn new(arena: &'a Arena>) -> Self { - let root = arena.alloc(AstNode::new(RefCell::new(Ast::new( - NodeValue::Document, - LineColumn { line: 0, column: 0 }, - )))); - - Self { - error: None, - arena, - stack: vec![root], - } +impl Default for RenderOptions { + fn default() -> Self { + Self { strict: true } } } -#[inline] -fn node<'a>(value: NodeValue, line: usize) -> AstNode<'a> { - AstNode::new(RefCell::new(Ast::new( - value, - LineColumn { line, column: 0 }, - ))) -} - -#[inline] -fn heading<'a>(level: u8, line: usize) -> AstNode<'a> { - node( - NodeValue::Heading(NodeHeading { - level, - setext: false, - }), - line, - ) -} - -#[inline] -fn create_node<'a>( +/// Parse an HTML document into a [comrak](https://crates.io/crates/comrak) AST +pub fn parse_document<'a>( arena: &'a Arena>, - name: &str, - attrs: &[Attribute], - line: usize, -) -> Option<&'a AstNode<'a>> { - Some(match name { - "a" => arena.alloc(node( - NodeValue::Link(NodeLink { - url: attrs.get_or_default("href"), - title: attrs.get_or_default("title"), - }), - 1, - )), - "h1" => arena.alloc(heading(1, line)), - "h2" => arena.alloc(heading(2, line)), - "h3" => arena.alloc(heading(3, line)), - "h4" => arena.alloc(heading(4, line)), - "h5" => arena.alloc(heading(5, line)), - "h6" => arena.alloc(heading(6, line)), - "p" => arena.alloc(node(NodeValue::Paragraph, line)), - "ul" => arena.alloc(node( - NodeValue::List(NodeList { - list_type: ListType::Bullet, - bullet_char: b'-', - ..NodeList::default() - }), - line, - )), - "ol" => arena.alloc(node( - NodeValue::List(NodeList { - list_type: ListType::Ordered, - start: 1, - ..NodeList::default() - }), - line, - )), - "li" => arena.alloc(node(NodeValue::Item(NodeList::default()), line)), - "b" | "strong" => arena.alloc(node(NodeValue::Strong, line)), - "i" | "em" => arena.alloc(node(NodeValue::Emph, line)), - "img" => { - let image = arena.alloc(node( - NodeValue::Image(NodeLink { - url: attrs.get_or_default("src"), - title: attrs.get_or_default("title"), - }), - line, - )); - - if let Some(alt) = AttributeList::get(&attrs, "alt") { - let text_node = arena.alloc(node(NodeValue::Text(alt.to_string()), line)); - image.append(text_node); - } - - image - } - _ => return None, - }) -} - -#[inline] -fn is_self_closing(name: &str) -> bool { - matches!( - name, - "area" - | "base" - | "br" - | "col" - | "embed" - | "hr" - | "img" - | "input" - | "keygen" - | "link" - | "meta" - | "param" - | "source" - | "track" - | "wbr" - ) -} - -#[inline] -fn valid_elem(name: &str) -> bool { - matches!( - name, - "h1" | "h2" - | "h3" - | "h4" - | "h5" - | "h6" - | "p" - | "ul" - | "li" - | "ol" - | "a" - | "b" - | "strong" - | "i" - | "em" - | "img" - ) -} - -impl<'a> TokenSink for Sink<'a> { - type Handle = (); - - fn process_token(&mut self, token: Token, line: u64) -> TokenSinkResult { - let f = || { - match token { - Token::DoctypeToken(_) => {} - Token::TagToken(tag) => match tag.kind { - TagKind::StartTag => { - if let Some(node) = - create_node(self.arena, &tag.name, &tag.attrs, line as usize) - { - if tag.self_closing || is_self_closing(&tag.name) { - let parent = self.cur()?; - parent.append(node); - } else { - self.stack.push(node); - } - } - } - TagKind::EndTag if !valid_elem(&tag.name) || is_self_closing(&tag.name) => {} - TagKind::EndTag => { - let node = self.stack.pop().unwrap(); - let parent = self.cur()?; - - parent.append(node); - } - }, - Token::CommentToken(_) => {} - Token::CharacterTokens(s) => { - self.cur()? - .append(self.arena.alloc(node(NodeValue::Text(s.to_string()), 0))); - } - Token::NullCharacterToken => {} - Token::EOFToken => {} - Token::ParseError(err) => return Err(Error::ParseError(err)), - } - - Ok::<_, Error>(()) - }; - - if let Err(e) = f() { - self.error = Some(e); - self.end(); - } - - TokenSinkResult::Continue - } + input: String, +) -> Result<&'a AstNode<'a>, Error> { + parse_document_with_options(RenderOptions::default(), arena, input) } /// Parse an HTML document into a [comrak](https://crates.io/crates/comrak) AST -pub fn parse_document<'a>( +pub fn parse_document_with_options<'a>( + opts: RenderOptions, arena: &'a Arena>, input: String, ) -> Result<&'a AstNode<'a>, Error> { @@ -266,24 +78,22 @@ pub fn parse_document<'a>( let mut input = BufferQueue::new(); input.push_back(chunk.try_reinterpret().map_err(|_| "buffer invalid")?); - let mut tok = Tokenizer::new(Sink::new(arena), TokenizerOpts::default()); + let mut tok = Tokenizer::new(Sink::new(arena, opts.strict), TokenizerOpts::default()); let _ = tok.feed(&mut input); tok.end(); - let root = tok.sink.stack.pop().ok_or("stack is empty")?; - assert!(tok.sink.stack.is_empty()); - - if let Some(e) = tok.sink.error { - return Err(e); - } - - Ok(root) + tok.sink.finish() } /// Convert a HTML document into markdown (CommonMark) pub fn render(input: String) -> Result { + render_with_options(RenderOptions::default(), input) +} + +/// Convert a HTML document into markdown (CommonMark) with options +pub fn render_with_options(opts: RenderOptions, input: String) -> Result { let arena = Arena::new(); - let root = parse_document(&arena, input)?; + let root = parse_document_with_options(opts, &arena, input)?; let mut writer = Cursor::new(vec![]); comrak::format_commonmark(root, &ComrakOptions::default(), &mut writer)?; @@ -295,8 +105,8 @@ pub fn render(input: String) -> Result { mod tests { use super::*; - fn assert_render(input: impl Into, expected: impl AsRef) { - let output = render(input.into()).unwrap(); + fn assert_render(opts: RenderOptions, input: impl Into, expected: impl AsRef) { + let output = render_with_options(opts, input.into()).unwrap(); assert_eq!(output.as_str(), expected.as_ref()); } @@ -306,6 +116,7 @@ mod tests { for level in levels { assert_render( + RenderOptions::default(), format!("hello world"), format!("{} hello world\n", "#".repeat(level)), ); @@ -314,17 +125,26 @@ mod tests { #[test] fn test_paragraph() { - assert_render("

hello world

", "hello world\n"); + assert_render( + RenderOptions::default(), + "

hello world

", + "hello world\n", + ); } #[test] fn test_wrapped() { - assert_render("

hello world

", "hello world\n"); + assert_render( + RenderOptions::default(), + "

hello world

", + "hello world\n", + ); } #[test] fn test_unordered_list() { assert_render( + RenderOptions::default(), "
  • first item
  • second item
", "- first item\n- second item\n", ); @@ -333,6 +153,7 @@ mod tests { #[test] fn test_ordered_list() { assert_render( + RenderOptions::default(), "
  1. first item
  2. second item
", "1. first item\n2. second item\n", ); @@ -341,6 +162,7 @@ mod tests { #[test] fn test_link() { assert_render( + RenderOptions::default(), "example", "[example](https://example.com)\n", ); @@ -348,24 +170,46 @@ mod tests { #[test] fn test_strong() { - assert_render("hello world", "**hello world**\n"); - assert_render("hello world", "**hello world**\n"); + assert_render( + RenderOptions::default(), + "hello world", + "**hello world**\n", + ); + assert_render( + RenderOptions::default(), + "hello world", + "**hello world**\n", + ); } #[test] fn test_emphasis() { - assert_render("hello world", "*hello world*\n"); - assert_render("hello world", "*hello world*\n"); + assert_render( + RenderOptions::default(), + "hello world", + "*hello world*\n", + ); + assert_render( + RenderOptions::default(), + "hello world", + "*hello world*\n", + ); } #[test] fn test_img() { - assert_render("", "![](test.jpg)\n"); assert_render( + RenderOptions::default(), + "", + "![](test.jpg)\n", + ); + assert_render( + RenderOptions::default(), "", "![](test.jpg \"this is a test\")\n", ); assert_render( + RenderOptions::default(), "\"alt", "![alt test](test.jpg \"this is a test\")\n", ); @@ -373,7 +217,24 @@ mod tests { #[test] fn test_img_not_self_closing_issue() { - assert_render("", "![](test.jpg)\n"); - assert_render("", "![](test.jpg)\n"); + assert_render( + RenderOptions::default(), + "", + "![](test.jpg)\n", + ); + assert_render( + RenderOptions::default(), + "", + "![](test.jpg)\n", + ); + } + + #[test] + fn test_parse_invalid_html() { + assert_render( + RenderOptions { strict: false }, + "", + "![](test.jpg)\n", + ); } } diff --git a/src/sink.rs b/src/sink.rs new file mode 100644 index 0000000..7b33781 --- /dev/null +++ b/src/sink.rs @@ -0,0 +1,234 @@ +use std::cell::RefCell; + +use comrak::{ + nodes::{Ast, AstNode, LineColumn, ListType, NodeHeading, NodeLink, NodeList, NodeValue}, + Arena, +}; +use html5ever::{ + tokenizer::{TagKind, Token, TokenSink, TokenSinkResult}, + Attribute, +}; + +use crate::attributes::AttributeList; +use crate::error::Error; + +pub(crate) struct Sink<'a> { + strict: bool, + arena: &'a Arena>, + error: Option, + stack: Vec<&'a AstNode<'a>>, +} + +impl<'a> Sink<'a> { + #[inline] + fn cur(&mut self) -> Result<&mut &'a AstNode<'a>, Error> { + self.stack + .last_mut() + .ok_or_else(move || "stack is empty".into()) + } +} + +impl<'a> Sink<'a> { + pub fn new(arena: &'a Arena>, strict: bool) -> Self { + let root = arena.alloc(AstNode::new(RefCell::new(Ast::new( + NodeValue::Document, + LineColumn { line: 0, column: 0 }, + )))); + + Self { + error: None, + strict, + arena, + stack: vec![root], + } + } + + pub fn finish(mut self) -> Result<&'a AstNode<'a>, Error> { + if let Some(e) = self.error { + return Err(e); + } + + let root = self.stack.pop().ok_or("stack is empty")?; + assert!(self.stack.is_empty()); + + Ok(root) + } +} + +impl<'a> TokenSink for Sink<'a> { + type Handle = (); + + fn process_token(&mut self, token: Token, line: u64) -> TokenSinkResult { + let f = || { + match token { + Token::DoctypeToken(_) => {} + Token::TagToken(tag) => match tag.kind { + TagKind::StartTag => { + if let Some(node) = + create_node(self.arena, &tag.name, &tag.attrs, line as usize) + { + if tag.self_closing || is_self_closing(&tag.name) { + let parent = self.cur()?; + parent.append(node); + } else { + self.stack.push(node); + } + } + } + TagKind::EndTag if !valid_elem(&tag.name) || is_self_closing(&tag.name) => {} + TagKind::EndTag => { + let node = self.stack.pop().unwrap(); + let parent = self.cur()?; + + parent.append(node); + } + }, + Token::CommentToken(_) => {} + Token::CharacterTokens(s) => { + self.cur()? + .append(self.arena.alloc(node(NodeValue::Text(s.to_string()), 0))); + } + Token::NullCharacterToken => {} + Token::EOFToken => {} + Token::ParseError(err) => { + if self.strict { + return Err(Error::ParseError(err)); + } + } + } + + Ok::<_, Error>(()) + }; + + if let Err(e) = f() { + self.error = Some(e); + self.end(); + } + + TokenSinkResult::Continue + } +} + +#[inline] +fn node<'a>(value: NodeValue, line: usize) -> AstNode<'a> { + AstNode::new(RefCell::new(Ast::new( + value, + LineColumn { line, column: 0 }, + ))) +} + +#[inline] +fn heading<'a>(level: u8, line: usize) -> AstNode<'a> { + node( + NodeValue::Heading(NodeHeading { + level, + setext: false, + }), + line, + ) +} + +#[inline] +fn create_node<'a>( + arena: &'a Arena>, + name: &str, + attrs: &[Attribute], + line: usize, +) -> Option<&'a AstNode<'a>> { + Some(match name { + "a" => arena.alloc(node( + NodeValue::Link(NodeLink { + url: attrs.get_or_default("href"), + title: attrs.get_or_default("title"), + }), + 1, + )), + "h1" => arena.alloc(heading(1, line)), + "h2" => arena.alloc(heading(2, line)), + "h3" => arena.alloc(heading(3, line)), + "h4" => arena.alloc(heading(4, line)), + "h5" => arena.alloc(heading(5, line)), + "h6" => arena.alloc(heading(6, line)), + "p" => arena.alloc(node(NodeValue::Paragraph, line)), + "ul" => arena.alloc(node( + NodeValue::List(NodeList { + list_type: ListType::Bullet, + bullet_char: b'-', + ..NodeList::default() + }), + line, + )), + "ol" => arena.alloc(node( + NodeValue::List(NodeList { + list_type: ListType::Ordered, + start: 1, + ..NodeList::default() + }), + line, + )), + "li" => arena.alloc(node(NodeValue::Item(NodeList::default()), line)), + "b" | "strong" => arena.alloc(node(NodeValue::Strong, line)), + "i" | "em" => arena.alloc(node(NodeValue::Emph, line)), + "img" => { + let image = arena.alloc(node( + NodeValue::Image(NodeLink { + url: attrs.get_or_default("src"), + title: attrs.get_or_default("title"), + }), + line, + )); + + if let Some(alt) = AttributeList::get(&attrs, "alt") { + let text_node = arena.alloc(node(NodeValue::Text(alt.to_string()), line)); + image.append(text_node); + } + + image + } + _ => return None, + }) +} + +#[inline] +fn is_self_closing(name: &str) -> bool { + matches!( + name, + "area" + | "base" + | "br" + | "col" + | "embed" + | "hr" + | "img" + | "input" + | "keygen" + | "link" + | "meta" + | "param" + | "source" + | "track" + | "wbr" + ) +} + +#[inline] +fn valid_elem(name: &str) -> bool { + matches!( + name, + "h1" | "h2" + | "h3" + | "h4" + | "h5" + | "h6" + | "p" + | "ul" + | "li" + | "ol" + | "a" + | "b" + | "strong" + | "i" + | "em" + | "img" + ) +}