diff --git a/src/html/WizHtmlConverter.cpp b/src/html/WizHtmlConverter.cpp index 9db81015..423c7941 100644 --- a/src/html/WizHtmlConverter.cpp +++ b/src/html/WizHtmlConverter.cpp @@ -3,6 +3,7 @@ #include #include #include +#include using namespace Utils::Gumbo; @@ -26,8 +27,9 @@ QString WizHtmlConverter::toMarkdown() return ""; convert_to_markdown(m_parser->output()->root); + flush(); - return QString::fromStdString(output); + return m_lines.join(""); } std::string escape_special_chars(const std::string& str) { @@ -72,7 +74,32 @@ std::string escape_special_chars(const std::string& str) { return oss.str(); } -void convert_structure(GumboNode* node, std::string& output) +std::string WizHtmlConverter::processTextNode(GumboNode* node) +{ + std::string puretext = escape_special_chars(node->v.text.text); + + // Collapse leading whitespace into one space + std::size_t start_pos = puretext.find_first_not_of(" \t\r\n"); + if (start_pos != std::string::npos && start_pos > 0) { + puretext.replace(0, start_pos, " "); + } + + // Collapse trailing whitespace into one space + std::size_t end_pos = puretext.find_last_not_of(" \t\r\n"); + if (end_pos != std::string::npos && end_pos < puretext.length() - 1) { + puretext.replace(end_pos + 1, puretext.length() - end_pos - 1, " "); + } + + // Replace consecutive whitespace with a single space + auto end = std::unique(puretext.begin(), puretext.end(), [](char a, char b) { + return std::isspace(a) && std::isspace(b); + }); + puretext.erase(end, puretext.end()); + + return puretext; +} + +void WizHtmlConverter::processBlockStructure(GumboNode* node) { if (node->type == GUMBO_NODE_TEXT || node->type == GUMBO_NODE_ELEMENT) { @@ -103,10 +130,7 @@ void convert_structure(GumboNode* node, std::string& output) case GUMBO_TAG_OL: case GUMBO_TAG_UL: { - if (node->parent->v.element.tag == GUMBO_TAG_BLOCKQUOTE) - output.append(">\n"); - else - output.append("\n"); + linebreak(); break; } default: @@ -114,40 +138,90 @@ void convert_structure(GumboNode* node, std::string& output) } } } - - if (node->parent->v.element.tag == GUMBO_TAG_BLOCKQUOTE) - output.append("> "); } - } } -// Recursive function to convert an HTML element into Markdown syntax -void WizHtmlConverter::convert_to_markdown(GumboNode* node) { - convert_structure(node, output); +void WizHtmlConverter::append(const std::string &str) +{ + m_current.append(str); +} - if (node->type == GUMBO_NODE_TEXT) { - std::string puretext = escape_special_chars(node->v.text.text); +void WizHtmlConverter::linebreak(size_t n) +{ + m_current.append(std::string(n, '\n')); - // Collapse leading whitespace into one space - std::size_t start_pos = puretext.find_first_not_of(" \t\r\n"); - if (start_pos != std::string::npos && start_pos > 0) { - puretext.replace(0, start_pos, " "); - } + if (!m_nestedBlock.isEmpty()) { + std::string prefix; + auto e = m_nestedBlock.constBegin(); + while (e != m_nestedBlock.constEnd()) { + switch (*e) { + case GUMBO_TAG_BLOCKQUOTE: + { + prefix.append("> "); + ++e; + break; + } + case GUMBO_TAG_OL: + case GUMBO_TAG_UL: + { + // Skip first list level + GumboTag last = *e; + while (++e != m_nestedBlock.constEnd() && + (*e == GUMBO_TAG_OL || *e == GUMBO_TAG_UL)) { + last = *e; + prefix.append("\t"); + } - // Collapse trailing whitespace into one space - std::size_t end_pos = puretext.find_last_not_of(" \t\r\n"); - if (end_pos != std::string::npos && end_pos < puretext.length() - 1) { - puretext.replace(end_pos + 1, puretext.length() - end_pos - 1, " "); + if (m_listElemPrefixed.last()) { + prefix.append(last == GUMBO_TAG_UL ? " " : + std::string(std::to_string(m_listElemNo.last()).size(), ' ') + " "); + } else { + prefix.append(last == GUMBO_TAG_UL ? "* " : + std::to_string(m_listElemNo.last()) + ". "); + m_listElemPrefixed.last() = true; + } + + break; + } + default: + ++e; + break; + } } - // Replace consecutive whitespace with a single space - auto end = std::unique(puretext.begin(), puretext.end(), [](char a, char b) { - return std::isspace(a) && std::isspace(b); - }); - puretext.erase(end, puretext.end()); + flush(prefix); + } else { + flush(); + } +} + +void WizHtmlConverter::flush() +{ + m_lines << QString::fromStdString(m_current); + m_current.clear(); +} + +void WizHtmlConverter::flush(const std::string &prefix) +{ + m_lines << QString::fromStdString(prefix + m_current); + m_current.clear(); +} + +void WizHtmlConverter::processChildren(GumboNode* node) +{ + GumboVector* children = &node->v.element.children; + for (unsigned int i = 0; i < children->length; ++i) { + convert_to_markdown(static_cast(children->data[i])); + } +} + +// Recursive function to convert an HTML element into Markdown syntax +void WizHtmlConverter::convert_to_markdown(GumboNode* node) { + processBlockStructure(node); - output.append(puretext); + if (node->type == GUMBO_NODE_TEXT) { + append(processTextNode(node)); } else if (node->type == GUMBO_NODE_ELEMENT) { // Markdown syntax: https://www.markdownguide.org/basic-syntax/ switch (node->v.element.tag) { @@ -159,29 +233,24 @@ void WizHtmlConverter::convert_to_markdown(GumboNode* node) { case GUMBO_TAG_H5: case GUMBO_TAG_H6: { - output.append(std::string(node->v.element.tag - GUMBO_TAG_H1 + 1, '#')); - output.append(" "); - GumboVector* heading_children = &node->v.element.children; - for (unsigned int i = 0; i < heading_children->length; ++i) { - convert_to_markdown(static_cast(heading_children->data[i])); - } - output.append("\n"); + append(std::string(node->v.element.tag - GUMBO_TAG_H1 + 1, '#')); + append(" "); + processChildren(node); + linebreak(); break; } // Paragraphs case GUMBO_TAG_P: case GUMBO_TAG_DIV: { - GumboVector* paragraph_children = &node->v.element.children; - for (unsigned int i = 0; i < paragraph_children->length; ++i) { - convert_to_markdown(static_cast(paragraph_children->data[i])); - } - output.append("\n"); + processChildren(node); + linebreak(); break; } case GUMBO_TAG_BR: { - output.append(" \n"); + append(" "); + linebreak(); break; } // Emphasis @@ -189,29 +258,40 @@ void WizHtmlConverter::convert_to_markdown(GumboNode* node) { case GUMBO_TAG_I: case GUMBO_TAG_CITE: { - output.append("*"); - GumboVector* emphasis_children = &node->v.element.children; - for (unsigned int i = 0; i < emphasis_children->length; ++i) { - convert_to_markdown(static_cast(emphasis_children->data[i])); - } - output.append("*"); + append("*"); + processChildren(node); + append("*"); break; } case GUMBO_TAG_STRONG: case GUMBO_TAG_B: { - output.append("**"); - GumboVector* strong_children = &node->v.element.children; - for (unsigned int i = 0; i < strong_children->length; ++i) { - convert_to_markdown(static_cast(strong_children->data[i])); - } - output.append("**"); + append("**"); + processChildren(node); + append("**"); + break; + } + case GUMBO_TAG_U: + case GUMBO_TAG_INS: + { + auto name = std::string(gumbo_normalized_tagname(node->v.element.tag)); + append("<" + name + ">"); + processChildren(node); + append(""); + break; + } + case GUMBO_TAG_DEL: + { + append("~~"); + processChildren(node); + append("~~"); break; } // Blockquotes case GUMBO_TAG_BLOCKQUOTE: { m_blockquoteLevel++; + m_nestedBlock.push(GUMBO_TAG_BLOCKQUOTE); if (node->v.element.children.length > 0) { // Convert the contents of the blockquote to Markdown recursively. for (unsigned int i = 0; i < node->v.element.children.length; ++i) { @@ -220,6 +300,8 @@ void WizHtmlConverter::convert_to_markdown(GumboNode* node) { } } m_blockquoteLevel--; + auto tag = m_nestedBlock.pop(); + Q_ASSERT(tag == GUMBO_TAG_BLOCKQUOTE); break; } // Lists @@ -228,27 +310,28 @@ void WizHtmlConverter::convert_to_markdown(GumboNode* node) { { // Enter new list block m_listLevel++; + bool ordered = node->v.element.tag == GUMBO_TAG_OL; + m_nestedBlock.push(ordered ? GUMBO_TAG_OL : GUMBO_TAG_UL); GumboVector* list_children = &node->v.element.children; unsigned int liNo = 0; for (unsigned int i = 0; i < list_children->length; ++i) { GumboNode* child = static_cast(list_children->data[i]); if (child->type == GUMBO_NODE_ELEMENT && child->v.element.tag == GUMBO_TAG_LI) { - if (++liNo > 1 && m_blockquoteLevel > 0) { - int rep = m_blockquoteLevel; - while(rep-- > 0) - output.append("> "); - } - - output.append(std::string(m_listLevel, '\t')); - output.append(node->v.element.tag == GUMBO_TAG_UL ? - "* " : std::to_string(liNo) + ". "); + ++liNo; + m_listElemNo.push(liNo); + m_listElemPrefixed.push(false); + convert_to_markdown(child); + m_listElemNo.pop(); + m_listElemPrefixed.pop(); + } else { + convert_to_markdown(child); } - - convert_to_markdown(child); } // Exit current list block m_listLevel--; + auto tag = m_nestedBlock.pop(); + Q_ASSERT(ordered ? tag == GUMBO_TAG_OL : tag == GUMBO_TAG_UL); break; } case GUMBO_TAG_LI: @@ -262,7 +345,7 @@ void WizHtmlConverter::convert_to_markdown(GumboNode* node) { if (child->type == GUMBO_NODE_ELEMENT && (child->v.element.tag == GUMBO_TAG_OL || child->v.element.tag == GUMBO_TAG_UL)) { - output.append("\n"); + linebreak(); } convert_to_markdown(child); @@ -273,9 +356,9 @@ void WizHtmlConverter::convert_to_markdown(GumboNode* node) { std::string key = "|" + tagname + "|"; bool isInline = kTagsNonBreakingInline.find(key) != std::string::npos; if (isInline) - output.append("\n"); + linebreak(); } else { - output.append("\n"); + linebreak(); } break; @@ -283,74 +366,69 @@ void WizHtmlConverter::convert_to_markdown(GumboNode* node) { // Code case GUMBO_TAG_CODE: { - output.append("`"); - GumboVector* code_children = &node->v.element.children; - for (unsigned int i = 0; i < code_children->length; ++i) { - convert_to_markdown(static_cast(code_children->data[i])); - } - output.append("`"); + append("`"); + processChildren(node); + append("`"); break; } // Horizontal Rules case GUMBO_TAG_HR: { - output.append("\n\n---\n\n"); + linebreak(2); + append("---"); + linebreak(2); break; } case GUMBO_TAG_PRE: { - output.append("```\n"); - GumboVector* pre_children = &node->v.element.children; - for (unsigned int i = 0; i < pre_children->length; ++i) { - convert_to_markdown(static_cast(pre_children->data[i])); - } - output.append("\n```\n"); + append("```"); + linebreak(); + processChildren(node); + linebreak(); + append("```"); + linebreak(); break; } // Links case GUMBO_TAG_A: { - output.append("["); - GumboVector* children = &node->v.element.children; - for (unsigned int i = 0; i < children->length; ++i) { - convert_to_markdown(static_cast(children->data[i])); - } - output.append("]("); + append("["); + processChildren(node); + append("]("); GumboAttribute* href_attr = gumbo_get_attribute(&node->v.element.attributes, "href"); - if (href_attr) { - output.append(href_attr->value); + if (href_attr) append(href_attr->value); + GumboAttribute* title_attr = gumbo_get_attribute(&node->v.element.attributes, "title"); + if (title_attr) { + append(" \""); + append(title_attr->value); + append("\""); } - output.append(")"); + append(")"); break; } // Images case GUMBO_TAG_IMG: { - output.append("!["); + append("!["); GumboAttribute* alt_attr = gumbo_get_attribute(&node->v.element.attributes, "alt"); - if (alt_attr) output.append(alt_attr->value); - output.append("]("); + if (alt_attr) append(alt_attr->value); + append("]("); GumboAttribute* src_attr = gumbo_get_attribute(&node->v.element.attributes, "src"); - if (src_attr) output.append(src_attr->value); + if (src_attr) append(src_attr->value); GumboAttribute* title_attr = gumbo_get_attribute(&node->v.element.attributes, "title"); if (title_attr) { - output.append(" \""); - output.append(title_attr->value); - output.append("\""); + append(" \""); + append(title_attr->value); + append("\""); } - output.append(")"); + append(")"); break; } //TODO: Table default: - { - GumboVector* default_children = &node->v.element.children; - for (unsigned int i = 0; i < default_children->length; ++i) { - convert_to_markdown(static_cast(default_children->data[i])); - } + processChildren(node); break; } - } } } diff --git a/src/html/WizHtmlConverter.h b/src/html/WizHtmlConverter.h index f20ee07d..00405fce 100644 --- a/src/html/WizHtmlConverter.h +++ b/src/html/WizHtmlConverter.h @@ -2,9 +2,12 @@ #define HTML_WIZHTMLCONVERTER_H #include +#include +#include #include "WizGumboHelper.h" + namespace Utils { class WizHtmlConverter @@ -17,12 +20,24 @@ class WizHtmlConverter private: void convert_to_markdown(GumboNode* node); + void processChildren(GumboNode* node); + void processBlockStructure(GumboNode* node); + std::string processTextNode(GumboNode* node); + + void append(const std::string &str); + void linebreak(size_t n = 1); + void flush(); + void flush(const std::string &prefix); private: Utils::Gumbo::GumboParser *m_parser; - std::string output; + std::string m_current; + QStringList m_lines; int m_listLevel; int m_blockquoteLevel; + QStack m_nestedBlock; + QStack m_listElemNo; + QStack m_listElemPrefixed ; }; } // Utils diff --git a/tests/TestHtml/test-WizHtmlTool.cpp b/tests/TestHtml/test-WizHtmlTool.cpp index a7764cb3..059aa30c 100644 --- a/tests/TestHtml/test-WizHtmlTool.cpp +++ b/tests/TestHtml/test-WizHtmlTool.cpp @@ -403,8 +403,22 @@ void TestWizHtmlTool::check_WizHtmlToMarkdown_data() QTest::newRow("Line breaks") << wrapHTML("

This is the first line.
And this is the second line.

") << "This is the first line. \nAnd this is the second line.\n"; - - // Test Emphasis + QTest::newRow("Breaks in paragraphs") + << wrapHTML(R"( +

foo

+

bar

+


+

Above is a break!

+ )") + << "## foo\n" + "\n" + "bar\n" + "\n" + " \n\n" //

will add a more linebreak. + "\n" + "Above is a break!\n"; + + // Test Formating QTest::newRow("Bold") << wrapHTML("I just love bold text.") << "I just love **bold text**."; @@ -423,12 +437,45 @@ void TestWizHtmlTool::check_WizHtmlToMarkdown_data() QTest::newRow("Bold and Italic without space") << wrapHTML("This is reallyveryimportant text.") << "This is really***very***important text."; + QTest::newRow("Mutiple complex inline formating") + << wrapHTML(R"( +

+ Bold
+ Italic
+ Bola and italic
+ Strikethrought
+ Underline
+ Underline2 +

+ )") + << "**Bold** \n" + "*Italic* \n" + "***Bola and italic*** \n" + "~~Strikethrought~~ \n" + "Underline \n" + "Underline2\n"; + + // Test links + QTest::newRow("Normal link with a title") + << R"(

Example

)" + << "[Example](https://example.com \"Example website\")\n"; + QTest::newRow("Image within link") + << R"( +

+ + ExampleKlick the image for the preview +

+ )" + << "[![Example](/home/tim/qtprojegt/MarkdownEdit/doc/images/Example.png)" + "Klick the image for the preview](https://software-made-easy.github.io" + "/MarkdownEdit/markdownedit.html)\n"; // Test Blockquotes QTest::newRow("Simple blockquotes") << R"(
-

Dorothy followed her through many of the beautiful rooms in her castle.

+

Dorothy followed her through many of the beautiful rooms in her castle.

)" << "> Dorothy followed her through many of the beautiful rooms in her castle.\n"; @@ -440,7 +487,7 @@ void TestWizHtmlTool::check_WizHtmlToMarkdown_data()

The Witch bade her clean the pots and kettles and sweep the floor and keep the fire fed with wood.

)" << "> Dorothy followed her through many of the beautiful rooms in her castle.\n" - ">\n" + "> \n" "> The Witch bade her clean the pots and kettles and sweep the floor and keep the fire fed with wood.\n"; QTest::newRow("Nested Blockquotes") << R"( @@ -453,8 +500,44 @@ void TestWizHtmlTool::check_WizHtmlToMarkdown_data() )" << "> Dorothy followed her through many of the beautiful rooms in her castle.\n" - ">\n" + "> \n" "> > The Witch bade her clean the pots and kettles and sweep the floor and keep the fire fed with wood.\n"; + QTest::newRow("Multiple level of nested blockquotes") + << R"( +
+

1

+
+

1.2

+
+
+
+

1.3

+
+
+
+ )" + << "> > 1\n" + "> \n" + "> > > 1.2\n" + "> \n" + "> > > > 1.3\n"; + QTest::newRow("Multiple level of nested blockquotes 2") + << R"( +
+
+

1

+
+

1.2

+

1.3

+
+
+
+ )" + << "> > 1\n" + "> > \n" + "> > > 1.2\n" + "> > > \n" + "> > > > 1.3\n"; QTest::newRow("Blockquotes with Other Elements") << R"(
@@ -469,10 +552,10 @@ void TestWizHtmlTool::check_WizHtmlToMarkdown_data()
)" << "> #### The quarterly results look great!\n" - ">\n" + "> \n" "> * Revenue was off the chart.\n" "> * Profits were higher than ever.\n" - ">\n" + "> \n" "> *Everything* is going according to **plan**.\n"; QTest::newRow("Nested Blockquotes with list") << R"( @@ -488,9 +571,55 @@ void TestWizHtmlTool::check_WizHtmlToMarkdown_data() )" << "> Dorothy followed her through many of the beautiful rooms in her castle.\n" - ">\n" + "> \n" "> > * Revenue was off the chart.\n" "> > * Profits were higher than ever.\n"; + QTest::newRow("Complex Nested Blockquotes with list") + << R"( +
+

1

+
+

2

+
+

3

+
+
+

1 again

+
+
    +
  • +
    +

    in list

    +
    +
  • +
+ )" + << "> 1\n" + "> \n" + "> > 2\n" + "> > \n" + "> > > 3\n" + "> \n" + "> 1 again\n" + "\n" + "* > in list\n"; + QTest::newRow("Blockquotes within list") + << R"( +
    +
  • +
    +

    in list 1

    +
    +
  • +
  • +
    +

    in list 2

    +
    +
  • +
+ )" + << "* > in list 1\n" + "* > in list 2\n"; // Test Lists QTest::newRow("Test ordered lists") @@ -570,6 +699,61 @@ void TestWizHtmlTool::check_WizHtmlToMarkdown_data() )") << "* 1968. A great year!\n" "* I think 1969 was second best.\n"; + QTest::newRow("Multiple lines in same list element") + << wrapHTML(R"( +
    +
  • +

    A paragraph

    +

    Same paragraph

    +
  • +
  • +

    A paragraph

    +

    Same paragraph

    +
  • +
+ )") + << "* A paragraph\n" + " \n" + " Same paragraph\n" + "* A paragraph\n" + " \n" + " Same paragraph\n"; + QTest::newRow("Simple nested list") + << R"( +
    +
  • foo
  • +
  • +
      +
    • bar
    • +
    • foo
    • +
    +
  • +
+ )" + << "* foo\n" + "* \n" // FIXME: blank line? + "\t* bar\n" + "\t* foo\n"; + QTest::newRow("List entry with break") + << R"( +
    +
  • list entry with
    break
  • +
  • +
      +
    • Another
      break
      foo
      bar
    • +
    +
  • +
  • Hello World
  • +
+ )" + << "* list entry with \n" + "\tbreak\n" + "* \n" + "\t* Another \n" + "\tbreak \n" + "\tfoo \n" + "\tbar\n" + "* Hello World\n"; // TODO: test structure controlling, such as line breaks }