Skip to content

Commit

Permalink
Merge pull request soundasleep#47 from bartbutler/master
Browse files Browse the repository at this point in the history
Optimize/improve newline/whitespace handling
  • Loading branch information
soundasleep committed Mar 21, 2017
2 parents 21cdce4 + 9e8288b commit e3bd6d4
Show file tree
Hide file tree
Showing 8 changed files with 26,017 additions and 81 deletions.
151 changes: 83 additions & 68 deletions src/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ public static function convert($html, $ignore_error = false) {
$html = str_replace("&nbsp;", " ", $html);
$html = str_replace("\xc2\xa0", " ", $html);

if (static::isOfficeDocument($html)) {
$is_office_document = static::isOfficeDocument($html);

if ($is_office_document) {
// remove office namespace
$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
}
Expand All @@ -51,17 +53,15 @@ public static function convert($html, $ignore_error = false) {

$doc = static::getDocument($html, $ignore_error);

if (static::isOfficeDocument($html)) {
// remove office namespace
$doc = static::fixMSEncoding($doc, $ignore_error);
}

$output = static::iterateOverNode($doc);
$output = static::iterateOverNode($doc, null, false, $is_office_document);

// remove leading and trailing spaces on each line
$output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
$output = preg_replace("/ *\t */im", "\t", $output);

// unarmor pre blocks
$output = str_replace("\r", "\n", $output);

// remove unnecessary empty lines
$output = preg_replace("/\n\n\n*/im", "\n\n", $output);

Expand All @@ -88,34 +88,6 @@ static function fixNewlines($text) {
return $text;
}

/**
 * Collapse Microsoft Office paragraphs into plain text followed by a line break.
 *
 * Microsoft Exchange emails often include HTML which, when passed through
 * html2text, results in lots of double line returns everywhere.
 *
 * To fix this, every `<p>` element whose class attribute is exactly
 * `MsoNormal` is replaced by a text node holding the paragraph's text
 * content, followed by a `<br>` element. The modified document is then
 * serialised with saveHTML() and parsed again through getDocument().
 *
 * @param DOMDocument $doc the document to clean up
 * @param boolean $ignore_error forwarded unchanged to getDocument() when re-parsing
 * @return DOMDocument the re-parsed document with less unnecessary paragraphs
 */
static function fixMSEncoding($doc, $ignore_error = false) {
    $paras = $doc->getElementsByTagName('p');

    // Iterate from the last paragraph to the first.
    // NOTE(review): presumably because the node list is live and replacing
    // a paragraph would shift the indexes of later entries - confirm.
    for ($i = $paras->length - 1; $i >= 0; $i--) {
        $para = $paras->item($i);
        if ($para->getAttribute('class') === 'MsoNormal') {
            // Text first, then the <br>, so the break follows the paragraph content.
            $fragment = $doc->createDocumentFragment();
            $fragment->appendChild($doc->createTextNode($para->nodeValue));
            $fragment->appendChild($doc->createElement('br'));
            $para->parentNode->replaceChild($fragment, $para);
        }
    }

    return static::getDocument($doc->saveHTML(), $ignore_error);
}

/**
* Parse HTML into a DOMDocument
*
Expand All @@ -127,12 +99,22 @@ static function getDocument($html, $ignore_error = false) {

$doc = new \DOMDocument();

$html = trim($html);

if (!$html) {
// DOMDocument doesn't support empty value and throws an error
// Return empty document instead
return $doc;
}

if ($html[0] !== '<') {
// If HTML does not begin with a tag, we put a body tag around it.
// If we do not do this, PHP will insert a paragraph tag around
// the first block of text for some reason which can mess up
// the newlines. See pre.html test for an example.
$html = '<body>' . $html . '</body>';
}

if ($ignore_error) {
$doc->strictErrorChecking = false;
$doc->recover = true;
Expand All @@ -159,47 +141,48 @@ static function isOfficeDocument($html) {
return strpos($html, "urn:schemas-microsoft-com:office") !== false;
}

/**
 * Tell whether a string is empty once newlines, carriage returns, tabs
 * and spaces have been stripped from both of its ends.
 *
 * @param string $text the text to inspect
 * @return boolean true when nothing is left after stripping, false otherwise
 */
static function isWhitespace($text) {
    $remaining = trim($text, "\n\r\t ");

    return $remaining === "";
}

static function nextChildName($node) {
// get the next child
$nextNode = $node->nextSibling;
while ($nextNode != null) {
if ($nextNode instanceof \DOMText) {
if (!static::isWhitespace($nextNode->wholeText)) {
break;
}
}
if ($nextNode instanceof \DOMElement) {
break;
}
$nextNode = $nextNode->nextSibling;
}
$nextName = null;
if ($nextNode instanceof \DOMElement && $nextNode != null) {
if (($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) && $nextNode != null) {
$nextName = strtolower($nextNode->nodeName);
}

return $nextName;
}

/**
 * Find the lower-cased node name of the closest preceding sibling that is
 * a DOMElement, skipping over any other kind of sibling node.
 *
 * @param DOMNode $node the node whose previous siblings are searched
 * @return string|null the lower-cased element name, or null when no
 *                     preceding element sibling exists
 */
static function prevChildName($node) {
    // Walk backwards through the siblings until an element turns up.
    for ($sibling = $node->previousSibling; $sibling != null; $sibling = $sibling->previousSibling) {
        if ($sibling instanceof \DOMElement) {
            return strtolower($sibling->nodeName);
        }
    }

    return null;
}
static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {

static function iterateOverNode($node, $in_pre = false) {
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivalent to \s)
if ($in_pre) {
return trim($node->wholeText, "\n\r\t ");
$text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
// Remove trailing whitespace only
$text = preg_replace("/[ \t]*\n/im", "\n", $text);
// armor newlines with \r.
return str_replace("\n", "\r", $text);
} else {
return preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
return "\n" . $text;
}
return $text;
}
}
if ($node instanceof \DOMDocumentType) {
Expand All @@ -211,15 +194,17 @@ static function iterateOverNode($node, $in_pre = false) {
return "";
}

$nextName = static::nextChildName($node);
$prevName = static::prevChildName($node);

$name = strtolower($node->nodeName);
$nextName = static::nextChildName($node);

// start whitespace
switch ($name) {
case "hr":
return "---------------------------------------------------------------\n";
$prefix = '';
if ($prevName != null) {
$prefix = "\n";
}
return $prefix . "---------------------------------------------------------------\n";

case "style":
case "head":
Expand Down Expand Up @@ -247,8 +232,24 @@ static function iterateOverNode($node, $in_pre = false) {
$output = "\t";
break;

case "tr":
case "p":
// Microsoft exchange emails often include HTML which, when passed through
// html2text, results in lots of double line returns everywhere.
//
// To fix this, for any p element with a className of `MsoNormal` (the standard
// classname in any Microsoft export or outlook for a paragraph that behaves
// like a line return) we skip the first line returns and set the name to br.
if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
$output = "";
$name = 'br';
break;
}
// add two lines
$output = "\n\n";
break;

case "pre":
case "tr":
case "div":
// add one line
$output = "\n";
Expand All @@ -270,15 +271,29 @@ static function iterateOverNode($node, $in_pre = false) {
if (isset($node->childNodes)) {

$n = $node->childNodes->item(0);
$previousSiblingName = null;

while($n != null) {

$text = static::iterateOverNode($n, $in_pre || $name == 'pre');
$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document);

$output .= $text;
// Pass current node name to next child, as previousSibling does not appear to get populated
if ($n instanceof \DOMDocumentType
|| $n instanceof \DOMProcessingInstruction
|| ($n instanceof \DOMText && static::isWhitespace($text))) {
// Keep current previousSiblingName, these are invisible
}
else {
$previousSiblingName = strtolower($n->nodeName);
}

$node->removeChild($n);
$n = $node->childNodes->item(0);

// suppress last br tag inside a node list
if ($n != null || $previousSiblingName != 'br') {
$output .= $text;
}
}
}

Expand All @@ -294,16 +309,17 @@ static function iterateOverNode($node, $in_pre = false) {
break;

case "p":
// add two lines
$output .= "\n\n";
break;

case "pre":
case "br":
// add one line
if ($nextName != "div")
$output .= "\n";
$output .= "\n";
break;

case "div":
// add one line only if the next child isn't a div
if ($nextName != "div" && $nextName != null)
$output .= "\n";
break;

case "a":
Expand Down Expand Up @@ -375,5 +391,4 @@ static function iterateOverNode($node, $in_pre = false) {

return $output;
}

}
4 changes: 4 additions & 0 deletions tests/Html2TextTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ function testEmpty() {
$this->doTest("empty");
}

/**
 * Passes the "huge-msoffice" test name to doTest().
 */
function testHugeMsoffice() {
$this->doTest("huge-msoffice");
}

/**
* @expectedException PHPUnit_Framework_Error_Warning
*/
Expand Down
26 changes: 14 additions & 12 deletions tests/basic.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
Hello, World!

This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.

Even mismatched tags.
A div
Another div
A div
within a div

Another line
Yet another line
Hello, World!

This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly.

Even mismatched tags.

A div
Another div
A div
within a div

Another line
Yet another line

[A link](http://foo.com)
1 change: 1 addition & 0 deletions tests/huge-msoffice.html

Large diffs are not rendered by default.

0 comments on commit e3bd6d4

Please sign in to comment.