Skip to content

Commit

Permalink
Issue #77: Performance Fixes (#81)
Browse files Browse the repository at this point in the history
* Updated the performance fixture

* Added MbStringUtil as a wrapper around string functions to use mb_* functions only when necessary

* Added multibyte test

* Cleanup: the code path that cleaned up HTML using the Tidy extension was disabled a long time ago, so it can most likely be removed

* Removed the overhead of checking with strlen(); using a strict string comparison is about 30% faster

* Added strict types and reduced the call graph, which increased performance by about 3%
  • Loading branch information
SavageTiger authored and jschroed91 committed Feb 20, 2019
1 parent 6f39bc3 commit 2ba271b
Show file tree
Hide file tree
Showing 10 changed files with 137 additions and 77 deletions.
48 changes: 12 additions & 36 deletions lib/Caxy/HtmlDiff/AbstractDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

namespace Caxy\HtmlDiff;

use Caxy\HtmlDiff\Util\MbStringUtil;

/**
* Class AbstractDiff.
*/
Expand Down Expand Up @@ -79,6 +81,11 @@ abstract class AbstractDiff
*/
protected $resetCache = false;

/**
* @var MbStringUtil
*/
protected $stringUtil;

/**
* AbstractDiff constructor.
*
Expand All @@ -90,7 +97,7 @@ abstract class AbstractDiff
*/
public function __construct($oldText, $newText, $encoding = 'UTF-8', $specialCaseTags = null, $groupDiffs = null)
{
mb_substitute_character(0x20);
$this->stringUtil = new MbStringUtil($oldText, $newText);

$this->setConfig(HtmlDiffConfig::create()->setEncoding($encoding));

Expand Down Expand Up @@ -389,44 +396,13 @@ protected function getClosingTag($tag)
return '</'.$tag.'>';
}

/**
* @param string $str
* @param string $start
* @param string $end
*
* @return string
*/
protected function getStringBetween($str, $start, $end)
{
$expStr = mb_split($start, $str, 2);
if (count($expStr) > 1) {
$expStr = mb_split($end, $expStr[ 1 ]);
if (count($expStr) > 1) {
array_pop($expStr);

return implode($end, $expStr);
}
}

return '';
}

/**
* @param string $html
*
* @return string
*/
protected function purifyHtml($html)
{
if (class_exists('Tidy') && false) {
$config = array('output-xhtml' => true, 'indent' => false);
$tidy = new tidy();
$tidy->parseString($html, $config, 'utf8');
$html = (string) $tidy;

return $this->getStringBetween($html, '<body>');
}

return $this->purifier->purify($html);
}

Expand Down Expand Up @@ -493,7 +469,7 @@ protected function convertHtmlToListOfWords($characterString)
$mode = 'whitespace';
} else {
if (
(($this->ctypeAlphanumUnicode($character)) && (mb_strlen($current_word) == 0 || $this->isPartOfWord($current_word))) ||
(($this->ctypeAlphanumUnicode($character) === true) && ($this->stringUtil->strlen($current_word) === 0 || $this->isPartOfWord($current_word))) ||
(in_array($character, $this->config->getSpecialCaseChars()) && isset($characterString[$i + 1]) && $this->isPartOfWord($characterString[$i + 1]))
) {
$current_word .= $character;
Expand Down Expand Up @@ -554,7 +530,7 @@ protected function convertHtmlToListOfWords($characterString)
*/
protected function isStartOfTag($val)
{
return $val == '<';
return $val === '<';
}

/**
Expand All @@ -564,7 +540,7 @@ protected function isStartOfTag($val)
*/
protected function isEndOfTag($val)
{
return $val == '>';
return $val === '>';
}

/**
Expand Down Expand Up @@ -595,6 +571,6 @@ protected function explode($value)
*/
protected function ctypeAlphanumUnicode($str)
{
return preg_match("/^[a-zA-Z0-9\pL]+$/u", $str);
return preg_match("/^[a-zA-Z0-9\pL]+$/u", $str) === 1;
}
}
25 changes: 13 additions & 12 deletions lib/Caxy/HtmlDiff/HtmlDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ protected function createIsolatedDiffTagPlaceholders(&$words)
foreach ($words as $index => $word) {
$openIsolatedDiffTag = $this->isOpeningIsolatedDiffTag($word, $currentIsolatedDiffTag);
if ($openIsolatedDiffTag) {
if ($this->isSelfClosingTag($word) || mb_stripos($word, '<img') !== false) {
if ($this->isSelfClosingTag($word) || $this->stringUtil->stripos($word, '<img') !== false) {
if ($openIsolatedDiffTags === 0) {
$isolatedDiffTagIndices[] = array(
'start' => $index,
Expand Down Expand Up @@ -543,7 +543,7 @@ protected function insertTag($tag, $cssClass, &$words)
$specialCaseTagInjection = '';
$specialCaseTagInjectionIsBefore = false;

if (count($nonTags) != 0) {
if (count($nonTags) !== 0) {
$text = $this->wrapText(implode('', $nonTags), $tag, $cssClass);
$this->content .= $text;
} else {
Expand All @@ -567,15 +567,15 @@ protected function insertTag($tag, $cssClass, &$words)
}
}
}
if (count($words) == 0 && mb_strlen($specialCaseTagInjection) == 0) {
if (count($words) == 0 && $this->stringUtil->strlen($specialCaseTagInjection) == 0) {
break;
}
if ($specialCaseTagInjectionIsBefore) {
$this->content .= $specialCaseTagInjection.implode('', $this->extractConsecutiveWords($words, 'tag'));
} else {
$workTag = $this->extractConsecutiveWords($words, 'tag');
if (isset($workTag[ 0 ]) && $this->isOpeningTag($workTag[ 0 ]) && !$this->isClosingTag($workTag[ 0 ])) {
if (mb_strpos($workTag[ 0 ], 'class=')) {
if ($this->stringUtil->strpos($workTag[ 0 ], 'class=')) {
$workTag[ 0 ] = str_replace('class="', 'class="diffmod ', $workTag[ 0 ]);
$workTag[ 0 ] = str_replace("class='", 'class="diffmod ', $workTag[ 0 ]);
} else {
Expand All @@ -584,7 +584,7 @@ protected function insertTag($tag, $cssClass, &$words)
}

$appendContent = implode('', $workTag).$specialCaseTagInjection;
if (isset($workTag[0]) && false !== mb_stripos($workTag[0], '<img')) {
if (isset($workTag[0]) && false !== $this->stringUtil->stripos($workTag[0], '<img')) {
$appendContent = $this->wrapText($appendContent, $tag, $cssClass);
}
$this->content .= $appendContent;
Expand Down Expand Up @@ -698,7 +698,7 @@ protected function operations()
$matches = $this->matchingBlocks();
$matches[] = new Match(count($this->oldWords), count($this->newWords), 0);

foreach ($matches as $i => $match) {
foreach ($matches as $match) {
$matchStartsAtCurrentPositionInOld = ($positionInOld === $match->startInOld);
$matchStartsAtCurrentPositionInNew = ($positionInNew === $match->startInNew);

Expand Down Expand Up @@ -769,10 +769,10 @@ protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endI
*/
protected function stripTagAttributes($word)
{
$space = mb_strpos($word, ' ', 1);
$space = $this->stringUtil->strpos($word, ' ', 1);

if ($space) {
return '<' . mb_substr($word, 1, $space) . '>';
return '<' . $this->stringUtil->substr($word, 1, $space) . '>';
}

return trim($word, '<>');
Expand All @@ -788,6 +788,7 @@ protected function stripTagAttributes($word)
*/
protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
{
$groupDiffs = $this->isGroupDiffs();
$bestMatchInOld = $startInOld;
$bestMatchInNew = $startInNew;
$bestMatchSize = 0;
Expand Down Expand Up @@ -816,7 +817,7 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)

if ($newMatchLength > $bestMatchSize ||
(
$this->isGroupDiffs() &&
$groupDiffs &&
$bestMatchSize > 0 &&
$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
)
Expand All @@ -830,9 +831,9 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
}

// Skip match if none found or match consists only of whitespace
if ($bestMatchSize != 0 &&
if ($bestMatchSize !== 0 &&
(
!$this->isGroupDiffs() ||
!$groupDiffs ||
!$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
)
) {
Expand All @@ -850,7 +851,7 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
protected function isOnlyWhitespace($str)
{
// Slightly faster than using preg_match
return $str !== '' && (mb_strlen(trim($str)) === 0);
return $str !== '' && trim($str) === '';
}

/**
Expand Down
12 changes: 6 additions & 6 deletions lib/Caxy/HtmlDiff/ListDiff.php
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ protected function buildDiffList($words)
$list[] = $word;
}
} else {
$listType = mb_substr($word, 1, 2);
$listType = $this->stringUtil->substr($word, 1, 2);
$listStartTag = $word;
}

Expand All @@ -254,7 +254,7 @@ protected function buildDiffList($words)
if ($openListItems === 0) {
// New top-level list item
$currentListItem = array();
$listItemType = mb_substr($word, 1, 2);
$listItemType = $this->stringUtil->substr($word, 1, 2);
$listItemStart = $word;
} else {
$currentListItem[] = $word;
Expand Down Expand Up @@ -290,27 +290,27 @@ protected function isOpeningListTag($word, $type = null)
{
$filter = $type !== null ? array('<'.$type) : array('<ul', '<ol', '<dl');

return in_array(mb_substr($word, 0, 3), $filter);
return in_array($this->stringUtil->substr($word, 0, 3), $filter);
}

protected function isClosingListTag($word, $type = null)
{
$filter = $type !== null ? array('</'.$type) : array('</ul', '</ol', '</dl');

return in_array(mb_substr($word, 0, 4), $filter);
return in_array($this->stringUtil->substr($word, 0, 4), $filter);
}

protected function isOpeningListItemTag($word, $type = null)
{
$filter = $type !== null ? array('<'.$type) : array('<li', '<dd', '<dt');

return in_array(mb_substr($word, 0, 3), $filter);
return in_array($this->stringUtil->substr($word, 0, 3), $filter);
}

protected function isClosingListItemTag($word, $type = null)
{
$filter = $type !== null ? array('</'.$type) : array('</li', '</dd', '</dt');

return in_array(mb_substr($word, 0, 4), $filter);
return in_array($this->stringUtil->substr($word, 0, 4), $filter);
}
}
2 changes: 1 addition & 1 deletion lib/Caxy/HtmlDiff/ListDiffLines.php
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public function build()
return $this->content;
}

$matchStrategy = new ListItemMatchStrategy($this->config->getMatchThreshold());
$matchStrategy = new ListItemMatchStrategy($this->stringUtil, $this->config->getMatchThreshold());
$this->lcsService = new LcsService($matchStrategy);

return $this->listByLines($this->oldText, $this->newText);
Expand Down
20 changes: 10 additions & 10 deletions lib/Caxy/HtmlDiff/Preprocessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,22 @@

class Preprocessor
{
public static function diffCommonPrefix($old, $new)
public static function diffCommonPrefix($old, $new, $stringUtil)
{
// Quick check for common null cases.
if (mb_strlen($old) == 0 || mb_strlen($new) == 0 || mb_substr($old, 0, 1) != mb_substr($new, 0, 1)) {
if ($stringUtil->strlen($old) == 0 || $stringUtil->strlen($new) == 0 || $stringUtil->substr($old, 0, 1) != $stringUtil->substr($new, 0, 1)) {
return 0;
}

// Binary Search
$pointerMin = 0;
$pointerMax = min(mb_strlen($old), mb_strlen($new));
$pointerMax = min($stringUtil->strlen($old), $stringUtil->strlen($new));
$pointerMid = $pointerMax;
$pointerStart = 0;
while ($pointerMin < $pointerMid) {
$cmp = substr_compare(
$old,
mb_substr($new, $pointerStart, $pointerMid - $pointerStart),
$stringUtil->substr($new, $pointerStart, $pointerMid - $pointerStart),
$pointerStart,
$pointerMid - $pointerStart
);
Expand All @@ -34,22 +34,22 @@ public static function diffCommonPrefix($old, $new)
return $pointerMid;
}

public static function diffCommonSuffix($old, $new)
public static function diffCommonSuffix($old, $new, $stringUtil)
{
// Quick check for common null cases.
if (mb_strlen($old) == 0 || mb_strlen($new) == 0 || mb_substr($old, mb_strlen($old) - 1, 1) != mb_substr($new, mb_strlen($new) - 1, 1)) {
if ($stringUtil->strlen($old) == 0 || $stringUtil->strlen($new) == 0 || $stringUtil->substr($old, $stringUtil->strlen($old) - 1, 1) != $stringUtil->substr($new, $stringUtil->strlen($new) - 1, 1)) {
return 0;
}

// Binary Search
$pointerMin = 0;
$pointerMax = min(mb_strlen($old), mb_strlen($new));
$pointerMax = min($stringUtil->strlen($old), $stringUtil->strlen($new));
$pointerMid = $pointerMax;
$pointerEnd = 0;
$oldLen = mb_strlen($old);
$newLen = mb_strlen($new);
$oldLen = $stringUtil->strlen($old);
$newLen = $stringUtil->strlen($new);
while ($pointerMin < $pointerMid) {
if (mb_substr($old, $oldLen - $pointerMid, $pointerMid - $pointerEnd) == mb_substr($new, $newLen - $pointerMid, $pointerMid - $pointerEnd)) {
if ($stringUtil->substr($old, $oldLen - $pointerMid, $pointerMid - $pointerEnd) == $stringUtil->substr($new, $newLen - $pointerMid, $pointerMid - $pointerEnd)) {
$pointerMin = $pointerMid;
$pointerEnd = $pointerMin;
} else {
Expand Down
28 changes: 18 additions & 10 deletions lib/Caxy/HtmlDiff/Strategy/ListItemMatchStrategy.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,15 @@
namespace Caxy\HtmlDiff\Strategy;

use Caxy\HtmlDiff\Preprocessor;
use Caxy\HtmlDiff\Util\MbStringUtil;

class ListItemMatchStrategy implements MatchStrategyInterface
{
/**
* @var MbStringUtil
*/
protected $stringUtil;

/**
* @var int
*/
Expand All @@ -24,12 +30,14 @@ class ListItemMatchStrategy implements MatchStrategyInterface
/**
* ListItemMatchStrategy constructor.
*
* @param int $similarityThreshold
* @param float $lengthRatioThreshold
* @param float $commonTextRatioThreshold
* @param MbStringUtil $stringUtil
* @param int $similarityThreshold
* @param float $lengthRatioThreshold
* @param float $commonTextRatioThreshold
*/
public function __construct($similarityThreshold = 80, $lengthRatioThreshold = 0.1, $commonTextRatioThreshold = 0.6)
public function __construct($stringUtil, $similarityThreshold = 80, $lengthRatioThreshold = 0.1, $commonTextRatioThreshold = 0.6)
{
$this->stringUtil = $stringUtil;
$this->similarityThreshold = $similarityThreshold;
$this->lengthRatioThreshold = $lengthRatioThreshold;
$this->commonTextRatioThreshold = $commonTextRatioThreshold;
Expand Down Expand Up @@ -63,20 +71,20 @@ public function isMatch($a, $b)
// Check common prefix/ suffix length
$aCleaned = trim($aStripped);
$bCleaned = trim($bStripped);
if (mb_strlen($aCleaned) === 0 || mb_strlen($bCleaned) === 0) {
if ($this->stringUtil->strlen($aCleaned) === 0 || $this->stringUtil->strlen($bCleaned) === 0) {
$aCleaned = $a;
$bCleaned = $b;
}
if (mb_strlen($aCleaned) === 0 || mb_strlen($bCleaned) === 0) {
if ($this->stringUtil->strlen($aCleaned) === 0 || $this->stringUtil->strlen($bCleaned) === 0) {
return false;
}
$prefixIndex = Preprocessor::diffCommonPrefix($aCleaned, $bCleaned);
$suffixIndex = Preprocessor::diffCommonSuffix($aCleaned, $bCleaned);
$prefixIndex = Preprocessor::diffCommonPrefix($aCleaned, $bCleaned, $this->stringUtil);
$suffixIndex = Preprocessor::diffCommonSuffix($aCleaned, $bCleaned, $this->stringUtil);

// Use shorter string, and see how much of it is leftover
$len = min(mb_strlen($aCleaned), mb_strlen($bCleaned));
$len = min($this->stringUtil->strlen($aCleaned), $this->stringUtil->strlen($bCleaned));
$remaining = $len - ($prefixIndex + $suffixIndex);
$strLengthPercent = $len / max(mb_strlen($a), mb_strlen($b));
$strLengthPercent = $len / max($this->stringUtil->strlen($a), $this->stringUtil->strlen($b));

if ($remaining === 0 && $strLengthPercent > $this->lengthRatioThreshold) {
return true;
Expand Down
Loading

0 comments on commit 2ba271b

Please sign in to comment.