Skip to content

Commit

Permalink
* New "charData" data file built from unicode.org containing whitespa…
Browse files Browse the repository at this point in the history
…ce & confusable chars

   confusable characters are now highlighted
* Refactor dumpers: break out object methods into their own classes. New namespaces for base, text, & textAnsi
* drop Utf8Dump utility class
* array keys and object property names are now abstracted if they contain non-utf8 characters
* PDO / mysqli transaction methods no longer open / close a group; they now create an info logEntry
* trace() method - new limit param / params may be passed in any order
  • Loading branch information
bkdotcom committed May 15, 2024
1 parent a30f7b8 commit e3d7cd2
Show file tree
Hide file tree
Showing 132 changed files with 12,495 additions and 3,100 deletions.
4 changes: 4 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@
"test" : [
"vendor/bin/phpunit -v"
],
"update-char-data": [
"bdk\\Debug\\Dev\\UpdateCharData::update"
],
"update-phpcs": [
"bdk\\Debug\\Dev\\ComposerScripts::updatePhpCsXml"
],
Expand All @@ -102,6 +105,7 @@
"coverage": "analyze code coverage",
"cs": "check coding standards",
"test": "run unit tests",
"update-char-data": "rebuild confusable char latest unicode data",
"update-phpcs": "update phpcs.xml.dist & phpcs.slevomat.xml",
"webserver": "run php's internal webserver"
},
Expand Down
166 changes: 166 additions & 0 deletions dev/UpdateCharData.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
<?php

namespace bdk\Debug\Dev;

/**
* Pull latest confusables.txt from unicode and save to php file
*
* @psalm-import-type charInfo from \bdk\Debug\Plugin\CharHighlight
*/
class UpdateCharData
{
    /** @var string Location (URL or local path) that confusables.txt is read from */
    public static $filepathSrc = 'https://www.unicode.org/Public/security/latest/confusables.txt';

    /** @var array<string, charInfo> (not referenced by any method of this class) */
    protected $charData = array();

    /**
     * Regenerate src/Debug/Dump/charData.php
     *
     * Exports the array returned by build() as PHP source, prefixed with a
     * "generated file" header.  Single-character ascii keys are rewritten as
     * "\xNN" escapes so that whitespace / control chars are visible in the output.
     *
     * Invoked via `composer run update-char-data`
     *
     * @return void
     *
     * @throws \RuntimeException if the source can not be rewritten or the file can not be written
     */
    public static function update()
    {
        $filepathOut = __DIR__ . '/../src/Debug/Dump/charData.php';
        // Build the header line-by-line so the generated file does not depend
        //   on how this source file happens to be indented
        $comment = \implode("\n", array(
            '/**',
            ' * This file is generated automatically from confusables.txt',
            ' * https://www.unicode.org/Public/security/latest/confusables.txt',
            ' *',
            ' * `composer run update-char-data`',
            ' *',
            ' * @phpcs:disable SlevomatCodingStandard.Arrays.AlphabeticallySortedByKeys',
            ' */',
        ));
        $php = '<?php // phpcs:ignore SlevomatCodingStandard.Files.FileLength' . "\n\n"
            . $comment . "\n\n"
            . 'return ' . self::varExportPretty(self::build()) . ";\n";
        // Rewrite single-character keys: ascii becomes a "\x.." escape, everything else stays literal
        $php = \preg_replace_callback('/[\'"](.)[\'"] => /u', static function ($matches) {
            $char = $matches[1];
            $codePoint = \mb_ord($char, 'UTF-8');
            return $codePoint < 0x80
                ? '"\\x' . \dechex($codePoint) . '" => '
                : '\'' . $char . '\' => ';
        }, $php);
        if (\is_string($php) === false) {
            // preg_replace_callback returns null on error (ie invalid utf-8 subject)
            throw new \RuntimeException('Unable to escape char keys (preg error ' . \preg_last_error() . ')');
        }
        if (\file_put_contents($filepathOut, $php) === false) {
            throw new \RuntimeException('Unable to write ' . $filepathOut);
        }
    }

    /**
     * Build char data
     *
     * Starts with the array returned by `__DIR__ . '/charData.php'` and adds an
     * entry for each non-ascii sequence that is confusable with a single ascii char.
     * Existing keys are never overwritten.  The result is sorted by key.
     *
     * @return array<string, array<string, string|bool>>
     */
    public static function build()
    {
        // only interested in chars that are confusable with an ascii char
        // not interested in ascii chars that are confusable with other ascii chars
        $rows = \array_filter(self::getParsedRows(), static function ($row) {
            return self::isSingleAsciiChar($row['charA']) === false
                && self::isSingleAsciiChar($row['charB']);
        });

        \usort($rows, static function ($rowA, $rowB) {
            return \strcmp($rowA['charA'], $rowB['charA']);
        });

        // rekey by the confusable char
        $rowsNew = require __DIR__ . '/charData.php';
        foreach ($rows as $row) {
            $key = $row['charA'];
            if (isset($rowsNew[$key])) {
                // first definition wins
                continue;
            }
            $rowsNew[$key] = array(
                'codePoint' => $row['charACodePoint'],
                'desc' => $row['charADesc'],
                'similarTo' => $row['charB'],
            );
        }

        \ksort($rowsNew);

        return $rowsNew;
    }

    /**
     * Read the source file and return one parsed row per data line
     *
     * Blank lines and comment lines (first non-whitespace char is "#") are skipped.
     * A UTF-8 byte-order-mark at the start of the file is ignored.
     *
     * @return array<string, string|bool>[]
     *
     * @throws \RuntimeException if the source file can not be read
     */
    private static function getParsedRows()
    {
        $lines = \file(self::$filepathSrc);
        if ($lines === false) {
            throw new \RuntimeException('Unable to read ' . self::$filepathSrc);
        }
        $rows = array();
        foreach ($lines as $i => $line) {
            if ($i === 0 && \strncmp($line, "\xEF\xBB\xBF", 3) === 0) {
                // strip BOM so that a leading comment line is recognized as such
                $line = \substr($line, 3);
            }
            $line = \trim($line);
            if ($line === '' || $line[0] === '#') {
                continue;
            }
            $rows[] = self::parseRow($line);
        }
        return $rows;
    }

    /**
     * Parse confusables.txt row
     *
     * Expected form: `<source hex> ; <target hex> ; <type> # <comment>`
     * where each hex field is one or more space separated code points.
     * Any amount of whitespace (including tabs) is accepted around the semicolons.
     *
     * @param string $row non-comment row from data file
     *
     * @return array<string,mixed> charA, charACodePoint, charADesc, charB, & isXid
     *
     * @throws \UnexpectedValueException if row does not contain two code point fields
     */
    protected static function parseRow($row)
    {
        $fields = \preg_split('/\s*;\s*/', \trim($row), 3);
        $fields = \array_pad(\is_array($fields) ? $fields : array(), 3, '');

        $charACodePoint = self::normalizeCodePoints($fields[0]);
        $charBCodePoint = self::normalizeCodePoints($fields[1]);
        if ($charACodePoint === '' || $charBCodePoint === '') {
            throw new \UnexpectedValueException('Unable to parse confusables row: ' . \trim($row));
        }

        // Descriptive info lives in the trailing comment.  It is optional as far
        //   as we're concerned: fall back to empty values when not recognized
        $matches = array();
        \preg_match('/^(?P<category>\w+)\t#(?P<notXid>\*?)\s*(?P<example>\(.*?\))\s*(?P<charADesc>.*?) → (?P<charBDesc>.*?)(\s+#.*)?$/u', $fields[2], $matches);

        return array(
            'charA' => self::codePointsToString($charACodePoint),
            'charACodePoint' => $charACodePoint,
            'charADesc' => isset($matches['charADesc']) ? $matches['charADesc'] : '',

            'charB' => self::codePointsToString($charBCodePoint),
            'isXid' => empty($matches['notXid']),
        );
    }

    /**
     * export value as valid php
     *
     * Tweaks var_export's output: `array(` without the space, nested arrays on
     * the same line as their key, indentation doubled, and the NUL char written as "\x00"
     *
     * @param mixed $val Value to export
     *
     * @return string
     */
    protected static function varExportPretty($val)
    {
        $php = \var_export($val, true);
        $php = \str_replace('array (', 'array(', $php);
        $php = \preg_replace('/=> \n\s+array/', '=> array', $php);
        // double the leading whitespace of every line
        $php = \preg_replace('/^(\s*)/m', '$1$1', $php);
        // var_export represents NUL via concatenation
        $php = \str_replace('\'\' . "\0" . \'\'', '"\x00"', $php);
        return $php;
    }

    /**
     * Convert space separated hex code points to a UTF-8 string
     *
     * @param string $codePoints one or more hex values separated by a single space
     *
     * @return string
     */
    private static function codePointsToString($codePoints)
    {
        return \implode('', \array_map(static function ($hex) {
            return \mb_chr(\hexdec($hex), 'UTF-8');
        }, \explode(' ', $codePoints)));
    }

    /**
     * Is value exactly one ascii (7-bit) character?
     *
     * @param string $str value to test
     *
     * @return bool
     */
    private static function isSingleAsciiChar($str)
    {
        return \strlen($str) === 1 && \ord($str) < 0x80;
    }

    /**
     * Validate a code point field and remove leading "00" pairs from each value
     *
     * At least two hex digits are always kept ("0000" becomes "00", "0061" becomes "61")
     *
     * @param string $field whitespace separated hex code points
     *
     * @return string single-space separated code points, or empty string if field is not a list of hex values
     */
    private static function normalizeCodePoints($field)
    {
        $field = \trim($field);
        if (\preg_match('/^[0-9A-Fa-f]+(?:\s+[0-9A-Fa-f]+)*$/', $field) !== 1) {
            return '';
        }
        return \implode(' ', \array_map(static function ($hex) {
            return \preg_replace('/^(00)+(?=[0-9A-Fa-f]{2})/', '', $hex);
        }, \preg_split('/\s+/', $field)));
    }
}

0 comments on commit e3d7cd2

Please sign in to comment.