diff --git a/src/SmartFormat.Tests/Core/CharSetTests.cs b/src/SmartFormat.Tests/Core/CharSetTests.cs
new file mode 100644
index 00000000..968b446a
--- /dev/null
+++ b/src/SmartFormat.Tests/Core/CharSetTests.cs
@@ -0,0 +1,60 @@
+using System;
+using System.Linq;
+using NUnit.Framework;
+using SmartFormat.Core.Parsing;
+
+namespace SmartFormat.Tests.Core;
+
+// Exercises CharSet with ASCII characters and with characters at code point 128 and above.
+[TestFixture]
+internal class CharSetTests
+{
+    // Fills the set through both AddRange overloads, then removes present and absent characters.
+    [Test]
+    public void CharSet_Add_Remove()
+    {
+        char[] asciiChars = ['A', 'B', 'C'];
+        char[] nonAsciiChars = ['Ā', 'Б', '中'];
+        var charSet = new CharSet();
+        charSet.AddRange(asciiChars.AsEnumerable());
+        charSet.AddRange(nonAsciiChars.AsSpan());
+        var countBeforeRemoval = charSet.Count;
+        var asciiRemoved = charSet.Remove('C');
+        var nonAsciiRemoved = charSet.Remove('中');
+        // trying to remove a not existing char returns false
+        var nonExistingRemoved = charSet.Remove('?');
+        var count = charSet.Count;
+
+        Assert.Multiple(() =>
+        {
+            Assert.That(countBeforeRemoval, Is.EqualTo(asciiChars.Length + nonAsciiChars.Length));
+            Assert.That(count, Is.EqualTo(countBeforeRemoval - 2));
+            Assert.That(asciiRemoved, Is.True);
+            Assert.That(nonAsciiRemoved, Is.True);
+            Assert.That(nonExistingRemoved, Is.False);
+            Assert.That(charSet.Contains('C'), Is.False);
+            Assert.That(charSet.Contains('中'), Is.False);
+        });
+    }
+
+    // Builds the set from a span, then checks membership, enumeration and Clear().
+    [Test]
+    public void CharSet_CreateFromSpan_GetCharacters_Contains()
+    {
+        char[] asciiAndNonAscii = ['\0', 'A', 'B', 'C', 'Ā', 'Б', '中'];
+        var charSet = new CharSet(asciiAndNonAscii.AsSpan());
+
+        Assert.Multiple(() =>
+        {
+            Assert.That(charSet, Has.Count.EqualTo(asciiAndNonAscii.Length));
+            Assert.That(charSet.Contains('A'), Is.True); // ASCII
+            Assert.That(charSet.Contains('\0'), Is.True); // control character
+            Assert.That(charSet.Contains('中'), Is.True); // non-ASCII
+            Assert.That(charSet.Contains('?'), Is.False);
+            Assert.That(charSet.GetCharacters(), Is.EquivalentTo(asciiAndNonAscii));
+            charSet.Clear();
+            Assert.That(charSet, Has.Count.EqualTo(0));
+            Assert.That(charSet.GetCharacters(), Is.Empty);
+        });
+    }
+}
diff --git a/src/SmartFormat.Tests/Core/ParserTests.cs b/src/SmartFormat.Tests/Core/ParserTests.cs index 
f1d4803d..dc6780bc 100644 --- a/src/SmartFormat.Tests/Core/ParserTests.cs +++ b/src/SmartFormat.Tests/Core/ParserTests.cs @@ -1,11 +1,12 @@ -using NUnit.Framework; -using SmartFormat.Core.Parsing; -using SmartFormat.Core.Settings; -using SmartFormat.Tests.TestUtils; -using System; +using System; +using System.Collections.Generic; using System.IO; using System.Linq; using System.Text.RegularExpressions; +using NUnit.Framework; +using SmartFormat.Core.Parsing; +using SmartFormat.Core.Settings; +using SmartFormat.Tests.TestUtils; namespace SmartFormat.Tests.Core; @@ -66,9 +67,9 @@ public void Parser_Throws_Exceptions(string format) Assert.Throws(() => formatter.Test(format, args, "Error")); } - [TestCase("{V(LU)}")] // braces are illegal - [TestCase("{V LU }")] // blanks are illegal - [TestCase("{VĀLUĒ}")] // 0x100 and 0x112 are illegal chars + [TestCase("{V(LU)}")] // braces are not allowed + [TestCase("{V LU\\}")] // escape char is not allowed + [TestCase("{V?LU,}")] // ? and , are allowed chars public void Parser_Throws_On_Illegal_Selector_Chars(string format) { var parser = GetRegularParser(); @@ -81,9 +82,9 @@ public void Parser_Throws_On_Illegal_Selector_Chars(string format) { Assert.Multiple(() => { - // Throws, because selector contains 2 illegal characters + // Throws, because selector contains disallowed characters Assert.That(e, Is.InstanceOf()); - Assert.That(((ParsingErrors) e).Issues, Has.Count.EqualTo(2)); + Assert.That(((ParsingErrors) e).Issues, Has.Count.GreaterThanOrEqualTo(1)); }); } } @@ -154,6 +155,7 @@ public void Parser_Error_Action_Ignore() // | Literal | Erroneous | | Okay | var invalidTemplate = "Hello, I'm {Name from {City} {Street}"; + // settings must be set before parser instantiation var parser = GetRegularParser(new SmartSettings {Parser = new ParserSettings {ErrorAction = ParseErrorAction.Ignore}}); using var parsed = parser.ParseFormat(invalidTemplate); @@ -176,6 +178,7 @@ public void Parser_Error_Action_Ignore() [TestCase("Hello, 
I'm {Name from {City} {Street", false)] public void Parser_Error_Action_MaintainTokens(string invalidTemplate, bool lastItemIsPlaceholder) { + // settings must be set before parser instantiation var parser = GetRegularParser(new SmartSettings {Parser = new ParserSettings {ErrorAction = ParseErrorAction.MaintainTokens}}); using var parsed = parser.ParseFormat(invalidTemplate); @@ -203,8 +206,16 @@ public void Parser_Error_Action_OutputErrorInResult() { // | Literal | Erroneous | var invalidTemplate = "Hello, I'm {Name from {City}"; - - var parser = GetRegularParser(new SmartSettings {Parser = new ParserSettings {ErrorAction = ParseErrorAction.OutputErrorInResult}}); + + var parser = GetRegularParser(new SmartSettings + { + Parser = new ParserSettings + { + SelectorCharFilter = SelectorFilterType.Alphanumeric, // default + ErrorAction = ParseErrorAction.OutputErrorInResult + } + }); + using var parsed = parser.ParseFormat(invalidTemplate); Assert.That(parsed.Items, Has.Count.EqualTo(1)); @@ -412,11 +423,11 @@ public void Parser_NotifyParsingError() }); formatter.Parser.OnParsingFailure += (o, args) => parsingError = args.Errors; - var res = formatter.Format("{NoName {Other} {Same", default(object)!); + var res = formatter.Format("{NoName {Other} {Same"); Assert.Multiple(() => { Assert.That(parsingError!.Issues, Has.Count.EqualTo(3)); - Assert.That(parsingError.Issues[2].Issue, Is.EqualTo(new Parser.ParsingErrorText()[SmartFormat.Core.Parsing.Parser.ParsingError.MissingClosingBrace])); + Assert.That(parsingError.Issues[2].Issue, Is.EqualTo(new Parser.ParsingErrorText()[Parser.ParsingError.MissingClosingBrace])); }); } @@ -457,6 +468,18 @@ public void Escaping_TheEscapingCharacter_ShouldWork() Assert.That(result, Is.EqualTo(@"\\aaa\{}bbb ccc\x{}ddd\\")); } + [Test] + public void Parsing_Selector_With_CharFromBlocklist_ShouldThrow() + { + var settings = new SmartSettings { Parser = new ParserSettings { SelectorCharFilter = SelectorFilterType.VisualUnicodeChars } }; + 
var parser = GetRegularParser(settings); + + // The newline character is in the default blocklist of disallowed characters + Assert.That(() => parser.ParseFormat("{A\nB}"), + Throws.Exception.InstanceOf().And.Message + .Contains(new Parser.ParsingErrorText()[Parser.ParsingError.InvalidCharactersInSelector])); + } + [Test] public void StringFormat_Escaping_In_Literal() { @@ -534,8 +557,10 @@ public void Parse_Unicode(string formatString, string unicodeLiteral, int itemIn [TestCase("{%C}", '%')] public void Selector_With_Custom_Selector_Character(string formatString, char customChar) { + // settings must be set before parser instantiation var settings = new SmartSettings(); - settings.Parser.AddCustomSelectorChars(new[]{customChar}); + settings.Parser.AddCustomSelectorChars([customChar]); + var x = settings.Parser.GetSelectorChars(); var parser = GetRegularParser(settings); var result = parser.ParseFormat(formatString); @@ -544,7 +569,7 @@ public void Selector_With_Custom_Selector_Character(string formatString, char cu Assert.That(placeholder!.Selectors, Has.Count.EqualTo(1)); Assert.Multiple(() => { - Assert.That(placeholder!.Selectors, Has.Count.EqualTo(placeholder!.GetSelectors().Count)); + Assert.That(placeholder.Selectors, Has.Count.EqualTo(placeholder.GetSelectors().Count)); Assert.That(placeholder.Selectors[0].ToString(), Is.EqualTo(formatString.Substring(1, 2))); }); } @@ -553,8 +578,10 @@ public void Selector_With_Custom_Selector_Character(string formatString, char cu [TestCase("{a°b}", '°')] public void Selectors_With_Custom_Operator_Character(string formatString, char customChar) { - var parser = GetRegularParser(); - parser.Settings.Parser.AddCustomOperatorChars(new[]{customChar}); + // settings must be set before parser instantiation + var settings = new SmartSettings(); + settings.Parser.AddCustomOperatorChars([customChar]); + var parser = GetRegularParser(settings); var result = parser.ParseFormat(formatString); var placeholder = result.Items[0] as 
Placeholder; @@ -568,6 +595,31 @@ public void Selectors_With_Custom_Operator_Character(string formatString, char c }); } + [TestCase("German |öäüßÖÄÜ!")] + [TestCase("Russian абвгдеёжзийклмн")] + [TestCase("French >éèêëçàùâîô")] + [TestCase("Spanish <áéíóúñü¡¿")] + [TestCase("Portuguese !ãõáâêéíóúç")] + [TestCase("Chinese 汉字测试")] + [TestCase("Arabic مرحبا بالعالم")] + [TestCase("Turkish çğöşüİı")] + [TestCase("Hindi नमस्ते दुनिया")] + public void Selector_WorksWithAllUnicodeChars(string selector) + { + // See https://github.com/axuno/SmartFormat/issues/454 + + // settings must be set before parser instantiation + var settings = new SmartSettings { Parser = { SelectorCharFilter = SelectorFilterType.VisualUnicodeChars } }; + const string expected = "The Value"; + // The default formatter with default settings should be able to handle any + // Unicode characters in selectors except the "magic" disallowed ones + var formatter = Smart.CreateDefaultSmartFormat(settings); + // Use the Unicode string as a selector of the placeholder + var template = $"{{{selector}}}"; + var result = formatter.Format(template, new Dictionary { { selector, expected } }); + Assert.That(result, Is.EqualTo(expected)); + } + [TestCase("{A?.B}")] [TestCase("{Selector0?.Selector1}")] [TestCase("{A?[1].B}")] @@ -622,10 +674,11 @@ public void Selector_With_Nullable_Operator_Character(string formatString) public void Selector_With_Other_Contiguous_Operator_Characters(string formatString, char customChar) { // contiguous operator characters are parsed as "ONE operator string" - - var parser = GetRegularParser(); + var settings = new SmartSettings(); + settings.Parser.AddCustomOperatorChars([customChar]); + var parser = GetRegularParser(settings); // adding '.' 
is ignored, as it's a standard operator - parser.Settings.Parser.AddCustomOperatorChars(new[]{customChar}); + parser.Settings.Parser.AddCustomOperatorChars([customChar]); var result = parser.ParseFormat(formatString); var placeholder = result.Items[0] as Placeholder; @@ -681,6 +734,41 @@ public void ParseInputAsHtml(string input) Assert.That(literalText!.RawText, Is.EqualTo(input)); } + #region * Parse HTML input without ParserSetting 'IsHtml' + + /// + /// is : + /// all characters are allowed in selectors + /// + [TestCase("", "{Placeholder}")] + [TestCase("", "{Placeholder}")] + [TestCase("Something ! nice", "{ color : #000; }")] + [TestCase("Something ';}! nice", "{const a = '';}")] + public void ParseHtmlInput_Without_ParserSetting_IsHtml(string input, string selector) + { + var parser = GetRegularParser(new SmartSettings + { + StringFormatCompatibility = false, + Parser = new ParserSettings + { + SelectorCharFilter = SelectorFilterType.VisualUnicodeChars, + ErrorAction = ParseErrorAction.ThrowError, + ParseInputAsHtml = false + } + }); + + var result = parser.ParseFormat(input); + Assert.Multiple(() => + { + Assert.That(result.Items, Has.Count.EqualTo(3)); + Assert.That(((Placeholder) result.Items[1]).RawText, Is.EqualTo(selector)); + }); + } + + /// + /// is : + /// Predefined set of allowed characters in selectors + /// [TestCase("", false)] // should parse a placeholder [TestCase("", false)] // should parse a placeholder [TestCase("Something ! 
nice", true)] // illegal selector chars @@ -690,7 +778,12 @@ public void ParseHtmlInput_Without_ParserSetting_IsHtml(string input, bool shoul var parser = GetRegularParser(new SmartSettings { StringFormatCompatibility = false, - Parser = new ParserSettings { ErrorAction = ParseErrorAction.ThrowError, ParseInputAsHtml = false } + Parser = new ParserSettings + { + SelectorCharFilter = SelectorFilterType.Alphanumeric, + ErrorAction = ParseErrorAction.ThrowError, + ParseInputAsHtml = false + } }); switch (shouldThrow) @@ -707,6 +800,8 @@ public void ParseHtmlInput_Without_ParserSetting_IsHtml(string input, bool shoul } } + #endregion + /// /// SmartFormat is able to parse script tags, if is /// @@ -807,29 +902,31 @@ function interpolationSearch(sortedArray, seekIndex) { [TestCase(true, false)] public void StyleTags_Can_Be_Parsed_Without_Failure(bool inputIsHtml, bool shouldFail) { - var styles = @" - -

############### {TheVariable} ###############

-"; + .comment img { + border: 1px solid grey; + anything: 'xyz' + } + + .list-item { + border-bottom: 1px solid grey; + } + /* Comment: { which mixes up the parser without ParserSettings.ParseInputAsHtml = true */ + +

############### {TheVariable} ###############

+ + """; var parsingFailures = 0; var parser = GetRegularParser(new SmartSettings { diff --git a/src/SmartFormat.Tests/Core/SettingsTests.cs b/src/SmartFormat.Tests/Core/SettingsTests.cs index 74e93c1f..38140348 100644 --- a/src/SmartFormat.Tests/Core/SettingsTests.cs +++ b/src/SmartFormat.Tests/Core/SettingsTests.cs @@ -11,20 +11,38 @@ public class SettingsTests public void TryingToAddDisallowedSelectorCharacters_Should_Throw() { var settings = new SmartSettings(); - Assert.That(() => settings.Parser.AddCustomSelectorChars(new[] {settings.Parser.PlaceholderBeginChar}), - Throws.ArgumentException.And.Message.Contains($"{settings.Parser.PlaceholderBeginChar}")); + Assert.That(() => settings.Parser.AddCustomSelectorChars([ParserSettings.PlaceholderBeginChar]), + Throws.ArgumentException.And.Message.Contains($"{ParserSettings.PlaceholderBeginChar}")); } [Test] public void ExistingSelectorCharacter_Should_Not_Be_Added() { var settings = new SmartSettings(); - settings.Parser.AddCustomSelectorChars(new[] {'A', ' '}); - settings.Parser.AddCustomSelectorChars(new[] {' '}); + settings.Parser.AddCustomSelectorChars(['A', ' ']); Assert.Multiple(() => { - Assert.That(settings.Parser.CustomSelectorChars().Count(c => c == 'A'), Is.EqualTo(0)); - Assert.That(settings.Parser.CustomSelectorChars().Count(c => c == ' '), Is.EqualTo(1)); + Assert.That(settings.Parser.CustomSelectorChars.Count(c => c == 'A'), Is.EqualTo(0)); + Assert.That(settings.Parser.CustomSelectorChars.Count(c => c == ' '), Is.EqualTo(1)); + }); + } + + [TestCase(SelectorFilterType.Alphanumeric)] + [TestCase(SelectorFilterType.VisualUnicodeChars)] + public void NonVisualCharacters_Should_Be_AddedBack_As_SelectorChars(SelectorFilterType filterType) + { + var settings = new SmartSettings { Parser = { SelectorCharFilter = filterType } }; + var nonVisualChars = ParserSettings.NonVisualUnicodeCharacters; + settings.Parser.AddCustomSelectorChars(nonVisualChars); + + Assert.Multiple(() => + { + 
Assert.That(settings.Parser.CustomSelectorChars, Has.Count.EqualTo(nonVisualChars.Length)); + foreach (var c in settings.Parser.CustomSelectorChars) + { + Assert.That(settings.Parser.GetSelectorChars(), filterType == SelectorFilterType.Alphanumeric ? Does.Contain(c) : Does.Not.Contain(c), + $"Character U+{(int) c:X4} should be allowed as selector char."); + } }); } @@ -32,45 +50,59 @@ public void ExistingSelectorCharacter_Should_Not_Be_Added() public void TryingToAddDisallowedOperatorCharacters_Should_Throw() { var settings = new SmartSettings(); - Assert.That(() => settings.Parser.AddCustomOperatorChars(new[] {settings.Parser.PlaceholderBeginChar}), - Throws.ArgumentException.And.Message.Contains($"{settings.Parser.PlaceholderBeginChar}")); + Assert.That(() => settings.Parser.AddCustomOperatorChars([ParserSettings.PlaceholderBeginChar]), + Throws.ArgumentException.And.Message.Contains($"{ParserSettings.PlaceholderBeginChar}")); } [Test] public void ExistingOperatorCharacter_Should_Not_Be_Added() { var settings = new SmartSettings(); - settings.Parser.AddCustomOperatorChars(new[] {settings.Parser.OperatorChars()[0], '°'}); - settings.Parser.AddCustomOperatorChars(new[] {'°'}); + settings.Parser.AddCustomOperatorChars([ParserSettings.OperatorChars[0], '°']); + settings.Parser.AddCustomOperatorChars(['°']); Assert.Multiple(() => { - Assert.That(settings.Parser.CustomOperatorChars().Count(c => c == settings.Parser.OperatorChars()[0]), Is.EqualTo(0)); - Assert.That(settings.Parser.CustomOperatorChars().Count(c => c == '°'), Is.EqualTo(1)); + Assert.That(settings.Parser.CustomOperatorChars.Count(c => c == ParserSettings.OperatorChars[0]), Is.EqualTo(0)); + Assert.That(settings.Parser.CustomOperatorChars.Count(c => c == '°'), Is.EqualTo(1)); }); } - [TestCase('°')] // a custom char - [TestCase('A')] // a standard selector char - public void Add_CustomOperator_Used_As_Separator_Should_Throw(char operatorChar) + [TestCase('{')] + [TestCase('}')] + [TestCase(':')] + 
[TestCase('(')] + [TestCase(')')] + public void Add_Separators_As_Custom_Operator_Should_Throw(char operatorChar) { var settings = new SmartSettings(); - settings.Parser.AddCustomSelectorChars(new[] {operatorChar}); // reserve as selector char // try to add the same char as operator - Assert.That(() => settings.Parser.AddCustomOperatorChars(new[] {operatorChar}), - Throws.ArgumentException.And.Message.Contains($"{operatorChar}")); + Assert.That(() => settings.Parser.AddCustomOperatorChars([operatorChar]), + Throws.ArgumentException.And.Message.Contains($"'{operatorChar}'")); } - [TestCase('°')] // a custom char + [TestCase('°')] // a custom selector char [TestCase('.')] // a standard operator char public void Add_CustomSelector_Used_As_Operator_Should_Throw(char selectorChar) { var settings = new SmartSettings(); - settings.Parser.AddCustomOperatorChars(new[] {selectorChar}); // reserve as operator char + settings.Parser.AddCustomOperatorChars([selectorChar]); // reserve as operator char // try to add the same char as selector - Assert.That(() => settings.Parser.AddCustomSelectorChars(new[] {selectorChar}), + Assert.That(() => settings.Parser.AddCustomSelectorChars([selectorChar]), Throws.ArgumentException.And.Message.Contains($"{selectorChar}")); } + + [TestCase((char) 127)] // U+007F (DEL) + [TestCase((char) 30)] // U+001E (record separator) + public void Add_CustomOperator_Used_As_Selector_Should_Throw(char operatorChar) + { + var settings = new SmartSettings(); + settings.Parser.AddCustomSelectorChars([operatorChar]); // reserve as selector char + + // try to add the same char as operator + Assert.That(() => settings.Parser.AddCustomOperatorChars([operatorChar]), + Throws.ArgumentException.And.Message.Contains($"{operatorChar}")); + } } diff --git a/src/SmartFormat/Core/Extensions/Source.cs b/src/SmartFormat/Core/Extensions/Source.cs index c8d06c65..d73f4682 100644 --- a/src/SmartFormat/Core/Extensions/Source.cs +++ b/src/SmartFormat/Core/Extensions/Source.cs 
@@ -49,7 +49,7 @@ private bool HasNullableOperator(ISelectorInfo selectorInfo) #pragma warning disable S3267 // Don't use LINQ in favor of less GC foreach (var s in selectorInfo.Placeholder.Selectors) { - if (s.OperatorLength > 1 && s.BaseString[s.OperatorStartIndex] == _smartSettings.Parser.NullableOperator) + if (s.OperatorLength > 1 && s.BaseString[s.OperatorStartIndex] == ParserSettings.NullableOperator) return true; } #pragma warning restore S3267 // Restore: Loops should be simplified with "LINQ" expressions
diff --git a/src/SmartFormat/Core/Parsing/CharSet.cs b/src/SmartFormat/Core/Parsing/CharSet.cs
new file mode 100644
index 00000000..a4804294
--- /dev/null
+++ b/src/SmartFormat/Core/Parsing/CharSet.cs
@@ -0,0 +1,229 @@
+// Copyright SmartFormat Project maintainers and contributors.
+// Licensed under the MIT license.
+
+using System;
+using System.Collections;
+using System.Collections.Generic;
+
+namespace SmartFormat.Core.Parsing;
+
+/// <summary>
+/// Represents a set of characters that supports efficient storage and lookup
+/// for both ASCII and non-ASCII characters.
+/// </summary>
+/// <remarks>
+/// The class is optimized for handling ASCII characters using a bitmap
+/// representation, while non-ASCII characters are stored in a separate collection.
+/// <para/>
+/// The class provides methods to add characters individually or in bulk, remove characters,
+/// check for containment, and enumerate all characters in the set. ASCII characters are
+/// enumerated first in numerical order, followed by non-ASCII characters in no guaranteed order.
+/// <para/>
+/// This class is not thread-safe.
+/// </remarks>
+internal class CharSet : IEnumerable<char>
+{
+    private const int ASCII_LIMIT = 128;
+    private const int BITS_PER_UINT = 32;
+    private const int BITMAP_LENGTH = ASCII_LIMIT / BITS_PER_UINT;
+
+    // One bit per ASCII code point (0..127), packed into 32-bit words.
+    private readonly uint[] _asciiBitmap = new uint[BITMAP_LENGTH];
+    // All characters with a code point of 128 or above.
+    private readonly HashSet<char> _nonAsciiChars = [];
+
+    /// <summary>
+    /// Gets or sets a value indicating whether the list is
+    /// an allowlist (<see langword="true"/>) or a blocklist (<see langword="false"/>).
+    /// </summary>
+    /// <remarks>
+    /// The property is not initialized by this class, so a new instance
+    /// reports <see langword="false"/> until a caller sets it explicitly.
+    /// </remarks>
+    public bool IsAllowList { get; set; }
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="CharSet"/> class that is empty.
+    /// </summary>
+    public CharSet()
+    {}
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="CharSet"/> class that contains the characters
+    /// from the specified read-only span.
+    /// </summary>
+    /// <param name="characters">The read-only span containing characters to add to the set.</param>
+    public CharSet(ReadOnlySpan<char> characters)
+    {
+        AddRange(characters);
+    }
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="CharSet"/> class that contains the characters
+    /// from the specified collection.
+    /// </summary>
+    /// <param name="characters">The collection of characters to add to the set.</param>
+    /// <exception cref="ArgumentNullException">Thrown when <paramref name="characters"/> is null.</exception>
+    public CharSet(IEnumerable<char> characters)
+    {
+        AddRange(characters);
+    }
+
+    /// <summary>
+    /// Adds all characters from the specified read-only span to the current set.
+    /// Only adds characters that aren't already present in the set.
+    /// </summary>
+    /// <param name="characters">The read-only span containing characters to add.</param>
+    public void AddRange(ReadOnlySpan<char> characters)
+    {
+        foreach (var ch in characters)
+            Add(ch);
+    }
+
+    /// <summary>
+    /// Adds all characters from the specified collection to the current set.
+    /// Only adds characters that aren't already present in the set.
+    /// </summary>
+    /// <param name="characters">The collection of characters to add.</param>
+    /// <exception cref="ArgumentNullException">Thrown when <paramref name="characters"/> is null.</exception>
+    public void AddRange(IEnumerable<char> characters)
+    {
+        // Fail with the documented exception instead of a NullReferenceException from foreach.
+        if (characters is null) throw new ArgumentNullException(nameof(characters));
+
+        foreach (var ch in characters)
+            Add(ch);
+    }
+
+    /// <summary>
+    /// Adds the specified character to the current set.
+    /// Only adds a character that isn't already present in the set.
+    /// </summary>
+    /// <param name="c">The character to add.</param>
+    public void Add(char c)
+    {
+        if (c < ASCII_LIMIT)
+            _asciiBitmap[c / BITS_PER_UINT] |= 1u << (c % BITS_PER_UINT);
+        else
+            _nonAsciiChars.Add(c);
+    }
+
+    /// <summary>
+    /// Removes the specified character from the current set.
+    /// </summary>
+    /// <param name="c">The character to remove.</param>
+    /// <returns>
+    /// <see langword="true"/> if the character was successfully found and removed;
+    /// otherwise, <see langword="false"/>.
+    /// </returns>
+    public bool Remove(char c)
+    {
+        if (c < ASCII_LIMIT)
+        {
+            ref var bitmap = ref _asciiBitmap[c / BITS_PER_UINT];
+            var mask = 1u << (c % BITS_PER_UINT);
+
+            if ((bitmap & mask) == 0) return false;
+
+            bitmap &= ~mask;
+            return true;
+        }
+
+        return _nonAsciiChars.Remove(c);
+    }
+
+    /// <summary>
+    /// Determines whether the current set contains the specified character.
+    /// </summary>
+    /// <param name="c">The character to locate in the set.</param>
+    /// <returns>
+    /// <see langword="true"/> if the set contains the specified character; otherwise, <see langword="false"/>.
+    /// </returns>
+    public bool Contains(char c)
+    {
+        if (c < ASCII_LIMIT)
+            return (_asciiBitmap[c / BITS_PER_UINT] & (1u << (c % BITS_PER_UINT))) != 0;
+
+        return _nonAsciiChars.Contains(c);
+    }
+
+    /// <summary>
+    /// Removes all characters from the current set.
+    /// </summary>
+    public void Clear()
+    {
+        Array.Clear(_asciiBitmap, 0, _asciiBitmap.Length);
+        _nonAsciiChars.Clear();
+    }
+
+    /// <summary>
+    /// Gets the number of characters contained in the set.
+    /// </summary>
+    /// <value>The number of characters in the set.</value>
+    public int Count
+    {
+        get
+        {
+            var count = 0;
+
+            // Count ASCII characters using bit population count
+            foreach (var segment in _asciiBitmap)
+                count += BitCount(segment);
+
+            return count + _nonAsciiChars.Count;
+        }
+    }
+
+    /// <summary>
+    /// Returns the characters in the set as a lazily evaluated sequence.
+    /// </summary>
+    /// <returns>A sequence that yields every character in the set.</returns>
+    /// <remarks>
+    /// The enumeration returns ASCII characters first (in numerical order), followed by non-ASCII characters
+    /// (in no guaranteed order).
+    /// </remarks>
+    public IEnumerable<char> GetCharacters()
+    {
+        for (var i = 0; i < ASCII_LIMIT; i++)
+            if ((_asciiBitmap[i / BITS_PER_UINT] & (1u << (i % BITS_PER_UINT))) != 0)
+                yield return (char) i;
+
+        foreach (var c in _nonAsciiChars)
+            yield return c;
+    }
+
+    /// <summary>
+    /// Helper method to count set bits in an uint (Hamming weight)
+    /// </summary>
+    /// <param name="value">The unsigned integer value to count bits in.</param>
+    /// <returns>The number of bits set to 1 in the specified value.</returns>
+    private static int BitCount(uint value)
+    {
+        // SWAR (SIMD Within A Register) technique for counting the number
+        // of set bits (1s) in a 32-bit unsigned integer.
+        // The final multiplication overflows 32 bits on purpose, so the whole
+        // computation is pinned to unchecked arithmetic regardless of build settings.
+        unchecked
+        {
+            // Count bits in pairs.
+            // Subtracts each pair of bits from itself shifted right by one, masked to isolate alternating bits.
+            value -= (value >> 1) & 0x55555555u;
+            // Count bits in 4-bit groups. Adds adjacent 2-bit counts to form 4-bit counts.
+            value = (value & 0x33333333u) + ((value >> 2) & 0x33333333u);
+            // Aggregate all 4-bit counts into a single total (top byte of the product).
+            return (int) ((((value + (value >> 4)) & 0x0F0F0F0Fu) * 0x01010101u) >> 24);
+        }
+    }
+
+    /// <inheritdoc />
+    public IEnumerator<char> GetEnumerator()
+    {
+        return GetCharacters().GetEnumerator();
+    }
+
+    /// <inheritdoc />
+    IEnumerator IEnumerable.GetEnumerator()
+    {
+        return GetEnumerator();
+    }
+}
diff --git a/src/SmartFormat/Core/Parsing/Parser.cs b/src/SmartFormat/Core/Parsing/Parser.cs index 6830a2be..2360cf9f 100644 --- a/src/SmartFormat/Core/Parsing/Parser.cs +++ b/src/SmartFormat/Core/Parsing/Parser.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using SmartFormat.Core.Settings; using SmartFormat.Pooling.SmartPools; @@ -33,11 +34,11 @@ public class Parser public SmartSettings Settings { get; } // Cache method results from settings - private readonly List _operatorChars; - private readonly List _customOperatorChars; + private readonly CharSet _operatorChars; + private readonly CharSet _customOperatorChars; private readonly ParserSettings 
_parserSettings; - private readonly List _validSelectorChars; - private readonly List _formatOptionsTerminatorChars; + private readonly CharSet _selectorChars; + private readonly CharSet _formatOptionsTerminatorChars; #endregion @@ -63,14 +64,11 @@ public Parser(SmartSettings? smartSettings = null) { Settings = smartSettings ?? new SmartSettings(); _parserSettings = Settings.Parser; - _operatorChars = _parserSettings.OperatorChars(); - _customOperatorChars = _parserSettings.CustomOperatorChars(); - _formatOptionsTerminatorChars = _parserSettings.FormatOptionsTerminatorChars(); - - _validSelectorChars = new List(); - _validSelectorChars.AddRange(_parserSettings.SelectorChars()); - _validSelectorChars.AddRange(_parserSettings.OperatorChars()); - _validSelectorChars.AddRange(_parserSettings.CustomSelectorChars()); + _operatorChars = new CharSet(ParserSettings.OperatorChars.AsSpan()) ; + _customOperatorChars = new CharSet(_parserSettings.CustomOperatorChars); + _formatOptionsTerminatorChars = new CharSet(ParserSettings.FormatOptionsTerminatorChars.AsSpan()); + // Selector chars can be an allowlist or blocklist: + _selectorChars = _parserSettings.GetSelectorChars(); } #endregion @@ -81,6 +79,7 @@ public Parser(SmartSettings? smartSettings = null) /// Includes a-z and A-Z in the list of allowed selector chars. 
/// [Obsolete("Alphanumeric selectors are always enabled", true)] + [ExcludeFromCodeCoverage] public void AddAlphanumericSelectors() { // Do nothing - this is the standard behavior @@ -91,6 +90,7 @@ public void AddAlphanumericSelectors() /// /// [Obsolete("Use 'Settings.Parser.AddCustomSelectorChars' instead.", true)] + [ExcludeFromCodeCoverage] public void AddAdditionalSelectorChars(string chars) { _parserSettings.AddCustomSelectorChars(chars.ToCharArray()); @@ -103,6 +103,7 @@ public void AddAdditionalSelectorChars(string chars) /// /// [Obsolete("Use 'Settings.Parser.AddCustomOperatorChars' instead.", true)] + [ExcludeFromCodeCoverage] public void AddOperators(string chars) { _parserSettings.AddCustomOperatorChars(chars.ToCharArray()); @@ -115,6 +116,7 @@ public void AddOperators(string chars) /// /// Defaults to backslash [Obsolete("Use 'Settings.StringFormatCompatibility' instead.", true)] + [ExcludeFromCodeCoverage] public void UseAlternativeEscapeChar(char alternativeEscapeChar = '\\') { if (alternativeEscapeChar != _parserSettings.CharLiteralEscapeChar) @@ -132,6 +134,7 @@ public void UseAlternativeEscapeChar(char alternativeEscapeChar = '\\') /// backslash. 
/// [Obsolete("Use 'Settings.StringFormatCompatibility' instead.", true)] + [ExcludeFromCodeCoverage] public void UseBraceEscaping() { throw new NotSupportedException($"Init-only property {nameof(Settings)}.{nameof(Settings.StringFormatCompatibility)} can only be set in an object initializer"); @@ -143,6 +146,7 @@ public void UseBraceEscaping() /// /// [Obsolete("This feature has been removed", true)] + [ExcludeFromCodeCoverage] public void UseAlternativeBraces(char opening, char closing) { throw new NotSupportedException("This feature has been removed"); @@ -242,19 +246,19 @@ private void ProcessLiteralText(char inputChar, ParserState state, ParsingErrors return; } - if (inputChar == _parserSettings.PlaceholderBeginChar) + if (inputChar == ParserSettings.PlaceholderBeginChar) { AddLiteralCharsParsedBefore(state); - if (EscapeLikeStringFormat(_parserSettings.PlaceholderBeginChar, state)) return; + if (EscapeLikeStringFormat(ParserSettings.PlaceholderBeginChar, state)) return; // Context transition CreateNewPlaceholder(ref nestedDepth, state, out currentPlaceholder); currentContext = ParseContext.SelectorHeader; } - else if (inputChar == _parserSettings.PlaceholderEndChar) + else if (inputChar == ParserSettings.PlaceholderEndChar) { AddLiteralCharsParsedBefore(state); - if (EscapeLikeStringFormat(_parserSettings.PlaceholderEndChar, state)) return; + if (EscapeLikeStringFormat(ParserSettings.PlaceholderEndChar, state)) return; if (HasProcessedTooManyClosingBraces(parsingErrors, state)) return; // End of a nested placeholder's Format. 
@@ -295,7 +299,7 @@ private void ProcessSelector(char inputChar, ParserState state, ParsingErrors pa } state.Index.LastEnd = state.Index.SafeAdd(state.Index.Current, 1); } - else if (inputChar == _parserSettings.FormatterNameSeparator) + else if (inputChar == ParserSettings.FormatterNameSeparator) { AddLastSelector(ref currentPlaceholder, state, parsingErrors); @@ -311,7 +315,7 @@ private void ProcessSelector(char inputChar, ParserState state, ParsingErrors pa // We are now parsing the literal text *inside* the placeholder's format. currentContext = ParseContext.LiteralText; } - else if (inputChar == _parserSettings.PlaceholderEndChar) + else if (inputChar == ParserSettings.PlaceholderEndChar) { AddLastSelector(ref currentPlaceholder, state, parsingErrors); @@ -326,11 +330,28 @@ private void ProcessSelector(char inputChar, ParserState state, ParsingErrors pa else { // Ensure the selector characters are valid: - if (!_validSelectorChars.Contains(inputChar)) - parsingErrors.AddIssue(state.ResultFormat, - $"'0x{Convert.ToUInt32(inputChar):X}': " + - _parsingErrorText[ParsingError.InvalidCharactersInSelector], - state.Index.Current, state.Index.SafeAdd(state.Index.Current, 1)); + if (_selectorChars.IsAllowList) + { + // Only allow specific characters + if (!_selectorChars.Contains(inputChar)) + { + parsingErrors.AddIssue(state.ResultFormat, + $"'0x{Convert.ToUInt32(inputChar):X}': " + + _parsingErrorText[ParsingError.InvalidCharactersInSelector], + state.Index.Current, state.Index.SafeAdd(state.Index.Current, 1)); + } + } + else + { + // Blocklist: Disallow specific characters + if (_selectorChars.Contains(inputChar)) + { + parsingErrors.AddIssue(state.ResultFormat, + $"'0x{Convert.ToUInt32(inputChar):X}': " + + _parsingErrorText[ParsingError.InvalidCharactersInSelector], + state.Index.Current, state.Index.SafeAdd(state.Index.Current, 1)); + } + } } } @@ -468,8 +489,8 @@ private void ParseAlternativeEscaping(ParserState state) throw new ArgumentException($"Unrecognized 
escape sequence at the end of the literal"); // **** Alternative brace escaping with { or } following the escape character **** - if (state.InputFormat[indexNextChar] == _parserSettings.PlaceholderBeginChar || - state.InputFormat[indexNextChar] == _parserSettings.PlaceholderEndChar) + if (state.InputFormat[indexNextChar] == ParserSettings.PlaceholderBeginChar || + state.InputFormat[indexNextChar] == ParserSettings.PlaceholderEndChar) { // Finish the last text item: if (state.Index.Current != state.Index.LastEnd) @@ -512,7 +533,7 @@ private void ParseAlternativeEscaping(ParserState state) private bool ParseNamedFormatter(ParserState state) { var inputChar = state.InputFormat[state.Index.Current]; - if (inputChar == _parserSettings.FormatterOptionsBeginChar) + if (inputChar == ParserSettings.FormatterOptionsBeginChar) { var emptyName = state.Index.NamedFormatterStart == state.Index.Current; if (emptyName) @@ -524,16 +545,16 @@ private bool ParseNamedFormatter(ParserState state) // Note: This short-circuits the Parser.ParseFormat main loop ParseFormatOptions(state); } - else if (inputChar == _parserSettings.FormatterOptionsEndChar || inputChar == _parserSettings.FormatterNameSeparator) + else if (inputChar == ParserSettings.FormatterOptionsEndChar || inputChar == ParserSettings.FormatterNameSeparator) { - if (inputChar == _parserSettings.FormatterOptionsEndChar) + if (inputChar == ParserSettings.FormatterOptionsEndChar) { var hasOpeningParenthesis = state.Index.NamedFormatterOptionsStart != PositionUndefined; // ensure no trailing chars past ')' var nextCharIndex = state.Index.SafeAdd(state.Index.Current, 1); var nextCharIsValid = nextCharIndex < state.InputFormat.Length && - (state.InputFormat[nextCharIndex] == _parserSettings.FormatterNameSeparator || state.InputFormat[nextCharIndex] == _parserSettings.PlaceholderEndChar); + (state.InputFormat[nextCharIndex] == ParserSettings.FormatterNameSeparator || state.InputFormat[nextCharIndex] == 
ParserSettings.PlaceholderEndChar); if (!hasOpeningParenthesis || !nextCharIsValid) { @@ -543,7 +564,7 @@ private bool ParseNamedFormatter(ParserState state) state.Index.NamedFormatterOptionsEnd = state.Index.Current; - if (state.InputFormat[nextCharIndex] == _parserSettings.FormatterNameSeparator) state.Index.Current++; + if (state.InputFormat[nextCharIndex] == ParserSettings.FormatterNameSeparator) state.Index.Current++; } var nameIsEmpty = state.Index.NamedFormatterStart == state.Index.Current; @@ -604,8 +625,8 @@ private void AddLastSelector(ref Placeholder currentPlaceholder, ParserState sta if (state.Index.Current != state.Index.LastEnd || currentPlaceholder.Selectors.Count > 0 && currentPlaceholder.Selectors[currentPlaceholder.Selectors.Count - 1].Length > 0 && state.Index.Current - state.Index.Operator == 1 && - (state.InputFormat[state.Index.Operator] == _parserSettings.ListIndexEndChar || - state.InputFormat[state.Index.Operator] == _parserSettings.NullableOperator)) + (state.InputFormat[state.Index.Operator] == ParserSettings.ListIndexEndChar || + state.InputFormat[state.Index.Operator] == ParserSettings.NullableOperator)) currentPlaceholder.AddSelector(SelectorPool.Instance.Get().Initialize(Settings, currentPlaceholder, state.InputFormat, state.Index.LastEnd, state.Index.Current, state.Index.Operator, state.Index.Selector)); else if (state.Index.Operator != state.Index.Current) parsingErrors.AddIssue(state.ResultFormat, diff --git a/src/SmartFormat/Core/Parsing/Placeholder.cs b/src/SmartFormat/Core/Parsing/Placeholder.cs index 7b464ade..d31a48d7 100644 --- a/src/SmartFormat/Core/Parsing/Placeholder.cs +++ b/src/SmartFormat/Core/Parsing/Placeholder.cs @@ -5,6 +5,7 @@ using System; using System.Buffers; using System.Collections.Generic; +using SmartFormat.Core.Settings; using SmartFormat.Pooling.ObjectPools; using SmartFormat.Pooling.SmartPools; using SmartFormat.ZString; @@ -138,7 +139,7 @@ internal void AddSelector(Selector selector) // 1. 
The operator character must have a value, usually ',' // 2. The alignment is an integer value if (selector.OperatorLength > 0 - && selector.Operator[0] == SmartSettings.Parser.AlignmentOperator + && selector.Operator[0] == ParserSettings.AlignmentOperator && int.TryParse(selector.RawText, out var alignment)) { Alignment = alignment; @@ -231,41 +232,41 @@ public override string ToString() using var buffer = new ZCharArray(Length + 2); // +2 for the braces - buffer.Write(SmartSettings.Parser.PlaceholderBeginChar); + buffer.Write(ParserSettings.PlaceholderBeginChar); foreach (var s in Selectors) { // alignment operators will be appended later - if (s.Operator.Length > 0 && s.Operator[0] == SmartSettings.Parser.AlignmentOperator) continue; + if (s.Operator.Length > 0 && s.Operator[0] == ParserSettings.AlignmentOperator) continue; var selectorSpan = s.BaseString.AsSpan(s.OperatorStartIndex, s.EndIndex - s.OperatorStartIndex); buffer.Write(selectorSpan); } if (Alignment != 0) { - buffer.Write(SmartSettings.Parser.AlignmentOperator); + buffer.Write(ParserSettings.AlignmentOperator); buffer.Write(Alignment.ToString()); } if (FormatterName != string.Empty) { - buffer.Write(SmartSettings.Parser.FormatterNameSeparator); + buffer.Write(ParserSettings.FormatterNameSeparator); buffer.Write(FormatterName); if (FormatterOptions != string.Empty) { - buffer.Write(SmartSettings.Parser.FormatterOptionsBeginChar); + buffer.Write(ParserSettings.FormatterOptionsBeginChar); buffer.Write(FormatterOptions); - buffer.Write(SmartSettings.Parser.FormatterOptionsEndChar); + buffer.Write(ParserSettings.FormatterOptionsEndChar); } } if (Format != null) { - buffer.Write(SmartSettings.Parser.FormatterNameSeparator); + buffer.Write(ParserSettings.FormatterNameSeparator); buffer.Write(Format.AsSpan()); } - buffer.Write(SmartSettings.Parser.PlaceholderEndChar); + buffer.Write(ParserSettings.PlaceholderEndChar); #if NETSTANDARD2_1 || NET6_0_OR_GREATER _toStringCache = new string(buffer.GetSpan()); diff 
--git a/src/SmartFormat/Core/Settings/ParserSettings.cs b/src/SmartFormat/Core/Settings/ParserSettings.cs index f454d302..291325d7 100644 --- a/src/SmartFormat/Core/Settings/ParserSettings.cs +++ b/src/SmartFormat/Core/Settings/ParserSettings.cs @@ -16,10 +16,11 @@ namespace SmartFormat.Core.Settings; /// public class ParserSettings { - private readonly List _alphanumericSelectorChars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-".ToList(); + private readonly List _customSelectorChars = []; + private readonly List _customOperatorChars = []; + private SelectorFilterType _selectorCharFilter = SelectorFilterType.Alphanumeric; - private readonly List _customSelectorChars = new List(); - private readonly List _customOperatorChars = new List(); + private const string StandardAllowlist = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-"; /// /// Gets or sets the to use for the . @@ -28,67 +29,74 @@ public class ParserSettings public ParseErrorAction ErrorAction { get; set; } = ParseErrorAction.ThrowError; /// - /// The list of standard selector characters. + /// Gets a read-only list of the custom selector characters, which were set with . /// - internal List SelectorChars() => _alphanumericSelectorChars; + internal List CustomSelectorChars => _customSelectorChars; /// - /// Gets a read-only list of the custom selector characters, which were set with . + /// Gets a list of the custom operator characters, which were set with . + /// Contiguous operator characters are parsed as one operator (e.g. '?.'). /// - internal List CustomSelectorChars() => _customSelectorChars; + internal List CustomOperatorChars => _customOperatorChars; /// - /// Gets a list of characters which are allowed in a selector. + /// When (default) is set, an allowlist of selector characters is used. + /// The allowlist contains alphanumeric characters (upper and lower case), plus '_' and '-'. + /// On top, any custom selector characters added with are included. 
+ /// + /// When <see cref="SelectorFilterType.VisualUnicodeChars"/> is set, all Unicode characters are allowed in a selector, + /// except 68 non-visual characters: Control Characters (U+0000–U+001F, U+007F), Format Characters (Category: Cf), + /// Directional Formatting (Category: Cf), Invisible Separator, Common Combining Marks (Category: Mn), + /// Whitespace Characters (non-glyph spacing).
+ /// Excluded characters can be added back using <see cref="AddCustomSelectorChars"/>. + /// + /// {}[]()\.?,: are characters with special functions that are never allowed. + /// + /// Changing this setting clears any custom operator characters added with <see cref="AddCustomOperatorChars"/>. ///
- internal List DisallowedSelectorChars() + public SelectorFilterType SelectorCharFilter { - var chars = new List { - CharLiteralEscapeChar, FormatterNameSeparator, AlignmentOperator, SelectorOperator, - PlaceholderBeginChar, PlaceholderEndChar, FormatterOptionsBeginChar, FormatterOptionsEndChar - }; - chars.AddRange(OperatorChars()); - return chars; + get + { + return _selectorCharFilter; + } + set + { + _selectorCharFilter = value; + _customOperatorChars.Clear(); + } } /// - /// Gets a read-only list of the custom operator characters, which were set with . - /// Contiguous operator characters are parsed as one operator (e.g. '?.'). + /// The list of characters for a selector. + /// This can be an allowlist, which contains explicitly allowed characters, + /// or a blocklist, when all Unicode characters are allowed, except those from the blocklist. /// - internal List CustomOperatorChars() => _customOperatorChars; + internal CharSet GetSelectorChars() => SelectorCharFilter == SelectorFilterType.Alphanumeric ? CreateAllowlist() : CreateBlocklist(); - /// - /// Add a list of allowable selector characters on top of the setting. - /// This can be useful to support additional selector syntax such as math. - /// Characters in cannot be added. - /// Operator chars and selector chars must be different. - /// - public void AddCustomSelectorChars(IList characters) + private CharSet CreateBlocklist() { - foreach (var c in characters) - { - if (DisallowedSelectorChars().Contains(c) || _customOperatorChars.Contains(c)) - throw new ArgumentException($"Cannot add '{c}' as a custom selector character. 
It is disallowed or in use as an operator."); + var chars = new CharSet { + CharLiteralEscapeChar // avoid confusion with escape sequences + }; + chars.IsAllowList = false; + chars.AddRange(SelectorDelimitingChars.AsSpan()); + chars.AddRange(OperatorChars.AsSpan()); // no overlaps + chars.AddRange(_customOperatorChars); // no overlaps + chars.AddRange(NonVisualUnicodeCharacters.AsSpan()); - if (!_customSelectorChars.Contains(c) && !_alphanumericSelectorChars.Contains(c)) - _customSelectorChars.Add(c); - } + // Remove characters used as custom selector chars from the blocklist + foreach (var c in _customSelectorChars) chars.Remove(c); + return chars; } - /// - /// Add a list of allowable operator characters on top of the standard setting. - /// Operator chars and selector chars must be different. - /// - public void AddCustomOperatorChars(IList characters) + private CharSet CreateAllowlist() { - foreach (var c in characters) - { - if(DisallowedSelectorChars().Where(_ => OperatorChars().TrueForAll(ch => ch != c)).Contains(c) || - SelectorChars().Contains(c) || CustomSelectorChars().Contains(c)) - throw new ArgumentException($"Cannot add '{c}' as a custom operator character. It is disallowed or in use as a selector."); - - if (!OperatorChars().Contains(c) && !CustomOperatorChars().Contains(c)) - _customOperatorChars.Add(c); - } + var chars = new CharSet {IsAllowList = true}; + chars.AddRange(StandardAllowlist.AsSpan()); + // Add characters used as custom selector chars to the allowlist + chars.AddRange(_customSelectorChars); + return chars; } /// @@ -99,8 +107,7 @@ public void AddCustomOperatorChars(IList characters) /// string.Format(@"\t") will return the 2 characters "\" and "t" /// public bool ConvertCharacterStringLiterals { get; set; } = true; - - + /// /// Experimental. /// Gets or sets, whether the input format should be interpreted as HTML. 
@@ -126,68 +133,211 @@ public void AddCustomOperatorChars(IList characters) /// The character which separates the formatter name (if any exists) from other parts of the placeholder. /// E.g.: {Variable:FormatterName:argument} or {Variable:FormatterName} /// - internal char FormatterNameSeparator { get; } = ':'; - - /// - /// The standard operator characters. - /// Contiguous operator characters are parsed as one operator (e.g. '?.'). - /// - internal List OperatorChars() => new() - {SelectorOperator, NullableOperator, AlignmentOperator, ListIndexBeginChar, ListIndexEndChar}; + internal const char FormatterNameSeparator = ':'; /// /// The character which separates the selector for alignment. E.g.: Smart.Format("Name: {name,10}") /// - internal char AlignmentOperator { get; } = ','; + internal const char AlignmentOperator = ','; /// /// The character which separates two or more selectors E.g.: "First.Second.Third" /// - internal char SelectorOperator { get; } = '.'; + internal const char SelectorOperator = '.'; /// /// The character which flags the selector as . /// The character after must be the . /// E.g.: "First?.Second" /// - internal char NullableOperator { get; } = '?'; + internal const char NullableOperator = '?'; /// /// Gets the character indicating the start of a . /// - internal char PlaceholderBeginChar { get; } = '{'; + internal const char PlaceholderBeginChar = '{'; /// /// Gets the character indicating the end of a . /// - internal char PlaceholderEndChar { get; } = '}'; + internal const char PlaceholderEndChar = '}'; /// - /// Gets the character indicating the begin of formatter options. + /// Gets the character indicating the beginning of formatter options. /// - internal char FormatterOptionsBeginChar { get; } = '('; + internal const char FormatterOptionsBeginChar = '('; /// /// Gets the character indicating the end of formatter options. 
/// - internal char FormatterOptionsEndChar { get; } = ')'; + internal const char FormatterOptionsEndChar = ')'; /// - /// Gets the character indicating the begin of a list index, like in "{Numbers[0]}" + /// Gets the character indicating the beginning of a list index, like in '{Numbers[0]}' /// - internal char ListIndexBeginChar { get; } = '['; + internal const char ListIndexBeginChar = '['; /// - /// Gets the character indicating the end of a list index, like in "{Numbers[0]}" + /// Gets the character indicating the end of a list index, like in '{Numbers[0]}' /// - internal char ListIndexEndChar { get; } = ']'; + internal const char ListIndexEndChar = ']'; /// /// Characters which terminate parsing of format options. /// To use them as options, they must be escaped (preceded) by the . /// - internal List FormatOptionsTerminatorChars() => new() { + internal static readonly char[] FormatOptionsTerminatorChars = + [ FormatterNameSeparator, FormatterOptionsBeginChar, FormatterOptionsEndChar, PlaceholderBeginChar, PlaceholderEndChar - }; + ]; + + /// + /// The standard operator characters. + /// Contiguous operator characters are parsed as one operator (e.g. '?.'). + /// + internal static readonly char[] OperatorChars = + [ + SelectorOperator, NullableOperator, AlignmentOperator, ListIndexBeginChar, ListIndexEndChar + ]; + + /// + /// The list of characters which are delimiting a selector. + /// + internal static readonly char[] SelectorDelimitingChars = + [ + FormatterNameSeparator, + PlaceholderBeginChar, PlaceholderEndChar, + FormatterOptionsBeginChar, FormatterOptionsEndChar + ]; + + /// + /// All 68 non-visual Unicode characters that are typically not used in selectors. 
+ /// + internal static readonly char[] NonVisualUnicodeCharacters = + [ + // Control Characters (U+0000–U+001F, U+007F) + '\u0000', // NULL – string terminator + '\u0001', // START OF HEADING – protocol control + '\u0002', // START OF TEXT – protocol control + '\u0003', // END OF TEXT – protocol control + '\u0004', // END OF TRANSMISSION – protocol control + '\u0005', // ENQUIRY – request for response + '\u0006', // ACKNOWLEDGE – positive response + '\u0007', // BELL – triggers alert + '\u0008', // BACKSPACE – moves cursor back + '\u0009', // CHARACTER TABULATION – horizontal tab + '\u000A', // LINE FEED – line break + '\u000B', // LINE TABULATION – vertical tab + '\u000C', // FORM FEED – page break + '\u000D', // CARRIAGE RETURN – return to line start + '\u000E', // SHIFT OUT – alternate character set + '\u000F', // SHIFT IN – return to standard set + '\u0010', // DATA LINK ESCAPE – protocol framing + '\u0011', // DEVICE CONTROL 1 – device-specific + '\u0012', // DEVICE CONTROL 2 – device-specific + '\u0013', // DEVICE CONTROL 3 – device-specific + '\u0014', // DEVICE CONTROL 4 – device-specific + '\u0015', // NEGATIVE ACKNOWLEDGE – error signal + '\u0016', // SYNCHRONOUS IDLE – timing control + '\u0017', // END OF TRANSMISSION BLOCK – block end + '\u0018', // CANCEL – cancel transmission + '\u0019', // END OF MEDIUM – physical medium end + '\u001A', // SUBSTITUTE – invalid character + '\u001B', // ESCAPE – escape sequence initiator + '\u001C', // FILE SEPARATOR – data structuring + '\u001D', // GROUP SEPARATOR – data structuring + '\u001E', // RECORD SEPARATOR – data structuring + '\u001F', // UNIT SEPARATOR – data structuring + '\u007F', // DELETE – erase character + + // Format Characters (Category: Cf) + '\u200B', // ZERO WIDTH SPACE – invisible space + '\u200C', // ZERO WIDTH NON-JOINER – prevents ligature + '\u200D', // ZERO WIDTH JOINER – forces ligature + '\u2060', // WORD JOINER – prevents line break + '\uFEFF', // ZERO WIDTH NO-BREAK SPACE – BOM or NBSP 
+ + // Directional Formatting (Category: Cf) + '\u202A', // LEFT-TO-RIGHT EMBEDDING – sets LTR context + '\u202B', // RIGHT-TO-LEFT EMBEDDING – sets RTL context + '\u202C', // POP DIRECTIONAL FORMATTING – ends override + '\u202D', // LEFT-TO-RIGHT OVERRIDE – forces LTR rendering + '\u202E', // RIGHT-TO-LEFT OVERRIDE – forces RTL rendering + '\u2066', // LEFT-TO-RIGHT ISOLATE – isolates LTR segment + '\u2067', // RIGHT-TO-LEFT ISOLATE – isolates RTL segment + '\u2068', // FIRST STRONG ISOLATE – isolates with inferred direction + '\u2069', // POP DIRECTIONAL ISOLATE – ends isolate + + // Invisible Separator + '\u2063', // INVISIBLE SEPARATOR – semantic boundary marker + + // Common Combining Marks (Category: Mn) + '\u0300', // COMBINING GRAVE ACCENT – diacritic (invisible alone) + '\u0301', // COMBINING ACUTE ACCENT – diacritic (invisible alone) + '\u0302', // COMBINING CIRCUMFLEX ACCENT – diacritic (invisible alone) + '\u0308', // COMBINING DIAERESIS – diacritic (invisible alone) + + // Whitespace Characters (non-glyph spacing) + '\u00A0', // NO-BREAK SPACE – non-breaking space + '\u1680', // OGHAM SPACE MARK – special spacing + '\u2000', // EN QUAD – fixed-width space + '\u2001', // EM QUAD – fixed-width space + '\u2002', // EN SPACE – fixed-width space + '\u2003', // EM SPACE – fixed-width space + '\u2004', // THREE-PER-EM SPACE – narrow space + '\u2005', // FOUR-PER-EM SPACE – narrow space + '\u2006', // SIX-PER-EM SPACE – narrow space + '\u2007', // FIGURE SPACE – aligns digits + '\u2008', // PUNCTUATION SPACE – aligns punctuation + '\u2009', // THIN SPACE – narrow space + '\u200A', // HAIR SPACE – ultra-thin space + '\u202F', // NARROW NO-BREAK SPACE – narrow NBSP + '\u205F', // MEDIUM MATHEMATICAL SPACE – math spacing + '\u3000' // IDEOGRAPHIC SPACE – full-width CJK space + ]; + + /// + /// Add a list of allowable selector characters on top of the default selector characters. + /// + /// When is (default), an allowlist of selector characters is used. 
+ /// The allowlist contains alphanumeric characters (upper and lower case), plus '_' and '-'. + /// On top, any custom selector characters added with are included. + /// + /// When is , all Unicode characters are allowed in a selector, + /// except 68 non-visual characters. Excluded characters can be added back using . + /// + /// Operator chars and selector chars must be different. + /// + public void AddCustomSelectorChars(IList characters) + { + foreach (var c in characters) + { + // Explicitly disallow certain characters + if (SelectorDelimitingChars.Contains(c) || c == CharLiteralEscapeChar + || OperatorChars.Contains(c) || CustomOperatorChars.Contains(c)) + throw new ArgumentException($"Cannot add '{c}' as a custom selector character. It is disallowed or in use as an operator character."); + + if (NonVisualUnicodeCharacters.Contains(c)) + _customSelectorChars.Add(c); + + if (SelectorCharFilter == SelectorFilterType.Alphanumeric && !(StandardAllowlist.Contains(c) || _customSelectorChars.Contains(c))) _customSelectorChars.Add(c); + } + } + + /// + /// Add a list of allowable operator characters on top of the standard setting. + /// Operator chars and selector chars must be different. + /// + public void AddCustomOperatorChars(IList characters) + { + foreach (var c in characters) + { + if (SelectorDelimitingChars.Contains(c) || CustomSelectorChars.Contains(c)) + throw new ArgumentException($"Cannot add '{c}' as a custom operator character. It is disallowed or in use as a selector."); + + if (!OperatorChars.Contains(c) && !_customOperatorChars.Contains(c)) + _customOperatorChars.Add(c); + } + } } diff --git a/src/SmartFormat/Core/Settings/SelectorFilterType.cs b/src/SmartFormat/Core/Settings/SelectorFilterType.cs new file mode 100644 index 00000000..b975b987 --- /dev/null +++ b/src/SmartFormat/Core/Settings/SelectorFilterType.cs @@ -0,0 +1,27 @@ +// +// Copyright SmartFormat Project maintainers and contributors. +// Licensed under the MIT license. 
+ +namespace SmartFormat.Core.Settings; + +/// +/// Determines the filter type for allowed or disallowed characters. +/// +public enum SelectorFilterType +{ + /// + /// Use a list of characters that are allowed. The default characters are
+ /// alphanumeric characters (upper and lower case), plus '_' and '-'.
+ ///
+ Alphanumeric, + + /// + /// All Unicode characters are allowed in a selector, except 68 non-visual characters: + /// Control Characters (U+0000–U+001F, U+007F), Format Characters (Category: Cf), + /// Directional Formatting (Category: Cf), Invisible Separator, Common Combining Marks (Category: Mn), + /// Whitespace Characters (non-glyph spacing). + /// + /// {}[]()\.?,: are characters with special functions that are never allowed. + /// + VisualUnicodeChars +} diff --git a/src/SmartFormat/Core/Settings/SmartSettings.cs b/src/SmartFormat/Core/Settings/SmartSettings.cs index 5c035f56..4acf548c 100644 --- a/src/SmartFormat/Core/Settings/SmartSettings.cs +++ b/src/SmartFormat/Core/Settings/SmartSettings.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; namespace SmartFormat.Core.Settings; @@ -45,6 +46,7 @@ public SmartSettings() /// The default is . ///
[Obsolete("Use 'SmartSettings.Formatter.ErrorAction' instead.", true)] + [ExcludeFromCodeCoverage] public ErrorAction FormatErrorAction { get => (ErrorAction) Formatter.ErrorAction; @@ -56,6 +58,7 @@ public ErrorAction FormatErrorAction /// The default is . /// [Obsolete("Use 'SmartSettings.Parser.ErrorAction' instead.", true)] + [ExcludeFromCodeCoverage] public ErrorAction ParseErrorAction { get => (ErrorAction) Parser.ErrorAction; @@ -76,6 +79,7 @@ public ErrorAction ParseErrorAction /// string.Format(@"\t") will return the 2 characters "\" and "t" /// [Obsolete("Use SmartSettings.Parser.ConvertCharacterStringLiterals instead", true)] + [ExcludeFromCodeCoverage] public bool ConvertCharacterStringLiterals { get => Parser.ConvertCharacterStringLiterals; @@ -126,4 +130,4 @@ public StringComparison GetCaseSensitivityComparison() /// These settings must be defined before any class calling the object pools is instantiated. They cannot be changed later. /// public PoolSettings Pooling { get; set; } -} \ No newline at end of file +} diff --git a/src/SmartFormat/Evaluator.cs b/src/SmartFormat/Evaluator.cs index d2e85a5c..2418609a 100644 --- a/src/SmartFormat/Evaluator.cs +++ b/src/SmartFormat/Evaluator.cs @@ -204,7 +204,7 @@ private void HandleNestedScope(FormattingInfo formattingInfo, Selector selector, /// /// Skip empty selectors and alignment-only selectors. /// - private bool SkipThisSelector(Selector selector) + private static bool SkipThisSelector(Selector selector) { // Don't evaluate empty selectors // (used e.g. 
for Settings.Parser.NullableOperator and Settings.Parser.ListIndexEndChar final operators) @@ -212,7 +212,7 @@ private bool SkipThisSelector(Selector selector) // Do not evaluate alignment-only selectors if (selector.Operator.Length > 0 && - selector.Operator[0] == _settings.Parser.AlignmentOperator) return true; + selector.Operator[0] == ParserSettings.AlignmentOperator) return true; return false; } diff --git a/src/SmartFormat/Extensions/ListFormatter.cs b/src/SmartFormat/Extensions/ListFormatter.cs index f3fc6323..13689d88 100644 --- a/src/SmartFormat/Extensions/ListFormatter.cs +++ b/src/SmartFormat/Extensions/ListFormatter.cs @@ -300,14 +300,14 @@ private static void WriteSpacer(FormattingInfo formattingInfo, Format spacer, ob /// /// The nullable operator '?' can be followed by a dot (like '?.') or a square brace (like '?[') /// - private bool HasNullableOperator(IFormattingInfo formattingInfo) + private static bool HasNullableOperator(IFormattingInfo formattingInfo) { if (formattingInfo.Placeholder != null) { #pragma warning disable S3267 // Don't use LINQ in favor of less GC foreach (var s in formattingInfo.Placeholder.Selectors) { - if (s.OperatorLength > 0 && s.BaseString[s.OperatorStartIndex] == _smartSettings.Parser.NullableOperator) + if (s.OperatorLength > 0 && s.BaseString[s.OperatorStartIndex] == ParserSettings.NullableOperator) return true; } #pragma warning restore S3267 // Restore: Loops should be simplified with "LINQ" expressions