Skip to content

Commit a2341a4

Browse files
committed
Add Unicode symbol characters to operator charset.
Remove '@' from the operator character set, but add the math, symbol, punctuation, arrow, and line- and box-drawing characters. Also allow operators to contain (but not start with) combining characters--this should be safe, because identifiers can't start with combining characters either, and we don't allow them anywhere else. Swift SVN r5019
1 parent 3725b4f commit a2341a4

File tree

6 files changed

+111
-38
lines changed

6 files changed

+111
-38
lines changed

docs/ABI.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ there's no ambiguity with the run-length.
171171
::
172172

173173
identifier ::= 'X' natural identifier-start-char identifier-char*
174+
identifier ::= 'X' 'o' operator-fixity natural identifier-char*
174175

175176
Identifiers that contain non-ASCII characters are encoded using the Punycode
176177
algorithm specified in RFC 3492, with the modifications that ``_`` is used
@@ -181,6 +182,14 @@ encoded string itself. For example, the identifier ``vergüenza`` is mangled
181182
to ``X12vergenza_JFa``. (The encoding in standard Punycode would be
182183
``vergenza-95a``)
183184

185+
Operators that contain non-ASCII characters are mangled by first mapping the
186+
ASCII operator characters to letters as for pure ASCII operator names, then
187+
Punycode-encoding the substituted string. The mangling then consists of
188+
``Xo`` followed by the fixity, the run length of the encoded string, and the encoded
189+
string itself. For example, the infix operator ``«+»`` is mangled to
190+
``Xoi7p_qcaDc`` (``p_qcaDc`` being the encoding of the substituted
191+
string ``«p»``).
192+
184193
::
185194

186195
substitution ::= 'S' index

include/swift/AST/Identifier.h

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "swift/Basic/LLVM.h"
2121
#include "llvm/ADT/DenseMapInfo.h"
2222
#include "llvm/ADT/StringRef.h"
23+
#include "llvm/Support/ConvertUTF.h"
2324
#include <cstring>
2425

2526
namespace llvm {
@@ -67,15 +68,58 @@ class Identifier {
6768

6869
/// isOperator - Return true if this identifier is an operator, false if it is
6970
/// a normal identifier.
71+
/// FIXME: We should maybe cache this.
7072
bool isOperator() const {
71-
return !empty() && isOperatorChar(Pointer[0]);
73+
if (empty())
74+
return false;
75+
if ((unsigned char)Pointer[0] < 0x80)
76+
return isOperatorStartCodePoint((unsigned char)Pointer[0]);
77+
78+
StringRef data = str();
79+
auto *s = reinterpret_cast<UTF8 const *>(data.begin()),
80+
*end = reinterpret_cast<UTF8 const *>(data.end());
81+
UTF32 codePoint;
82+
ConversionResult res = llvm::convertUTF8Sequence(&s, end, &codePoint,
83+
strictConversion);
84+
assert(res == conversionOK && "invalid UTF-8 in identifier?!");
85+
return !empty() && isOperatorStartCodePoint(codePoint);
7286
}
7387

74-
/// isOperatorChar - Return true if the specified character is a
75-
/// valid part of an operator.
76-
static bool isOperatorChar(char C) {
77-
static const char OpChars[] = "@/=-+*%<>!&|^~.";
78-
return memchr(OpChars, C, sizeof(OpChars) - 1) != 0;
88+
/// isOperatorStartCodePoint - Return true if the specified code point is a
89+
/// valid start of an operator.
90+
static bool isOperatorStartCodePoint(uint32_t C) {
91+
// ASCII operator chars.
92+
static const char OpChars[] = "/=-+*%<>!&|^~.";
93+
if (C < 0x80)
94+
return memchr(OpChars, C, sizeof(OpChars) - 1) != 0;
95+
96+
// Unicode math, symbol, arrow, dingbat, and line/box drawing chars.
97+
return (C >= 0x00A1 && C <= 0x00A7)
98+
|| C == 0x00A9 || C == 0x00AB || C == 0x00AC || C == 0x00AE
99+
|| C == 0x00B0 || C == 0x00B1 || C == 0x00B6 || C == 0x00BB
100+
|| C == 0x00BF || C == 0x00D7 || C == 0x00F7
101+
|| C == 0x2016 || C == 0x2017 || (C >= 0x2020 && C <= 0x2027)
102+
|| (C >= 0x2030 && C <= 0x203E) || (C >= 0x2041 && C <= 0x2053)
103+
|| (C >= 0x2055 && C <= 0x205E) || (C >= 0x2190 && C <= 0x23FF)
104+
|| (C >= 0x2500 && C <= 0x2775) || (C >= 0x2794 && C <= 0x2BFF)
105+
|| (C >= 0x2E00 && C <= 0x2E7F) || (C >= 0x3001 && C <= 0x3003)
106+
|| (C >= 0x3008 && C <= 0x3030);
107+
}
108+
109+
/// isOperatorContinuationCodePoint - Return true if the specified code point
110+
/// is valid in an operator after its first code point.
111+
static bool isOperatorContinuationCodePoint(uint32_t C) {
112+
// '.' is a special case. It can only appear in '..'.
113+
if (C == '.')
114+
return false;
115+
if (isOperatorStartCodePoint(C))
116+
return true;
117+
118+
// Unicode combining characters.
119+
return (C >= 0x0300 && C <= 0x036F)
120+
|| (C >= 0x1DC0 && C <= 0x1DFF)
121+
|| (C >= 0x20D0 && C <= 0x20FF)
122+
|| (C >= 0xFE20 && C <= 0xFE2F);
79123
}
80124

81125
void *getAsOpaquePointer() const { return (void *)Pointer; }

lib/Parse/Lexer.cpp

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ static bool isValidIdentifierStartCodePoint(uint32_t c) {
376376

377377
// N1518: Recommendations for extended identifier characters for C and C++
378378
// Proposed Annex X.2: Ranges of characters disallowed initially
379-
if ((c >= 0x0300 && c <= 0x36F)
379+
if ((c >= 0x0300 && c <= 0x036F)
380380
|| (c >= 0x1DC0 && c <= 0x1DFF)
381381
|| (c >= 0x20D0 && c <= 0x20FF)
382382
|| (c >= 0xFE20 && c <= 0xFE2F))
@@ -385,30 +385,39 @@ static bool isValidIdentifierStartCodePoint(uint32_t c) {
385385
return true;
386386
}
387387

388-
static bool advanceIfValidContinuationOfIdentifier(char const *&ptr,
389-
char const *end) {
388+
static bool advanceIf(char const *&ptr,
389+
char const *end,
390+
bool (*predicate)(uint32_t)) {
390391
char const *next = ptr;
391392
uint32_t c = validateUTF8CharacterAndAdvance(next, end);
392393
if (c == ~0U)
393394
return false;
394-
if (isValidIdentifierContinuationCodePoint(c)) {
395+
if (predicate(c)) {
395396
ptr = next;
396397
return true;
397398
}
398399
return false;
400+
399401
}
400402

401403
static bool advanceIfValidStartOfIdentifier(char const *&ptr,
402404
char const *end) {
403-
char const *next = ptr;
404-
uint32_t c = validateUTF8CharacterAndAdvance(next, end);
405-
if (c == ~0U)
406-
return false;
407-
if (isValidIdentifierStartCodePoint(c)) {
408-
ptr = next;
409-
return true;
410-
}
411-
return false;
405+
return advanceIf(ptr, end, isValidIdentifierStartCodePoint);
406+
}
407+
408+
static bool advanceIfValidContinuationOfIdentifier(char const *&ptr,
409+
char const *end) {
410+
return advanceIf(ptr, end, isValidIdentifierContinuationCodePoint);
411+
}
412+
413+
static bool advanceIfValidStartOfOperator(char const *&ptr,
414+
char const *end) {
415+
return advanceIf(ptr, end, Identifier::isOperatorStartCodePoint);
416+
}
417+
418+
static bool advanceIfValidContinuationOfOperator(char const *&ptr,
419+
char const *end) {
420+
return advanceIf(ptr, end, Identifier::isOperatorContinuationCodePoint);
412421
}
413422

414423
/// isIdentifier - Checks whether a string matches the identifier regex.
@@ -531,8 +540,12 @@ void Lexer::lexOperatorIdentifier() {
531540
return formToken(tok::unknown, TokStart);
532541
}
533542
} else {
534-
while (Identifier::isOperatorChar(*CurPtr) && *CurPtr != '.')
535-
++CurPtr;
543+
CurPtr = TokStart;
544+
bool didStart = advanceIfValidStartOfOperator(CurPtr, BufferEnd);
545+
assert(didStart && "unexpected operator start");
546+
(void) didStart;
547+
548+
while (advanceIfValidContinuationOfOperator(CurPtr, BufferEnd));
536549
}
537550

538551
// Decide between the binary, prefix, and postfix cases.
@@ -1150,6 +1163,9 @@ void Lexer::lexImpl() {
11501163
if (advanceIfValidStartOfIdentifier(tmp, BufferEnd))
11511164
return lexIdentifier();
11521165

1166+
if (advanceIfValidStartOfOperator(tmp, BufferEnd))
1167+
return lexOperatorIdentifier();
1168+
11531169
if (advanceIfValidContinuationOfIdentifier(tmp, BufferEnd)) {
11541170
// If this is a valid identifier continuation, but not a valid identifier
11551171
// start, attempt to recover by eating more continuation characters.

lib/SIL/Mangle.cpp

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ static char mangleOperatorChar(char op) {
5151
case '~': return 't'; // 'tilde'
5252
case '^': return 'x'; // 'xor'
5353
case '.': return 'z'; // 'zperiod' (the z is silent)
54-
default: llvm_unreachable("bad identifier character");
54+
default:
55+
return op;
5556
}
5657
}
5758

@@ -88,21 +89,21 @@ void Mangler::mangleIdentifier(Identifier ident, OperatorFixity fixity) {
8889
StringRef str = ident.str();
8990
assert(!str.empty() && "mangling an empty identifier!");
9091

92+
// If the identifier contains non-ASCII characters, we mangle with an initial
93+
// X and Punycode the identifier string.
94+
llvm::SmallString<32> punycodeBuf;
95+
96+
if (isNonAscii(str)) {
97+
Buffer << 'X';
98+
Punycode::encodePunycode(str, punycodeBuf);
99+
str = punycodeBuf;
100+
}
101+
91102
// Mangle normal identifiers as
92103
// count identifier-char+
93104
// where the count is the number of characters in the identifier,
94105
// and where individual identifier characters represent themselves.
95106
if (!ident.isOperator()) {
96-
// If the identifier contains non-ASCII characters, Punycode it, and mangle
97-
// the encoded string as:
98-
// 'X' count identifier-char+
99-
if (isNonAscii(str)) {
100-
llvm::SmallString<32> encoded;
101-
Punycode::encodePunycode(str, encoded);
102-
Buffer << 'X' << encoded.size() << encoded;
103-
return;
104-
}
105-
106107
Buffer << str.size() << str;
107108
return;
108109
}
@@ -129,9 +130,10 @@ void Mangler::mangleIdentifier(Identifier ident, OperatorFixity fixity) {
129130
break;
130131
}
131132

133+
// Mangle ASCII operators directly.
132134
Buffer << str.size();
133-
for (unsigned i = 0, e = str.size(); i != e; ++i) {
134-
Buffer << mangleOperatorChar(str[i]);
135+
for (char c : str) {
136+
Buffer << mangleOperatorChar(c);
135137
}
136138
}
137139

tools/swift/Completion.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,16 +224,17 @@ static CompletionContext getCompletionContext(DeclContext *dc,
224224
} else if (prefix.back() == '.') {
225225
// Try to figure out a qualified lookup context from the expression
226226
// preceding the dot.
227+
// FIXME: Unicode.
227228
StringRef beforeDot = prefix.slice(0, prefix.size() - 1);
228229
prefix = StringRef();
229230
return getCompletionContextFromDotExpression(TU, dc, beforeDot);
230-
} else if (Identifier::isOperatorChar(prefix.back())) {
231+
} else if (Identifier::isOperatorStartCodePoint(prefix.back())) {
231232
// If the character preceding us looks like an operator character,
232233
// walk backward to find as much of an operator name as we can.
233234
for (char const *p = prefix.end(), *end = p, *begin = prefix.begin();
234235
p != begin;
235236
--p) {
236-
if (!Identifier::isOperatorChar(*(p-1))) {
237+
if (!Identifier::isOperatorStartCodePoint(*(p-1))) {
237238
prefix = StringRef(p, end - p);
238239
return CompletionContext(dc, tuEndLoc);
239240
}

tools/swift/Immediate.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -555,8 +555,9 @@ class REPLInput {
555555
++p;
556556
}
557557
while (isspace(*--p) && p >= s);
558-
if (Identifier::isOperatorChar(*p)) {
559-
while (Identifier::isOperatorChar(*p) && --p >= s);
558+
// FIXME: Unicode operators
559+
if (Identifier::isOperatorStartCodePoint(*p)) {
560+
while (Identifier::isOperatorContinuationCodePoint(*p) && --p >= s);
560561
if (*p == ' ' || *p == '\t')
561562
UnfinishedInfixExpr = true;
562563
}

0 commit comments

Comments
 (0)