Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge pull request #44 from carlor/fix-lexing

Fix lexing
  • Loading branch information...
commit db6e6c9829a67ca9e01862caab9d9c74d234bcb6 2 parents 481eab2 + aab6a90
Bernard Helyer bhelyer authored
3  .gitignore
View
@@ -56,3 +56,6 @@ build.rf
# As there is currently no good way to provide one in the repository
/bin/sdc.conf
+# For convenience
+test.sh
+
2  README.markdown
View
@@ -16,7 +16,7 @@ This list is incomplete. SDC is in a state of flux, and this is likely to be out
Lexer
-----
-* Scan and handle multiple encoding formats. __[yes.]__ -- _in so far all code is treated as UTF-8 and its BOM is eaten; other BOMs are rejected._
+* Scan and handle multiple encoding formats. __[yes.]__
* Handle leading script lines. __[yes.]__
* Split source into tokens. __[yes.]__
* Replace special tokens. __[yes.]__
3  src/sdc/lexer.d
View
@@ -47,7 +47,8 @@ TokenStream lex(Source source)
if (lexNext(tw))
continue;
- auto s = format("unexpected character: '%s'.", tw.source.peek);
+ auto s = format("unexpected character: '%s' (0x%x).", tw.source.peek,
+ cast(int)tw.source.peek);
throw new CompilerError(tw.source.location, s);
} while (tw.lastAdded.type != TokenType.End);
78 src/sdc/source.d
View
@@ -7,7 +7,9 @@ module sdc.source;
import std.file;
import std.utf;
+import std.range;
import std.string;
+import std.system;
import sdc.compilererror;
import sdc.location;
@@ -41,33 +43,30 @@ class Source
/**
- * Open the given file and validate it as a UTF-8 source.
+ * Open the given file and validate it as a Unicode source.
*
* Side-effects:
* Puts all the other fields into known good states.
*
* Throws:
- * CompilerPanic if source BOM is not valid.
- * UtfException if source is not UTF-8.
+ * UtfException if source is not valid Unicode.
*/
this(string filename)
{
- source = cast(string) std.file.read(filename);
- checkBOM();
- std.utf.validate(source);
- get();
- skipScriptLine();
+ readSource(filename);
location.filename = filename;
location.line = 1;
location.column = 1;
+
+ this(source, location);
}
/**
* Sets the source to string and the current location.
*
* Throws:
- * UtfException if the source is not valid UTF-8.
+ * UtfException if the source is not valid Unicode.
*/
this(string s, Location location)
{
@@ -75,6 +74,7 @@ class Source
std.utf.validate(source);
get();
+ skipScriptLine();
this.location = location;
}
@@ -91,28 +91,17 @@ class Source
this.mNextIndex = src.mNextIndex;
this.mCurrentIndex = src.mCurrentIndex;
}
-
+
/**
- * Validate that the current start of source has a valid UTF-8 BOM.
- *
- * Side-effects:
- * @source advanced to after valid UTF-8 BOM if found.
+ * Reads the source file into a string.
*
* Throws:
- * CompilerPanic if source if BOM is not valid.
+ * UtfException if the source is not valid Unicode.
*/
- void checkBOM()
+ void readSource(string filename)
{
- if (source.length >= 2 && source[0 .. 2] == [0xFE, 0xFF] ||
- source.length >= 2 && source[0 .. 2] == [0xFF, 0xFE] ||
- source.length >= 4 && source[0 .. 4] == [0x00, 0x00, 0xFE, 0xFF] ||
- source.length >= 4 && source[0 .. 4] == [0xFF, 0xFE, 0x00, 0x00]) {
-
- throw new CompilerPanic("only UTF-8 input is supported.");
- }
- if (source.length >= 3 && source[0 .. 3] == [0xEF, 0xBB, 0xBF]) {
- source = source[3 .. $];
- }
+ auto bts = cast(immutable(ubyte)[]) read(filename);
+ source = convertToUTF8(bts);
}
/**
@@ -231,3 +220,40 @@ class Source
this.eof = src.eof;
}
}
+
+/// Detects the encoding of data from its leading byte-order mark (BOM) and
+/// returns the text re-encoded as UTF-8, with the BOM removed.
+///
+/// Recognised BOMs are UTF-8, UTF-32 BE/LE and UTF-16 BE/LE. Data without a
+/// recognised BOM is handed to toUTF8 as-is, i.e. it is treated as text that
+/// is already UTF-8 (plain ASCII being a subset of that).
+///
+/// The UTF-32 LE check must stay ahead of the UTF-16 LE check: the UTF-32 LE
+/// BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE).
+string convertToUTF8(const(ubyte)[] data) {
+ if (data.startsWith([0xEF, 0xBB, 0xBF]))
+ // UTF-8 with BOM: drop the 3 BOM bytes (toUTF8 is for validation purposes)
+ return toUTF8(cast(string) data[3 .. $].idup);
+ else if (data.startsWith([0x00, 0x00, 0xFE, 0xFF]))
+ // UTF-32 BE
+ return convertToUTF8Impl!(dchar, Endian.bigEndian)(data);
+ else if (data.startsWith([0xFF, 0xFE, 0x00, 0x00]))
+ // UTF-32 LE (checked before UTF-16 LE, whose BOM is a prefix of this one)
+ return convertToUTF8Impl!(dchar, Endian.littleEndian)(data);
+ else if (data.startsWith([0xFE, 0xFF]))
+ // UTF-16 BE
+ return convertToUTF8Impl!(wchar, Endian.bigEndian)(data);
+ else if (data.startsWith([0xFF, 0xFE]))
+ // UTF-16 LE
+ return convertToUTF8Impl!(wchar, Endian.littleEndian)(data);
+ else // No recognised BOM: treat the bytes as UTF-8 (ASCII included)
+ return toUTF8(cast(string)data.idup);
+}
+
+/// Decodes BOM-prefixed UTF-16 or UTF-32 data and re-encodes it as UTF-8.
+///
+/// Params:
+///   CType = code unit type of the input: wchar for UTF-16, dchar for UTF-32.
+///   end   = byte order in which the code units are stored in data.
+///   data  = raw bytes; the first code unit is taken to be the BOM and is
+///           dropped, so data must hold at least CType.sizeof bytes.
+///
+/// Returns:
+///   The decoded text as a UTF-8 string.
+///
+/// Throws:
+///   UtfException if the bytes after the BOM are not a whole number of code
+///   units; toUTF8 may also reject code units that are not valid Unicode.
+string convertToUTF8Impl(CType, Endian end)(const(ubyte)[] data) {
+    enum cpsize = CType.sizeof;
+    // The BOM occupies exactly one code unit; skip it.
+    data = data[cpsize .. $];
+    // A trailing partial code unit would make the slice below run past the
+    // end of data, so reject truncated input up front.
+    if (data.length % cpsize != 0) {
+        throw new UtfException("truncated code unit at end of input");
+    }
+    // The output length is known, so allocate once instead of appending.
+    auto res = new CType[data.length / cpsize];
+    foreach (n, ref unit; res) {
+        auto buf = data[n * cpsize .. (n + 1) * cpsize].dup;
+        // Swap the bytes only when the file's byte order differs from the host's.
+        static if (end != endian) {
+            buf = buf.reverse;
+        }
+        unit = *(cast(CType*) buf.ptr);
+    }
+    return toUTF8(res);
+}
+
BIN  tests/test83.d
View
Binary file not shown
Please sign in to comment.
Something went wrong with that request. Please try again.