Skip to content

Commit

Permalink
Merge a8490ec into 4a52dd9
Browse files Browse the repository at this point in the history
  • Loading branch information
mingwho committed Jun 6, 2019
2 parents 4a52dd9 + a8490ec commit 8db4194
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 25 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -2,3 +2,4 @@ vendor
builds/*
!builds/.gitkeep
.idea/
.vscode/
1 change: 1 addition & 0 deletions examples/unicode.abs
@@ -0,0 +1 @@
echo("Hello, 世界")
51 changes: 26 additions & 25 deletions lexer/lexer.go
Expand Up @@ -2,21 +2,22 @@ package lexer

import (
"strings"
"unicode"

"github.com/abs-lang/abs/token"
)

type Lexer struct {
input string
position int // current position in input (points to current char)
readPosition int // current reading position in input (after current char)
ch byte // current char under examination
ch rune // current rune under examination
input []rune
// map of input line boundaries used by linePosition() for error location
lineMap [][2]int // array of [begin, end] pairs: [[0,12], [13,22], [23,33] ... ]
}

func New(input string) *Lexer {
l := &Lexer{input: input}
func New(in string) *Lexer {
l := &Lexer{input: []rune(in)}
// map the input line boundaries for CurrentLine()
l.buildLineMap()
// read the first char
Expand Down Expand Up @@ -65,7 +66,7 @@ func (l *Lexer) ErrorLine(pos int) (int, int, string) {
lineNum, begin, end := l.linePosition(pos)
errorLine := l.input[begin:end]
column := pos - begin + 1
return lineNum, column, errorLine
return lineNum, column, string(errorLine)
}

func (l *Lexer) newToken(tokenType token.TokenType) token.Token {
Expand Down Expand Up @@ -302,6 +303,7 @@ func (l *Lexer) skipWhitespace() {
}
}

// This function will read a rune
func (l *Lexer) readChar() {
if l.readPosition >= len(l.input) {
l.ch = 0
Expand All @@ -322,15 +324,14 @@ func (l *Lexer) Rewind(pos int) {
}
}

func (l *Lexer) peekChar() byte {
func (l *Lexer) peekChar() rune {
if l.readPosition >= len(l.input) {
return 0
} else {
return l.input[l.readPosition]
}
return l.input[l.readPosition]
}

func (l *Lexer) prevChar(steps int) byte {
func (l *Lexer) prevChar(steps int) rune {
prevPosition := l.readPosition - steps
if prevPosition < 1 {
return 0
Expand All @@ -343,7 +344,7 @@ func (l *Lexer) readIdentifier() string {
for isLetter(l.ch) {
l.readChar()
}
return l.input[position:l.position]
return string(l.input[position:l.position])
}

// 12
Expand All @@ -363,7 +364,7 @@ func (l *Lexer) readNumber() (number string, kind token.TokenType) {
// in this number, it means we're at the end of the
// number and we're at an addition / subtraction.
if (l.ch == '+' || l.ch == '-') && !hasExponent {
return l.input[position:l.position], kind
return string(l.input[position:l.position]), kind
}

// If the number contains as 'e',
Expand All @@ -375,13 +376,13 @@ func (l *Lexer) readNumber() (number string, kind token.TokenType) {
// If we have a dot, let's check whether this is a range
// or maybe a method call (122.string())
if l.ch == '.' && (l.peekChar() == '.' || !isDigit(l.peekChar())) {
return l.input[position:l.position], kind
return string(l.input[position:l.position]), kind
}

if l.ch == '.' {
// If we have 2 dots in a number, there's a problem
if hasDot {
return l.input[position : l.position+1], token.ILLEGAL
return string(l.input[position : l.position+1]), token.ILLEGAL
}

hasDot = true
Expand All @@ -392,18 +393,18 @@ func (l *Lexer) readNumber() (number string, kind token.TokenType) {
// If the number ends with the exponent,
// there's a problem.
if l.input[l.position-1] == 'e' {
return l.input[position:l.position], token.ILLEGAL
return string(l.input[position:l.position]), token.ILLEGAL
}

return strings.ReplaceAll(l.input[position:l.position], "_", ""), kind
return strings.ReplaceAll(string(l.input[position:l.position]), "_", ""), kind
}

// A logical operator is 2 chars, so
// we can simply read 2 chars and call
// it a day.
func (l *Lexer) readLogicalOperator() string {
l.readChar()
return l.input[l.position-1 : l.position+1]
return string(l.input[l.position-1 : l.position+1])
}

// Reads a strings from the text.
Expand All @@ -413,7 +414,7 @@ func (l *Lexer) readLogicalOperator() string {
// character itself ("\\").
func (l *Lexer) readString(quote byte) string {
var chars []string
esc := byte('\\')
esc := rune('\\')
doubleEscape := false
for {
l.readChar()
Expand All @@ -422,7 +423,7 @@ func (l *Lexer) readString(quote byte) string {
chars = append(chars, string(esc))
l.readChar()
// be careful here, there may be double escaped LFs in the string
if l.peekChar() == quote {
if l.peekChar() == rune(quote) {
doubleEscape = true
} else {
// this is not a double escaped quote
Expand All @@ -433,7 +434,7 @@ func (l *Lexer) readString(quote byte) string {
// If we encounter an escape, let's check whether
// we're trying to escape a quote. If so, let's skip
// the escape and add the quote to the string.
if l.ch == esc && l.peekChar() == quote {
if l.ch == esc && l.peekChar() == rune(quote) {
chars = append(chars, string(quote))
l.readChar()
continue
Expand All @@ -459,7 +460,7 @@ func (l *Lexer) readString(quote byte) string {
// The string ends when we encounter a quote
// and the character before that was not an escape,
// or the escape was escaped as well ("string\\").
if (l.ch == quote && (l.prevChar(2) != esc || doubleEscape)) || l.ch == 0 {
if (l.ch == rune(quote) && (l.prevChar(2) != esc || doubleEscape)) || l.ch == 0 {
break
}
chars = append(chars, string(l.ch))
Expand All @@ -479,7 +480,7 @@ func (l *Lexer) readLine() string {
break
}
}
return l.input[position:l.position]
return string(l.input[position:l.position])
}

// We want to extract the actual command
Expand Down Expand Up @@ -514,13 +515,13 @@ func (l *Lexer) readCommand() string {
l.readPosition = l.position
}

return ret
return string(ret)
}

func isLetter(ch byte) bool {
return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
func isLetter(ch rune) bool {
return unicode.IsLetter(ch) || ch == '_'
}

func isDigit(ch byte) bool {
func isDigit(ch rune) bool {
return '0' <= ch && ch <= '9'
}
31 changes: 31 additions & 0 deletions lexer/lexer_test.go
Expand Up @@ -85,6 +85,8 @@ $111
10_000
10_00.00
1_2e1
小明
`

tests := []struct {
Expand Down Expand Up @@ -304,6 +306,8 @@ $111
{token.NUMBER, "10000"},
{token.NUMBER, "1000.00"},
{token.NUMBER, "12e1"},
{token.IDENT, "小明"},
{token.ILLEGAL, "❤"},
{token.EOF, ""},
}

Expand Down Expand Up @@ -440,3 +444,30 @@ func TestCurrentPosition(t *testing.T) {
t.Fatalf("wrong position. expected=%d, got=%d", 6, l.CurrentPosition())
}
}

func TestUnicode(t *testing.T) {
input := `世界 = "⺐ ❤ 😄"`
l := New(input)

tests := []struct {
expectedType token.TokenType
expectedLiteral string
}{
{token.IDENT, "世界"},
{token.ASSIGN, "="},
{token.STRING, "⺐ ❤ 😄"},
{token.EOF, ""},
}

for i, tt := range tests {
tok := l.NextToken()

if tok.Type != tt.expectedType {
t.Fatalf("tests[%d] - tokentype wrong. expected=%q, got=%q", i, tt.expectedType, tok.Type)
}

if tok.Literal != tt.expectedLiteral {
t.Fatalf("tests[%d] - literal wrong. expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
}
}
}

0 comments on commit 8db4194

Please sign in to comment.