pkg/schemadsl/lexer/lex_def.go

//go:generate go run golang.org/x/tools/cmd/stringer -type=TokenType

package lexer

import (
	"unicode"

	"github.com/authzed/spicedb/pkg/schemadsl/input"
)

// Lex returns a new Lexer over the given input text. Both arguments are
// forwarded unchanged to createLexer; source presumably identifies where the
// text came from (confirm against createLexer).
func Lex(source input.Source, input string) *Lexer {
	// Note: inside this body the input parameter shadows the imported
	// input package.
	lexer := createLexer(source, input)
	return lexer
}

// TokenType identifies the kind of a lexeme produced by the lexer.
//
// Values are assigned by iota, so the position of each constant in the block
// below fixes its numeric value. The go:generate directive at the top of this
// file runs stringer over this type; regenerate after adding or reordering
// constants.
type TokenType int

const (
	TokenTypeError TokenType = iota // error occurred; value is text of error

	// TokenTypeSyntheticSemicolon is emitted in place of a newline token when
	// the previous non-ignored token's kind is listed in syntheticPredecessors.
	TokenTypeSyntheticSemicolon

	// End of input, whitespace, comments and plain newlines.
	TokenTypeEOF
	TokenTypeWhitespace
	TokenTypeSinglelineComment
	TokenTypeMultilineComment
	TokenTypeNewline

	// Words. NOTE(review): none of the lexing functions in this file emit
	// TokenTypeNumber; digits satisfy isAlphaNumeric and are therefore lexed
	// as part of an identifier or keyword — confirm whether that is intended.
	TokenTypeKeyword    // definition
	TokenTypeIdentifier // helloworld
	TokenTypeNumber     // 123

	// Grouping.
	TokenTypeLeftBrace  // {
	TokenTypeRightBrace // }
	TokenTypeLeftParen  // (
	TokenTypeRightParen // )

	// Single-character operators.
	TokenTypePipe  // |
	TokenTypePlus  // +
	TokenTypeMinus // -
	TokenTypeAnd   // &
	TokenTypeDiv   // /

	// Punctuation.
	TokenTypeEquals     // =
	TokenTypeColon      // :
	TokenTypeSemicolon  // ;
	TokenTypeRightArrow // ->
	TokenTypeHash       // #
	TokenTypeEllipsis   // ...
	TokenTypeStar       // *

	// Additional tokens for CEL: https://github.com/google/cel-spec/blob/master/doc/langdef.md#syntax
	TokenTypeQuestionMark       // ?
	TokenTypeConditionalOr      // ||
	TokenTypeConditionalAnd     // &&
	TokenTypeExclamationPoint   // !
	TokenTypeLeftBracket        // [
	TokenTypeRightBracket       // ]
	TokenTypePeriod             // .
	TokenTypeComma              // ,
	TokenTypePercent            // %
	TokenTypeLessThan           // <
	TokenTypeGreaterThan        // >
	TokenTypeLessThanOrEqual    // <=
	TokenTypeGreaterThanOrEqual // >=
	TokenTypeEqualEqual         // ==
	TokenTypeNotEqual           // !=
	TokenTypeString             // "...", '...', """...""", '''...'''
)

// keywords is the set of reserved words. A run of identifier characters whose
// text exactly matches one of these keys is emitted as TokenTypeKeyword rather
// than TokenTypeIdentifier.
var keywords = map[string]struct{}{
	"caveat":     {},
	"definition": {},
	"nil":        {},
	"permission": {},
	"relation":   {},
	"with":       {},
}

// IsKeyword reports whether candidate is a reserved keyword. The comparison is
// an exact, case-sensitive match against the keyword set.
func IsKeyword(candidate string) bool {
	_, reserved := keywords[candidate]
	return reserved
}

// syntheticPredecessors lists the token types that can end a statement. When a
// newline is encountered and the previous non-ignored token has one of these
// kinds, the lexer emits a synthetic semicolon instead of a newline token.
var syntheticPredecessors = map[TokenType]bool{
	// Words.
	TokenTypeKeyword:    true,
	TokenTypeIdentifier: true,

	// Closing delimiters.
	TokenTypeRightParen: true,
	TokenTypeRightBrace: true,

	// Wildcard.
	TokenTypeStar: true,
}

// lexerEntrypoint is the top-level lexing state. It reads runes one at a time
// and emits punctuation, operator, whitespace and newline tokens directly.
// When it sees the start of an identifier, string literal or comment it backs
// up over the rune just read and returns the state function dedicated to that
// construct. On EOFRUNE it emits TokenTypeEOF and returns nil; on a rune it
// does not recognize it returns the result of l.errorf.
func lexerEntrypoint(l *Lexer) stateFn {
	for {
		r := l.next()
		if r == EOFRUNE {
			l.emit(TokenTypeEOF)
			return nil
		}

		switch r {
		// Grouping and single-character punctuation.
		case '{':
			l.emit(TokenTypeLeftBrace)
		case '}':
			l.emit(TokenTypeRightBrace)
		case '(':
			l.emit(TokenTypeLeftParen)
		case ')':
			l.emit(TokenTypeRightParen)
		case '[':
			l.emit(TokenTypeLeftBracket)
		case ']':
			l.emit(TokenTypeRightBracket)
		case '+':
			l.emit(TokenTypePlus)
		case '?':
			l.emit(TokenTypeQuestionMark)
		case '%':
			l.emit(TokenTypePercent)
		case ',':
			l.emit(TokenTypeComma)
		case ':':
			l.emit(TokenTypeColon)
		case ';':
			l.emit(TokenTypeSemicolon)
		case '#':
			l.emit(TokenTypeHash)
		case '*':
			l.emit(TokenTypeStar)

		// Operators that may be extended by a following character: start with
		// the short form and upgrade it when the longer form is present.
		case '|':
			kind := TokenTypePipe
			if l.acceptString("|") {
				kind = TokenTypeConditionalOr
			}
			l.emit(kind)
		case '&':
			kind := TokenTypeAnd
			if l.acceptString("&") {
				kind = TokenTypeConditionalAnd
			}
			l.emit(kind)
		case '!':
			kind := TokenTypeExclamationPoint
			if l.acceptString("=") {
				kind = TokenTypeNotEqual
			}
			l.emit(kind)
		case '<':
			kind := TokenTypeLessThan
			if l.acceptString("=") {
				kind = TokenTypeLessThanOrEqual
			}
			l.emit(kind)
		case '>':
			kind := TokenTypeGreaterThan
			if l.acceptString("=") {
				kind = TokenTypeGreaterThanOrEqual
			}
			l.emit(kind)
		case '=':
			kind := TokenTypeEquals
			if l.acceptString("=") {
				kind = TokenTypeEqualEqual
			}
			l.emit(kind)
		case '.':
			// One period has been read; two more make an ellipsis.
			kind := TokenTypePeriod
			if l.acceptString("..") {
				kind = TokenTypeEllipsis
			}
			l.emit(kind)
		case '-':
			kind := TokenTypeMinus
			if l.accept(">") {
				kind = TokenTypeRightArrow
			}
			l.emit(kind)

		// Constructs handled by their own state functions. The opening rune is
		// un-read first so that the next state sees the construct from its
		// first character.
		case '\'', '"':
			l.backup()
			return lexStringLiteral
		case '/':
			switch {
			case l.peekValue("/"):
				l.backup()
				return lexSinglelineComment
			case l.peekValue("*"):
				l.backup()
				return lexMultilineComment
			}

			// A lone slash is the division operator.
			l.emit(TokenTypeDiv)

		default:
			switch {
			case isSpace(r):
				l.emit(TokenTypeWhitespace)

			case isNewline(r):
				// A newline directly after a statement-ending token is
				// reported as a synthetic semicolon instead.
				kind := TokenTypeNewline
				if _, ok := syntheticPredecessors[l.lastNonIgnoredToken.Kind]; ok {
					kind = TokenTypeSyntheticSemicolon
				}
				l.emit(kind)

			case isAlphaNumeric(r):
				l.backup()
				return lexIdentifierOrKeyword

			default:
				return l.errorf(r, "unrecognized character at this location: %#U", r)
			}
		}
	}
}

// lexStringLiteral scans a quoted string literal and emits it, including its
// quotes, as a single TokenTypeString token.
//
// Four quoting forms are recognized: "...", '...', """...""" and '''...'''.
// A literal is closed only by the same delimiter that opened it. The
// triple-quoted forms may span multiple lines; in the single-quoted forms a
// newline before the closing quote is an error. Reaching EOFRUNE before the
// closing delimiter is an error in every form.
//
// On success the function returns lexSource; on error it returns the result
// of l.errorf.
//
// NOTE(review): backslash escapes are not interpreted here, so a delimiter
// preceded by a backslash still terminates the literal.
func lexStringLiteral(l *Lexer) stateFn {
	allowNewlines := false
	terminator := ""

	// The triple-quoted forms must be tried before the single-quoted ones,
	// otherwise the first quote of `"""` would be taken as a complete opener.
	switch {
	case l.acceptString(`"""`):
		terminator = `"""`
		allowNewlines = true
	case l.acceptString(`'''`):
		// The closing delimiter must match the opening one.
		terminator = `'''`
		allowNewlines = true
	case l.acceptString(`"`):
		terminator = `"`
	case l.acceptString(`'`):
		terminator = `'`
	}

	for {
		if l.peekValue(terminator) {
			l.acceptString(terminator)
			l.emit(TokenTypeString)
			return lexSource
		}

		// Otherwise, consume one rune and keep looking for the terminator.
		r := l.next()
		if !allowNewlines && isNewline(r) {
			return l.errorf(r, "Unterminated string")
		}

		if r == EOFRUNE {
			return l.errorf(r, "Unterminated string")
		}
	}
}

// lexSinglelineComment consumes the leading "//" and then hands off to
// buildLexUntil with TokenTypeSinglelineComment and a predicate that reports
// true for every rune except EOFRUNE and newline characters. Presumably
// buildLexUntil keeps consuming while the predicate holds, so the comment runs
// to the end of the line — confirm against buildLexUntil.
func lexSinglelineComment(l *Lexer) stateFn {
	l.acceptString("//")

	return buildLexUntil(TokenTypeSinglelineComment, func(r rune) (bool, error) {
		if r == EOFRUNE || isNewline(r) {
			return false, nil
		}
		return true, nil
	})
}

// lexMultilineComment consumes a /* ... */ comment and emits it, delimiters
// included, as a TokenTypeMultilineComment token before returning lexSource.
// If EOFRUNE is reached before the closing "*/", it returns the result of
// l.errorf instead.
func lexMultilineComment(l *Lexer) stateFn {
	l.acceptString("/*")

	// Advance one rune at a time until the closing delimiter is next.
	for !l.peekValue("*/") {
		if r := l.next(); r == EOFRUNE {
			return l.errorf(r, "Unterminated multiline comment")
		}
	}

	l.acceptString("*/")
	l.emit(TokenTypeMultilineComment)
	return lexSource
}

// lexIdentifierOrKeyword consumes a maximal run of identifier characters (as
// defined by isAlphaNumeric) and emits it as TokenTypeKeyword when the text is
// in the keywords set, or as TokenTypeIdentifier otherwise. It then returns
// lexSource.
func lexIdentifierOrKeyword(l *Lexer) stateFn {
	for isAlphaNumeric(l.peek()) {
		l.next()
	}

	// Default to a plain identifier; reserved words override it.
	kind := TokenTypeIdentifier
	if _, reserved := keywords[l.value()]; reserved {
		kind = TokenTypeKeyword
	}
	l.emit(kind)

	return lexSource
}

// isSpace reports whether r is horizontal whitespace: a space or a tab.
// Newline characters are not included; see isNewline.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	default:
		return false
	}
}

// isNewline reports whether r is a line-break character: a carriage return or
// a line feed.
func isNewline(r rune) bool {
	switch r {
	case '\n', '\r':
		return true
	default:
		return false
	}
}

// isAlphaNumeric reports whether r can appear in an identifier: an underscore,
// or any rune that unicode.IsLetter or unicode.IsDigit accepts.
func isAlphaNumeric(r rune) bool {
	if r == '_' {
		return true
	}
	return unicode.IsLetter(r) || unicode.IsDigit(r)
}