ast/file_info.go

// Copyright 2020-2024 Buf Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ast

import (
	"fmt"
	"sort"
	"unicode/utf8"
)

// FileInfo contains information about the contents of a source file, including
// details about comments and items. A lexer accumulates these details as it
// scans the file contents. This allows efficient representation of things like
// source positions.
type FileInfo struct {
	// The name of the source file.
	name string
	// The raw contents of the source file.
	data []byte
	// The offsets for each line in the file. The value is the zero-based byte
	// offset for a given line. The line is given by its index. So the value at
	// index 0 is the offset for the first line (which is always zero). The
	// value at index 1 is the offset at which the second line begins. Etc.
	lines []int
	// The info for every comment in the file. This is empty if the file has no
	// comments. The first entry corresponds to the first comment in the file,
	// and so on.
	comments []commentInfo
	// The info for every lexed item in the file. The last item in the slice
	// corresponds to the EOF, so every file (even an empty one) should have at
	// least one entry. This includes all terminal symbols (tokens) in the AST
	// as well as all comments.
	items []itemSpan
}

type commentInfo struct {
	// the index of the item, in the file's items slice, that represents this
	// comment
	index int
	// the index of the token to which this comment is attributed.
	attributedToIndex int
}

type itemSpan struct {
	// the offset into the file of the first character of an item.
	offset int
	// the length of the item
	length int
}

// NewFileInfo creates a new instance for the given file.
func NewFileInfo(filename string, contents []byte) *FileInfo {
	return &FileInfo{
		name:  filename,
		data:  contents,
		lines: []int{0},
	}
}

func (f *FileInfo) Name() string {
	return f.name
}

// AddLine adds the offset representing the beginning of the "next" line in the file.
// The first line always starts at offset 0, the second line starts at offset-of-newline-char+1.
func (f *FileInfo) AddLine(offset int) {
	if offset < 0 {
		panic(fmt.Sprintf("invalid offset: %d must not be negative", offset))
	}
	if offset > len(f.data) {
		panic(fmt.Sprintf("invalid offset: %d is greater than file size %d", offset, len(f.data)))
	}

	if len(f.lines) > 0 {
		lastOffset := f.lines[len(f.lines)-1]
		if offset <= lastOffset {
			panic(fmt.Sprintf("invalid offset: %d is not greater than previously observed line offset %d", offset, lastOffset))
		}
	}

	f.lines = append(f.lines, offset)
}

// AddToken adds info about a token at the given location to this file. It
// returns a value that allows access to all of the token's details.
func (f *FileInfo) AddToken(offset, length int) Token {
	if offset < 0 {
		panic(fmt.Sprintf("invalid offset: %d must not be negative", offset))
	}
	if length < 0 {
		panic(fmt.Sprintf("invalid length: %d must not be negative", length))
	}
	if offset+length > len(f.data) {
		panic(fmt.Sprintf("invalid offset+length: %d is greater than file size %d", offset+length, len(f.data)))
	}

	tokenID := len(f.items)
	if len(f.items) > 0 {
		lastToken := f.items[tokenID-1]
		lastEnd := lastToken.offset + lastToken.length - 1
		if offset <= lastEnd {
			panic(fmt.Sprintf("invalid offset: %d is not greater than previously observed token end %d", offset, lastEnd))
		}
	}

	f.items = append(f.items, itemSpan{offset: offset, length: length})
	return Token(tokenID)
}

// AddComment adds info about a comment to this file. Comments must first be
// added as items via f.AddToken(). The given comment argument is the Token
// from that step. The given attributedTo argument indicates another token in the
// file with which the comment is associated. If comment's offset is before that
// of attributedTo, then this is a leading comment. Otherwise, it is a trailing
// comment.
func (f *FileInfo) AddComment(comment, attributedTo Token) Comment {
	if len(f.comments) > 0 {
		lastComment := f.comments[len(f.comments)-1]
		if int(comment) <= lastComment.index {
			panic(fmt.Sprintf("invalid index: %d is not greater than previously observed comment index %d", comment, lastComment.index))
		}
		if int(attributedTo) < lastComment.attributedToIndex {
			panic(fmt.Sprintf("invalid attribution: %d is not greater than previously observed comment attribution index %d", attributedTo, lastComment.attributedToIndex))
		}
	}

	f.comments = append(f.comments, commentInfo{index: int(comment), attributedToIndex: int(attributedTo)})
	return Comment{
		fileInfo: f,
		index:    len(f.comments) - 1,
	}
}

// NodeInfo returns details from the original source for the given AST node.
//
// If the given n is out of range, this returns an invalid NodeInfo (i.e.
// nodeInfo.IsValid() returns false). If the given n is not out of range but
// also from a different file than f, then the result is undefined.
func (f *FileInfo) NodeInfo(n Node) NodeInfo {
	return f.nodeInfo(int(n.Start()), int(n.End()))
}

// TokenInfo returns details from the original source for the given token.
//
// If the given t is out of range, this returns an invalid NodeInfo (i.e.
// nodeInfo.IsValid() returns false). If the given t is not out of range but
// also from a different file than f, then the result is undefined.
func (f *FileInfo) TokenInfo(t Token) NodeInfo {
	return f.nodeInfo(int(t), int(t))
}

func (f *FileInfo) nodeInfo(start, end int) NodeInfo {
	if start < 0 || start >= len(f.items) {
		return NodeInfo{fileInfo: f}
	}
	if end < 0 || end >= len(f.items) {
		return NodeInfo{fileInfo: f}
	}
	return NodeInfo{fileInfo: f, startIndex: start, endIndex: end}
}

// ItemInfo returns details from the original source for the given item.
//
// If the given i is out of range, this returns nil. If the given i is not
// out of range but also from a different file than f, then the result is
// undefined.
func (f *FileInfo) ItemInfo(i Item) ItemInfo {
	tok, cmt := f.GetItem(i)
	if tok != TokenError {
		return f.TokenInfo(tok)
	}
	if cmt.IsValid() {
		return cmt
	}
	return nil
}

// GetItem returns the token or comment represented by the given item. Only one
// of the return values will be valid. If the item is a token then the returned
// comment will be a zero value and thus invalid (i.e. comment.IsValid() returns
// false). If the item is a comment then the returned token will be TokenError.
//
// If the given i is out of range, this returns (TokenError, Comment{}). If the
// given i is not out of range but also from a different file than f, then
// the result is undefined.
func (f *FileInfo) GetItem(i Item) (Token, Comment) {
	if i < 0 || int(i) >= len(f.items) {
		return TokenError, Comment{}
	}
	if !f.isComment(i) {
		return Token(i), Comment{}
	}
	// It's a comment, so find its location in f.comments
	c := sort.Search(len(f.comments), func(c int) bool {
		return f.comments[c].index >= int(i)
	})
	if c < len(f.comments) && f.comments[c].index == int(i) {
		return TokenError, Comment{fileInfo: f, index: c}
	}
	// f.isComment(i) returned true, but we couldn't find it
	// in f.comments? Uh oh... that shouldn't be possible.
	return TokenError, Comment{}
}

func (f *FileInfo) isDummyFile() bool {
	return f == nil || f.lines == nil
}

// Sequence represents a navigable sequence of elements.
type Sequence[T any] interface {
	// First returns the first element in the sequence. The bool return
	// is false if this sequence contains no elements. For example, an
	// empty file has no items or tokens.
	First() (T, bool)
	// Next returns the next element in the sequence that comes after
	// the given element. The bool returns is false if there is no next
	// item (i.e. the given element is the last one). It also returns
	// false if the given element is invalid.
	Next(T) (T, bool)
	// Last returns the last element in the sequence. The bool return
	// is false if this sequence contains no elements. For example, an
	// empty file has no items or tokens.
	Last() (T, bool)
	// Previous returns the previous element in the sequence that comes
	// before the given element. The bool returns is false if there is no
	// previous item (i.e. the given element is the first one). It also
	// returns false if the given element is invalid.
	Previous(T) (T, bool)
}

func (f *FileInfo) Items() Sequence[Item] {
	return items{fileInfo: f}
}

func (f *FileInfo) Tokens() Sequence[Token] {
	return tokens{fileInfo: f}
}

type items struct {
	fileInfo *FileInfo
}

func (i items) First() (Item, bool) {
	if len(i.fileInfo.items) == 0 {
		return 0, false
	}
	return 0, true
}

func (i items) Next(item Item) (Item, bool) {
	if item < 0 || int(item) >= len(i.fileInfo.items)-1 {
		return 0, false
	}
	return i.fileInfo.itemForward(item+1, true)
}

func (i items) Last() (Item, bool) {
	if len(i.fileInfo.items) == 0 {
		return 0, false
	}
	return Item(len(i.fileInfo.items) - 1), true
}

func (i items) Previous(item Item) (Item, bool) {
	if item <= 0 || int(item) >= len(i.fileInfo.items) {
		return 0, false
	}
	return i.fileInfo.itemBackward(item-1, true)
}

type tokens struct {
	fileInfo *FileInfo
}

func (t tokens) First() (Token, bool) {
	i, ok := t.fileInfo.itemForward(0, false)
	return Token(i), ok
}

func (t tokens) Next(tok Token) (Token, bool) {
	if tok < 0 || int(tok) >= len(t.fileInfo.items)-1 {
		return 0, false
	}
	i, ok := t.fileInfo.itemForward(Item(tok+1), false)
	return Token(i), ok
}

func (t tokens) Last() (Token, bool) {
	i, ok := t.fileInfo.itemBackward(Item(len(t.fileInfo.items))-1, false)
	return Token(i), ok
}

func (t tokens) Previous(tok Token) (Token, bool) {
	if tok <= 0 || int(tok) >= len(t.fileInfo.items) {
		return 0, false
	}
	i, ok := t.fileInfo.itemBackward(Item(tok-1), false)
	return Token(i), ok
}

func (f *FileInfo) itemForward(i Item, allowComment bool) (Item, bool) {
	end := Item(len(f.items))
	for i < end {
		if allowComment || !f.isComment(i) {
			return i, true
		}
		i++
	}
	return 0, false
}

func (f *FileInfo) itemBackward(i Item, allowComment bool) (Item, bool) {
	for i >= 0 {
		if allowComment || !f.isComment(i) {
			return i, true
		}
		i--
	}
	return 0, false
}

// isComment is comment returns true if i refers to a comment.
// (If it returns false, i refers to a token.)
func (f *FileInfo) isComment(i Item) bool {
	item := f.items[i]
	if item.length < 2 {
		return false
	}
	// see if item text starts with "//" or "/*"
	if f.data[item.offset] != '/' {
		return false
	}
	c := f.data[item.offset+1]
	return c == '/' || c == '*'
}

func (f *FileInfo) SourcePos(offset int) SourcePos {
	lineNumber := sort.Search(len(f.lines), func(n int) bool {
		return f.lines[n] > offset
	})

	// If it weren't for tabs and multibyte unicode characters, we
	// could trivially compute the column just based on offset and the
	// starting offset of lineNumber :(
	// Wish this were more efficient... that would require also storing
	// computed line+column information, which would triple the size of
	// f's items slice...
	col := 0
	for i := f.lines[lineNumber-1]; i < offset; i++ {
		if f.data[i] == '\t' {
			nextTabStop := 8 - (col % 8)
			col += nextTabStop
		} else if utf8.RuneStart(f.data[i]) {
			col++
		}
	}

	return SourcePos{
		Filename: f.name,
		Offset:   offset,
		Line:     lineNumber,
		// Columns are 1-indexed in this AST
		Col: col + 1,
	}
}

// Token represents a single lexed token.
type Token int

// TokenError indicates an invalid token. It is returned from query
// functions when no valid token satisfies the request.
const TokenError = Token(-1)

// AsItem returns the Item that corresponds to t.
func (t Token) AsItem() Item {
	return Item(t)
}

func (t Token) asTerminalNode() terminalNode {
	return terminalNode(t)
}

// Item represents an item lexed from source. It represents either
// a Token or a Comment.
type Item int

// ItemInfo provides details about an item's location in the source file and
// its contents.
type ItemInfo interface {
	SourceSpan
	LeadingWhitespace() string
	RawText() string
}

// NodeInfo represents the details for a node or token in the source file's AST.
// It provides access to information about the node's location in the source
// file. It also provides access to the original text in the source file (with
// all the original formatting intact) and also provides access to surrounding
// comments.
type NodeInfo struct {
	fileInfo             *FileInfo
	startIndex, endIndex int
}

var _ ItemInfo = NodeInfo{}

// IsValid returns true if this node info is valid. If n is a zero-value struct,
// it is not valid.
func (n NodeInfo) IsValid() bool {
	return n.fileInfo != nil
}

// Start returns the starting position of the element. This is the first
// character of the node or token.
func (n NodeInfo) Start() SourcePos {
	if n.fileInfo.isDummyFile() {
		return UnknownPos(n.fileInfo.name)
	}

	tok := n.fileInfo.items[n.startIndex]
	return n.fileInfo.SourcePos(tok.offset)
}

// End returns the ending position of the element, exclusive. This is the
// location after the last character of the node or token. If n returns
// the same position for Start() and End(), the element in source had a
// length of zero (which should only happen for the special EOF token
// that designates the end of the file).
func (n NodeInfo) End() SourcePos {
	if n.fileInfo.isDummyFile() {
		return UnknownPos(n.fileInfo.name)
	}

	tok := n.fileInfo.items[n.endIndex]
	// find offset of last character in the span
	offset := tok.offset
	if tok.length > 0 {
		offset += tok.length - 1
	}
	pos := n.fileInfo.SourcePos(offset)
	if tok.length > 0 {
		// We return "open range", so end is the position *after* the
		// last character in the span. So we adjust
		pos.Col++
	}
	return pos
}

// LeadingWhitespace returns any whitespace prior to the element. If there
// were comments in between this element and the previous one, this will
// return the whitespace between the last such comment in the element. If
// there were no such comments, this returns the whitespace between the
// previous element and the current one.
func (n NodeInfo) LeadingWhitespace() string {
	if n.fileInfo.isDummyFile() {
		return ""
	}

	tok := n.fileInfo.items[n.startIndex]
	var prevEnd int
	if n.startIndex > 0 {
		prevTok := n.fileInfo.items[n.startIndex-1]
		prevEnd = prevTok.offset + prevTok.length
	}
	return string(n.fileInfo.data[prevEnd:tok.offset])
}

// LeadingComments returns all comments in the source that exist between the
// element and the previous element, except for any trailing comment on the
// previous element.
func (n NodeInfo) LeadingComments() Comments {
	if n.fileInfo.isDummyFile() {
		return EmptyComments
	}

	start := sort.Search(len(n.fileInfo.comments), func(i int) bool {
		return n.fileInfo.comments[i].attributedToIndex >= n.startIndex
	})

	if start == len(n.fileInfo.comments) || n.fileInfo.comments[start].attributedToIndex != n.startIndex {
		// no comments associated with this token
		return EmptyComments
	}

	numComments := 0
	for i := start; i < len(n.fileInfo.comments); i++ {
		comment := n.fileInfo.comments[i]
		if comment.attributedToIndex == n.startIndex &&
			comment.index < n.startIndex {
			numComments++
		} else {
			break
		}
	}

	return Comments{
		fileInfo: n.fileInfo,
		first:    start,
		num:      numComments,
	}
}

// TrailingComments returns the trailing comment for the element, if any.
// An element will have a trailing comment only if it is the last token
// on a line and is followed by a comment on the same line. Typically, the
// following comment is a line-style comment (starting with "//").
//
// If the following comment is a block-style comment that spans multiple
// lines, and the next token is on the same line as the end of the comment,
// the comment is NOT considered a trailing comment.
//
// Examples:
//
//	foo // this is a trailing comment for foo
//
//	bar /* this is a trailing comment for bar */
//
//	baz /* this is a trailing
//	       comment for baz */
//
//	fizz /* this is NOT a trailing
//	        comment for fizz because
//	        its on the same line as the
//	        following token buzz */       buzz
func (n NodeInfo) TrailingComments() Comments {
	if n.fileInfo.isDummyFile() {
		return EmptyComments
	}

	start := sort.Search(len(n.fileInfo.comments), func(i int) bool {
		comment := n.fileInfo.comments[i]
		return comment.attributedToIndex >= n.endIndex &&
			comment.index > n.endIndex
	})

	if start == len(n.fileInfo.comments) || n.fileInfo.comments[start].attributedToIndex != n.endIndex {
		// no comments associated with this token
		return EmptyComments
	}

	numComments := 0
	for i := start; i < len(n.fileInfo.comments); i++ {
		comment := n.fileInfo.comments[i]
		if comment.attributedToIndex == n.endIndex {
			numComments++
		} else {
			break
		}
	}

	return Comments{
		fileInfo: n.fileInfo,
		first:    start,
		num:      numComments,
	}
}

// RawText returns the actual text in the source file that corresponds to the
// element. If the element is a node in the AST that encompasses multiple
// items (like an entire declaration), the full text of all items is returned
// including any interior whitespace and comments.
func (n NodeInfo) RawText() string {
	startTok := n.fileInfo.items[n.startIndex]
	endTok := n.fileInfo.items[n.endIndex]
	return string(n.fileInfo.data[startTok.offset : endTok.offset+endTok.length])
}

// SourcePos identifies a location in a proto source file.
type SourcePos struct {
	Filename string
	// The line and column numbers for this position. These are
	// one-based, so the first line and column is 1 (not zero). If
	// either is zero, then the line and column are unknown and
	// only the file name is known.
	Line, Col int
	// The offset, in bytes, from the beginning of the file. This
	// is zero-based: the first character in the file is offset zero.
	Offset int
}

func (pos SourcePos) String() string {
	if pos.Line <= 0 || pos.Col <= 0 {
		return pos.Filename
	}
	return fmt.Sprintf("%s:%d:%d", pos.Filename, pos.Line, pos.Col)
}

// SourceSpan represents a range of source positions.
type SourceSpan interface {
	Start() SourcePos
	End() SourcePos
}

// NewSourceSpan creates a new span that covers the given range.
func NewSourceSpan(start SourcePos, end SourcePos) SourceSpan {
	return sourceSpan{StartPos: start, EndPos: end}
}

type sourceSpan struct {
	StartPos SourcePos
	EndPos   SourcePos
}

func (p sourceSpan) Start() SourcePos {
	return p.StartPos
}

func (p sourceSpan) End() SourcePos {
	return p.EndPos
}

var _ SourceSpan = sourceSpan{}

// Comments represents a range of sequential comments in a source file
// (e.g. no interleaving items or AST nodes).
type Comments struct {
	fileInfo   *FileInfo
	first, num int
}

// EmptyComments is an empty set of comments.
var EmptyComments = Comments{}

// Len returns the number of comments in c.
func (c Comments) Len() int {
	return c.num
}

func (c Comments) Index(i int) Comment {
	if i < 0 || i >= c.num {
		panic(fmt.Sprintf("index %d out of range (len = %d)", i, c.num))
	}
	return Comment{
		fileInfo: c.fileInfo,
		index:    c.first + i,
	}
}

// Comment represents a single comment in a source file. It indicates
// the position of the comment and its contents. A single comment means
// one line-style comment ("//" to end of line) or one block comment
// ("/*" through "*/"). If a longer comment uses multiple line comments,
// each line is considered to be a separate comment. For example:
//
//	// This is a single comment, and
//	// this is a separate comment.
type Comment struct {
	fileInfo *FileInfo
	index    int
}

var _ ItemInfo = Comment{}

// IsValid returns true if this comment is valid. If this comment is
// a zero-value struct, it is not valid.
func (c Comment) IsValid() bool {
	return c.fileInfo != nil && c.index >= 0
}

// AsItem returns the Item that corresponds to c.
func (c Comment) AsItem() Item {
	return Item(c.fileInfo.comments[c.index].index)
}

func (c Comment) Start() SourcePos {
	span := c.fileInfo.items[c.AsItem()]
	return c.fileInfo.SourcePos(span.offset)
}

func (c Comment) End() SourcePos {
	span := c.fileInfo.items[c.AsItem()]
	return c.fileInfo.SourcePos(span.offset + span.length - 1)
}

func (c Comment) LeadingWhitespace() string {
	item := c.AsItem()
	span := c.fileInfo.items[item]
	var prevEnd int
	if item > 0 {
		prevItem := c.fileInfo.items[item-1]
		prevEnd = prevItem.offset + prevItem.length
	}
	return string(c.fileInfo.data[prevEnd:span.offset])
}

func (c Comment) RawText() string {
	span := c.fileInfo.items[c.AsItem()]
	return string(c.fileInfo.data[span.offset : span.offset+span.length])
}