# Lexical Analysis Using ANTLR

In this assignment, you will use ANTLR to perform lexical analysis of a fictitious programming language.

You are required to perform lexical analysis on several test files.

1. Complete mylexer/MyLexer.g4 with the appropriate token types and lexer rules.



2. Complete the Kotlin extension function in Work Unit 1, which is used by the `analyze(...)` function given in this worksheet.


Your code is expected to pass all the checkpoints.

We need to load the dependencies into the Kotlin kernel.

1. ANTLR runtime classes that are needed to perform lexical analysis.


2. Your compiled lexer classes.

In [1]:
@file:DependsOn("/data/shared/antlr-4.9.1-complete.jar")
@file:DependsOn(".")

Since the dependencies are loaded into Kotlin, we can import the needed classes.

In [2]:
import org.antlr.v4.runtime.*
import mylexer.MyLexer

## Work Unit 1

You need to extend the `Token` class with a `print(lexer)` method which will print the following information:

1. token lexeme
2. token type as defined by the lexer.
3. line number of the token.
4. character position of the token.

Hint: use `lexer.tokenNames` to look up the token type name.

In [3]:
/**
 * Prints a one-line description of this token to standard output.
 *
 * The line has the form
 * `lexeme="<text>" type=<type name> line=<line> position=<column>`.
 *
 * @param lexer the lexer that produced this token; its vocabulary is used to
 *              translate the numeric token type into a readable name.
 */
fun Token.print(lexer: MyLexer) {
    // Vocabulary.getDisplayName is the non-deprecated replacement for the
    // tokenNames array: it returns the literal name (e.g. '(') when one exists,
    // otherwise the symbolic name (e.g. ID), reports "EOF" for the EOF token,
    // and never throws for a type outside the vocabulary.
    val typeName = lexer.vocabulary.getDisplayName(type)
    println("lexeme=\"$text\" type=$typeName line=$line position=$charPositionInLine")
}

This function uses your `Token.print` and the lexical analyzer to print out the tokens of a given file.

In [4]:
/**
 * Runs the lexer over the given source file and prints every token it
 * produces, one per line, using [Token.print].
 *
 * A lexical error aborts tokenization: the error listener below throws, the
 * exception is caught here, and its description is printed instead of the
 * token list. I/O failures while opening the file are not caught and
 * propagate to the caller.
 *
 * @param filename path of the source file to tokenize.
 */
fun analyze(filename: String) {
    // CharStreams.fromFileName replaces the deprecated ANTLRFileStream and
    // decodes the file as UTF-8 rather than the platform default charset.
    val input = CharStreams.fromFileName(filename)
    val lexer = MyLexer(input)

    // Turn the first reported lexical error into an exception so that
    // tokenization stops instead of silently recovering.
    lexer.addErrorListener(object : BaseErrorListener() {
        override fun syntaxError(
            recognizer: Recognizer<*, *>?,
            offendingSymbol: Any?,
            line: Int,
            pos: Int,
            msg: String?,
            e: RecognitionException?
        ) {
            // The ANTLR API allows `e` to be null; fall back to the message
            // in that case rather than failing on the null argument.
            throw Exception("${e ?: msg} at line:${line}, char:${pos}")
        }
    })

    val tokens = CommonTokenStream(lexer)

    try {
        // fill() lexes the whole input up front, so nothing is printed
        // unless the entire file tokenizes successfully.
        tokens.fill()
        tokens.tokens.forEach { it.print(lexer) }
    } catch (e: Exception) {
        print(e)
    }
}

# Checkpoints

In [5]:
analyze("./tests/identifiers.src")

lexeme="hello" type=ID line=1 position=0
lexeme="world" type=ID line=2 position=0


lexeme="<EOF>" type=EOF line=3 position=0


In [6]:
analyze("./tests/keywords.src")

lexeme="var" type=KEYWORD line=1 position=0
lexeme="as" type=KEYWORD line=2 position=1


lexeme="return" type=KEYWORD line=3 position=2
lexeme="if" type=KEYWORD line=4 position=3


lexeme="else" type=KEYWORD line=4 position=6
lexeme="<EOF>" type=EOF line=6 position=0


In [7]:
analyze("./tests/nums.src")

lexeme="1" type=NUM line=1 position=0
lexeme="12" type=NUM line=2 position=0


lexeme="123" type=NUM line=3 position=0
lexeme="-1" type=NUM line=5 position=0


lexeme="-12" type=NUM line=6 position=0
lexeme="-123" type=NUM line=7 position=0


lexeme="<EOF>" type=EOF line=8 position=0


In [8]:
analyze("./tests/strings.src")

lexeme=""hello world"" type=STRING line=1 position=0


lexeme=""This course is \"CSCI 4020U\""" type=STRING line=3 position=0


lexeme="""" type=STRING line=5 position=0
lexeme="<EOF>" type=EOF line=7 position=0


In [9]:
analyze("./tests/chars.src")

lexeme="'a'" type=CHAR line=1 position=0
lexeme="'b'" type=CHAR line=2 position=0


lexeme="'d'" type=CHAR line=3 position=0
lexeme="'\\'" type=CHAR line=4 position=0


lexeme="'\n'" type=CHAR line=5 position=0
lexeme="<EOF>" type=EOF line=6 position=0


In [10]:
analyze("./tests/a.src")

lexeme="/* a.src
This is a simple addition of two integers.
*/" type=COMMENT line=1 position=0


lexeme="var" type=KEYWORD line=5 position=0
lexeme="x" type=ID line=5 position=4


lexeme="as" type=KEYWORD line=5 position=6
lexeme="Int" type=ID line=5 position=9


lexeme="(" type='(' line=5 position=12
lexeme="12" type=NUM line=5 position=13


lexeme=")" type=')' line=5 position=15
lexeme=";" type=';' line=5 position=16


lexeme="var" type=KEYWORD line=6 position=0
lexeme="y" type=ID line=6 position=4


lexeme="as" type=KEYWORD line=6 position=6
lexeme="Int" type=ID line=6 position=9


lexeme="(" type='(' line=6 position=12
lexeme="21" type=NUM line=6 position=13


lexeme=")" type=')' line=6 position=15
lexeme=";" type=';' line=6 position=16


lexeme="print" type=ID line=8 position=0
lexeme="(" type='(' line=8 position=5


lexeme="x" type=ID line=8 position=6
lexeme="+" type='+' line=8 position=8


lexeme="y" type=ID line=8 position=10
lexeme=")" type=')' line=8 position=11


lexeme=";" type=';' line=8 position=12
lexeme="<EOF>" type=EOF line=9 position=0


In [11]:
analyze("./tests/b.src")

lexeme="/* b.src
This implements a function that finds the index of a letter in
a text.


*/" type=COMMENT line=1 position=0
lexeme="function" type=KEYWORD line=6 position=0


lexeme="f" type=ID line=6 position=9
lexeme="(" type='(' line=6 position=10


lexeme="text" type=ID line=6 position=11
lexeme="," type=',' line=6 position=15


lexeme="letter" type=ID line=6 position=17
lexeme=")" type=')' line=6 position=23


lexeme="{" type='{' line=6 position=25
lexeme="for" type=ID line=7 position=2


lexeme="(" type='(' line=7 position=5
lexeme="(" type='(' line=7 position=6


lexeme="i" type=ID line=7 position=7
lexeme="," type=',' line=7 position=8


lexeme="c" type=ID line=7 position=9
lexeme=")" type=')' line=7 position=10


lexeme="in" type=ID line=7 position=12
lexeme="enumerate" type=ID line=7 position=15


lexeme="(" type='(' line=7 position=24
lexeme="text" type=ID line=7 position=25


lexeme=")" type=')' line=7 position=29
lexeme=")" type=')' line=7 position=30


lexeme="{" type='{' line=7 position=32
lexeme="if" type=KEYWORD line=8 position=4


lexeme="(" type='(' line=8 position=6
lexeme="c" type=ID line=8 position=7


lexeme="=" type='=' line=8 position=9
lexeme="=" type='=' line=8 position=10


lexeme="letter" type=ID line=8 position=12
lexeme=")" type=')' line=8 position=18


lexeme="{" type='{' line=8 position=20
lexeme="return" type=KEYWORD line=9 position=6


lexeme="i" type=ID line=9 position=13
lexeme=";" type=';' line=9 position=14


lexeme="}" type='}' line=10 position=4
lexeme="}" type='}' line=11 position=2


lexeme="}" type='}' line=12 position=0
lexeme="print" type=ID line=14 position=0


lexeme="(" type='(' line=14 position=5
lexeme="f" type=ID line=14 position=6


lexeme="(" type='(' line=14 position=7
lexeme=""hello world"" type=STRING line=14 position=8


lexeme="," type=',' line=14 position=21
lexeme="'o'" type=CHAR line=14 position=23


lexeme=")" type=')' line=14 position=26
lexeme=")" type=')' line=14 position=27


lexeme=";" type=';' line=14 position=28
lexeme="print" type=ID line=15 position=0


lexeme="(" type='(' line=15 position=5
lexeme="f" type=ID line=15 position=6


lexeme="(" type='(' line=15 position=7
lexeme=""hell\"o\" world"" type=STRING line=15 position=8


lexeme="," type=',' line=15 position=25
lexeme="'"'" type=CHAR line=15 position=27


lexeme=")" type=')' line=15 position=30
lexeme=")" type=')' line=15 position=31


lexeme=";" type=';' line=15 position=32
lexeme="<EOF>" type=EOF line=16 position=0
