# Lexical Analyser

Owner: André Geraldo 

Course: Computer Enginnering

Steps:

1- Read a file containing a source code with a specific syntax

2 - User a buffer to take each line and classify them 

3 - Regex will recognise if a token is valid and store them

4 - store each token in the the following format:
 [Token, Lexema, Line, Column]

In [34]:
#import of regex
from typing import NamedTuple
import re

class Token(NamedTuple):
  type: str
  lexema: str
  line: int
  col: int

class LexicalAnalyser:

  def __init__(self, source_code):
    self.__keywords = {
        'BeginFun','EndFun','if','then','else','elif',
        'end','funLoopWhile','do','endFunLoop',
        'showMeTheCode','grabInput','funny'
        }
    self.__token_pair = [
        ('TK_NUM', r'\d+'),
        ('TK_ATRIB', r'<-'),
        ('TK_PERIOD', r'\.'),
        ('TK_ID', r'[A-Za-z]([A-Za-z]|\d|_)*'),
        ('TK_STRING', r'^\"([A-Za-z]|\d|\.|%|:| )*\"$'), #String com aspas
        ('TK_OP_AR', r'[-+*\/]'), #Operações aritmética
        ('TK_OP_RE', r'<>|[=<>]'),#Operação relacional  
        ('TK_BOOL', r'[|&]'),       
        ('TK_OPEN_P', r'\('),
        ('TK_CLOSE_P', r'\)'),
        ('TK_COMMA', r','),
        ('TK_NEW_LINE',r'\n'),
        ('TK_SKIP', r'[\ \t\r]+'),
        ('MISMATCH', r'.'),                
      ]                            
    self.__source_code = source_code

  def scanner(self):
    
    token_base = self.__token_pair
    buffer = self.__source_code
    #Cria grupos com o nome de cada par de token no formato: (?P<nomeTk>regex)
    #Concatena cada par com '|' que vai servir como OU em passos seguintes
    regex_rules = '|'.join('(?P<%s>%s)' % strPair for strPair in token_base)
    line = 1
    col_start = 0

    #tkType: Pega o nome do padrão reconhecido mais recente. Dependendo 
    #do padrão existem algumas correções antes de salvar como um token
    #lexema: pega o valor extraído quando o padrão é reconhecido
    #column: calcula o valor inicial da coluno do token encontrado

    for matchedPattern in re.finditer(regex_rules, buffer):
      tkType = matchedPattern.lastgroup
      lexema = matchedPattern.group()
      column = matchedPattern.start()+1 - col_start
      #print("LASTGROUP: ", kind)
      #print("\nVALUE: ", mo.group())
      #print("\nCOLLUMN: ", mo.start() - line_start)
      if tkType == 'TK_NUM':
        lexema = int(lexema)
      elif lexema in self.__keywords :
        tkType = lexema
      elif tkType == 'TK_NEW_LINE':
        col_start = matchedPattern.end()
        line += 1
        continue
      elif tkType == 'TK_SKIP':
        continue
      elif tkType == 'MISMATCH':
        raise RuntimeError(f'{tkType!r} valor inesperado na linha {line}')
      yield Token(tkType,lexema,line, column)
    
  def tokenize(self, tkType, lexema, line, column):
    #Retorna uma função geradora a quem chamou. A execução começa
    #apenas quando o gerador é iterado
    #Vantagem: Nenhuma memória é alocada quando o yield é usado
    #O parâmetro é uma tupla nomeada (NamedTuple)
    yield Token(tkType, lexema, line, column)
  
  def generateToken():
    scanner()



def main():
  #read the current fun lang file in test.lang
  # and store it in a buffer

  content = ""
  with open("lang.txt", "r") as file:
    content = file.read()

  lexico = LexicalAnalyser(content).scanner()

  for token in lexico:
    print(token)
main()

Token(type='BeginFun', lexema='BeginFun', line=1, col=1)
Token(type='if', lexema='if', line=2, col=2)
Token(type='TK_ID', lexema='idade', line=2, col=5)
Token(type='TK_OP_RE', lexema='<', line=2, col=11)
Token(type='TK_NUM', lexema=10, line=2, col=13)
Token(type='TK_BOOL', lexema='&', line=2, col=16)
Token(type='TK_ID', lexema='anoNasc', line=2, col=18)
Token(type='TK_OP_RE', lexema='<>', line=2, col=26)
Token(type='TK_NUM', lexema=10, line=2, col=29)
Token(type='then', lexema='then', line=2, col=32)
Token(type='funLoopWhile', lexema='funLoopWhile', line=3, col=2)
Token(type='TK_ID', lexema='valor_ethereum', line=3, col=15)
Token(type='TK_OP_RE', lexema='<', line=3, col=30)
Token(type='TK_ID', lexema='valor_bitcoin', line=3, col=32)
Token(type='do', lexema='do', line=3, col=46)
Token(type='showMeTheCode', lexema='showMeTheCode', line=4, col=3)
Token(type='TK_ID', lexema='investYourMoney', line=4, col=17)
Token(type='TK_PERIOD', lexema='.', line=4, col=32)
Token(type='endFunLoop', lexem

In [None]:
import re

text_to_search = '''
BeginFun
  funny variable.
  variable <- 10 * (124+ 666).
EndFun

'''

regexRules = '''

'''
pattern = re.compile(r'\d+')

matches = pattern.finditer(text_to_search)

for match in matches:
  print(match)

print(text_to_search[12:17])

<re.Match object; span=(42, 44), match='10'>
<re.Match object; span=(48, 51), match='124'>
<re.Match object; span=(53, 56), match='666'>
funny


In [None]:
list1 = [('NUMBER', r'\d+'),( 'ASSIGN',  r':=') ]
list2 = [('NOME_TOKEN', r'REGEX_EXP'),('NOME_TOKEN2', r'REGEX_EXP2') ]
token = "|".join('(?P<%s>%s)' % pair for pair in list1)



print(token)



(?P<NUMBER>\d+)|(?P<ASSIGN>:=)


In [13]:
from typing import NamedTuple
import re


class Token(NamedTuple):
  type: str
  value: str
  line: int
  column: int


def tokenize(code):
  keywords = {'BeginFun','EndFun','if','then','else','elif','end','funLoopWhile','do','endFunLoop',
              'showMeTheCode','grabInput','funny'}
  token_specification = [
    ('TK_NUM', r'\d+'),
    ('TK_ATRIB', r'<-'),
    ('TK_PERIOD', r'\.'),
    ('TK_ID', r'[A-Za-z]([A-Za-z]|\d|_)*'),
    ('TK_STRING', r'^\"([A-Za-z]|\d|\.|%|:| )*\"$'), #String com aspas
    ('TK_OP_AR', r'[-+*\/]'), #Oerações aritmética
    ('TK_OP_RE', r'[=<>]|<>'),#Operação relacional  
    ('TK_BOOL', r'[|&]'),       
    ('TK_OPEN_P', r'\('),
    ('TK_CLOSE_P', r'\)'),
    ('TK_COMMA', r','),
    ('TK_NEW_LINE',r'\n'),
    ('TK_SKIP', r'[\ \t\r]+'),
    ('MISMATCH', r'.'),                
  ]            

  tk_regex_rules = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)

  line_num = 1
  line_start = 0
  for mo in re.finditer(tk_regex_rules,code):
    kind = mo.lastgroup
    value = mo.group()
    column = mo.start() - line_start
    #print("LASTGROUP: ", kind)
     #print("\nVALUE: ", mo.group())
    #print("\nCOLLUMN: ", mo.start() - line_start)
    if kind == 'TK_NUM':
      value = int(value)
    elif value in keywords :
      kind = value
    elif kind == 'TK_NEW_LINE':
      line_start = mo.end()
      line_num += 1
      continue
    elif kind == 'TK_SKIP':
      continue
    elif kind == 'MISMATCH':
      raise RuntimeError(f'{value!r} unexpected on line {line_num}')
    yield Token(kind,value, line_num, column)

    
code_source = '''
BeginFun

 if idade < 10 & anoNasc <> 10 then

	funLoopWhile valor_ethereum < valor_bitcoin do

		showMeTheCode investYourMoney.

	endFunLoop.

 end.
 
EndFun
'''
#tokenize(code_source)
for token in tokenize(code_source):
  print(token)


Token(type='BeginFun', value='BeginFun', line=2, column=0)
Token(type='if', value='if', line=4, column=1)
Token(type='TK_ID', value='idade', line=4, column=4)
Token(type='TK_OP_RE', value='<', line=4, column=10)
Token(type='TK_NUM', value=10, line=4, column=12)
Token(type='TK_BOOL', value='&', line=4, column=15)
Token(type='TK_ID', value='anoNasc', line=4, column=17)
Token(type='TK_OP_RE', value='<', line=4, column=25)
Token(type='TK_OP_RE', value='>', line=4, column=26)
Token(type='TK_NUM', value=10, line=4, column=28)
Token(type='then', value='then', line=4, column=31)
Token(type='funLoopWhile', value='funLoopWhile', line=6, column=1)
Token(type='TK_ID', value='valor_ethereum', line=6, column=14)
Token(type='TK_OP_RE', value='<', line=6, column=29)
Token(type='TK_ID', value='valor_bitcoin', line=6, column=31)
Token(type='do', value='do', line=6, column=45)
Token(type='showMeTheCode', value='showMeTheCode', line=8, column=2)
Token(type='TK_ID', value='investYourMoney', line=8, column=

In [None]:
def testYield():
  yield "Welcome to Brazil !"
output = testYield()
print(list(output))

['Welcome to Brazil !']


In [None]:
def getFibonnaciSeries(num):
    c1, c2 = 0, 1
    count = 0
    while count < num:
        yield c1
        c3 = c1 + c2
        c1 = c2
        c2 = c3
        count += 1
fin = getFibonnaciSeries(7)
print(fin)
#for i in fin:
  #  print(i)

<generator object getFibonnaciSeries at 0x7f66e9d3fa50>


In [None]:
from collections import namedtuple

#declaring namedtuple()
Student = namedtuple('Student',['name','age','DOB'])

#puting values
S = Student('Nandini','19','2532423')

#Access using index
# print("The Student age using index is: ", end = "")

Point2D = namedtuple('Point2D',['x','y'])
print(Point2D.)

<property object at 0x7f66e9d12d10>


In [None]:
from typing import NamedTuple

class Transaction(NamedTuple):
  sender: str
  receiver: str
  date: str
  amount: float

def makeTransaction():
  yield Transaction("me","him","18/09/2020",300.00)

transaction = makeTransaction()
print(transaction)

print(next(transaction))

<generator object makeTransaction at 0x7f66e9d23dd0>
Transaction(sender='me', receiver='him', date='18/09/2020', amount=300.0)
