/
tok.py
117 lines (93 loc) · 2.8 KB
/
tok.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# 4 kinds of tokens:
# word
# command input
# conjunctor (comma)
# end of sentence (period, optional & ignored)
class Token:
    """Abstract marker base for all token kinds; carries no data or behavior."""
class WordToken(Token):
    """A plain word taken verbatim from the sentence."""

    def __init__(self, word: str):
        self.word = word

    def __repr__(self) -> str:
        return f'"{self.word}"'

    def __eq__(self, tok: object) -> bool:
        # Equal to another WordToken with the same word, or to a bare str.
        # Any other Token kind is never equal; non-Token/non-str operands
        # defer to the other side via NotImplemented.
        # (Strict `type(...) is` checks are deliberate: subclasses and str
        # subclasses keep the original, exact-type semantics.)
        if type(tok) is WordToken:
            return self.word == tok.word
        elif type(tok) is str:
            return self.word == tok
        elif isinstance(tok, Token):
            return False
        else:
            return NotImplemented

    def __hash__(self):
        # Hash like the underlying word so WordToken("x") and "x" (which
        # compare equal) land in the same hash bucket.
        return hash(self.word)
class CommandInputToken(Token):
    """A backtick-delimited command-input span, e.g. `ls -l`.

    A token whose content is None is a wildcard ("catch-all") that compares
    equal to any CommandInputToken.
    """

    def __init__(self, content: str):
        self.content = content

    def __repr__(self) -> str:
        return f'`{self.content}`'

    def __eq__(self, tok: object) -> bool:
        if type(tok) is CommandInputToken:
            # None content is a catch-all that matches any command input.
            if self.content is None:
                return True
            return self.content == tok.content
        elif isinstance(tok, Token):
            return False
        else:
            return NotImplemented

    def __hash__(self):
        # All command inputs share one hash: a wildcard and a concrete token
        # can compare equal, so they must never hash differently.
        return hash('cmd')

    @staticmethod
    def placeholder() -> 'CommandInputToken':
        """Return a wildcard token that equals any CommandInputToken."""
        # @staticmethod added: without it, calling placeholder() on an
        # *instance* passed self as `content` and built a broken token.
        return CommandInputToken(None)
class ConjunctorToken(Token):
    """The list conjunctor: a comma, which also matches the word 'and'."""

    def __repr__(self) -> str:
        return f','

    def __eq__(self, tok: object) -> bool:
        # NOTE(review): this equality is asymmetric — WordToken.__eq__
        # returns False for a ConjunctorToken, so only the
        # `conjunctor == word` direction matches 'and'. Preserved as-is.
        if type(tok) is WordToken:
            return tok.word == 'and'
        if isinstance(tok, Token):
            return type(tok) is ConjunctorToken
        return NotImplemented

    def __hash__(self):
        return hash(',')
def tokenize(sentence: str) -> list:
    """Split a sentence into a flat list of tokens.

    Rules:
    - a `...` span becomes a CommandInputToken (an unterminated span takes
      the rest of the string);
    - ',' becomes a ConjunctorToken;
    - '.', '?' and spaces are skipped (end-of-sentence marks are optional);
    - any other run of characters becomes a WordToken.

    Fixes over the previous version:
    - a word now ends at ',', '.', '?' or '`' as well as at a space, so
      "foo, bar" yields word/conjunctor/word instead of the word "foo,";
    - a lone trailing '`' no longer raises IndexError.
    """
    delimiters = {' ', ',', '.', '?', '`'}
    tokens = []
    i = 0
    n = len(sentence)
    while i < n:
        c = sentence[i]
        if c == '`':
            # Command input: everything up to the closing backtick.
            start = i + 1
            i = start
            while i < n and sentence[i] != '`':
                i += 1
            tokens.append(CommandInputToken(sentence[start:i]))
            i += 1  # step past the closing backtick (harmless past the end)
        elif c == ',':
            # Conjunctor.
            tokens.append(ConjunctorToken())
            i += 1
        elif c in {'.', ' ', '?'}:
            # Ignore spaces and end-of-sentence punctuation.
            i += 1
        else:
            # Word: a run of characters up to the next delimiter.
            start = i
            while i < n and sentence[i] not in delimiters:
                i += 1
            tokens.append(WordToken(sentence[start:i]))
    return tokens