In [523]:
from EduNLP.Tokenizer import CharTokenizer, SpaceTokenizer, CustomTokenizer, PureTextTokenizer, AstFormulaTokenizer, get_tokenizer


# Tokenizers

The basic tokenization containers currently available include:

- CharTokenizer
- SpaceTokenizer
- CustomTokenizer
- PureTextTokenizer
- AstFormulaTokenizer

Here is the detailed introduction for each of them.

## CustomTokenizer

- `custom` Tokenizer can tokenize SIF items by customized configuration. The supported custom content includes: pre-tokenized content, image attribute information, etc. You can specify the stop words in order to filter a subset of content

- This tokenizer works almost the same as the pure text tokenizer

In [524]:
# Custom tokenizer with symbol='f': in the recorded output every formula of the
# stem shows up as the '[FORMULA]' placeholder.
question = {
    "stem": "文具店有 $600$ 本练习本，卖出一些后，还剩 $4$ 包，每包 $25$ 本，卖出多少本？",
    "options": ["1", "2"],
}
items = [question]
tokenizer = get_tokenizer("custom", symbol="f")

# `key` picks the field to tokenize; `next` pulls the first token list.
tokens = tokenizer(items, key=lambda record: record["stem"])
print(next(tokens))


['文具店', '[FORMULA]', '练习本', '卖出', '剩', '[FORMULA]', '包', '每包', '[FORMULA]', '卖出']


In [525]:
# Custom tokenizer with symbol='t': in the recorded output the text parts of
# the stem show up as the '[TEXT]' placeholder while formulas are tokenized.
question = {
    "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
    "options": ["1", "2"],
}
items = [question]
tokenizer = get_tokenizer("custom", symbol="t")

# `key` picks the field to tokenize; `next` pulls the first token list.
tokens = tokenizer(items, key=lambda record: record["stem"])
print(next(tokens))

['[TEXT]', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', '[TEXT]', 'A', '\\cap', 'B', '=']


## CharTokenizer

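- `char` Tokenizer splits the text into individual characters, each of which becomes a separate token (as the examples below show, runs of Latin letters or digits such as `600` or `left` are kept together as one token). You can specify stop words in order to filter out a subset of the content.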
    

In [526]:
# Char tokenizer with the two full-width punctuation marks passed as stop
# words; neither of them appears in the recorded output.
punctuation = set("，？")
question = {
    "stem": "文具店有 $600$ 本练习本，卖出一些后，还剩 $4$ 包，每包 $25$ 本，卖出多少本？",
    "options": ["1", "2"],
}
items = [question]
tokenizer = get_tokenizer("char", stop_words=punctuation)

# `key` picks the field to tokenize; `next` pulls the first token list.
tokens = tokenizer(items, key=lambda record: record["stem"])
print(next(tokens))


['文', '具', '店', '有', '$', '600', '$', '本', '练', '习', '本', '卖', '出', '一', '些', '后', '还', '剩', '$', '4', '$', '包', '每', '包', '$', '25', '$', '本', '卖', '出', '多', '少', '本']


In [527]:
# Char tokenizer with its default settings, applied to a formula-heavy stem.
question = {
    "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
    "options": ["1", "2"],
}
items = [question]
tokenizer = get_tokenizer("char")

# `key` picks the field to tokenize; `next` pulls the first token list.
tokens = tokenizer(items, key=lambda record: record["stem"])
print(next(tokens))

['已', '知', '集', '合', '$', 'A', '=', '\\', 'left', '\\', '{', 'x', '\\', 'mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\', 'right', '\\', '}', ',', '\\', 'quad', 'B', '=', '\\', '{', '-', '4', ',', '1', ',', '3', ',', '5', '\\', '}', ',', '\\', 'quad', '$', '则', '$', 'A', '\\', 'cap', 'B', '=', '$']


## SpaceTokenizer

- `space` Tokenizer splits the text based on the space. You can specify the stop words in order to filter a subset of content
    

In [528]:
# Space tokenizer on a plain string item, with an empty stop-word list.
stem_text = "文具店有 $600$ 本练习本，卖出一些后，还剩 $4$ 包，每包 $25$ 本，卖出多少本？"
items = [stem_text]
tokenizer = get_tokenizer("space", stop_words=[])

# Items are plain strings here, so no `key` selector is passed.
tokens = tokenizer(items)
print(next(tokens))


['文具店有', '$600$', '本练习本，卖出一些后，还剩', '$4$', '包，每包', '$25$', '本，卖出多少本？']


In [529]:
# Space tokenizer on a dict item, with an empty stop-word list.
question = {
    "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
    "options": ["1", "2"],
}
items = [question]
tokenizer = get_tokenizer("space", stop_words=[])

# `key` picks the field to tokenize; `next` pulls the first token list.
tokens = tokenizer(items, key=lambda record: record["stem"])
print(next(tokens))

['已知集合$A=\\left\\{x', '\\mid', 'x^{2}-3', 'x-4<0\\right\\},', '\\quad', 'B=\\{-4,1,3,5\\},', '\\quad$', '则', '$A', '\\cap', 'B=$']


## PureTextTokenizer
- `pure_text` Tokenizer treats all the elements in a SIF item as pure text. Specifically, it tokenizes formulas as text. Besides, excess content can be discarded. The tokens will be a collection of simple words.

In [530]:
# Pure-text tokenizer on a plain string item, with an empty stop-word list.
stem_text = "文具店有 $600$ 本练习本，卖出一些后，还剩 $4$ 包，每包 $25$ 本，卖出多少本？"
items = [stem_text]
tokenizer = get_tokenizer("pure_text", stop_words=[])

# Items are plain strings here, so no `key` selector is passed.
tokens = tokenizer(items)
print(next(tokens))

['文具店', '600', '练习本', '卖出', '剩', '4', '包', '每包', '25', '卖出']


In [531]:
# Pure-text tokenizer on a dict item, with an empty stop-word list.
question = {
    "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
    "options": ["1", "2"],
}
items = [question]
tokenizer = get_tokenizer("pure_text", stop_words=[])

# `key` picks the field to tokenize; `next` pulls the first token list.
tokens = tokenizer(items, key=lambda record: record["stem"])
print(next(tokens))

['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '=']


In [532]:
# Pure-text tokenizer on an item containing SIF markers (formula figures,
# a figure, a separator and a blank). In the recorded output the figure,
# separator and blank appear as '[FIGURE]', '[SEP]' and '[MARK]'.
sif_text = "有公式$\\FormFigureID{1}$，如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$"
items = [sif_text]

tokenizer = get_tokenizer("pure_text")
tokens = tokenizer(items)
print(next(tokens))

['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']


## AstFormulaTokenizer

- `ast_formula` Tokenizer abstracts the mathematical formulas in text. For example, variables that appear will be recorded and tokenized in turn, and objects such as expressions and pictures will be preprocessed as tokens. You can specify the stop words in order to filter a subset of content
    

In [533]:
# AST-formula tokenizer on a plain string item; in the recorded output the
# numbers inside formulas are abstracted to 'textord' tokens.
stem_text = "文具店有 $600$ 本练习本，卖出一些后，还剩 $4$ 包，每包 $25$ 本，卖出多少本？"
items = [stem_text]
tokenizer = get_tokenizer("ast_formula")

# Items are plain strings here, so no `key` selector is passed.
tokens = tokenizer(items)
print(next(tokens))

['文具店', 'textord', 'textord', 'textord', '练习本', '卖出', '剩', 'textord', '包', '每包', 'textord', 'textord', '卖出']


In [534]:
# AST-formula tokenizer on a dict item; in the recorded output variables are
# abstracted to numbered 'mathord_<n>' tokens and numbers to 'textord'.
question = {
    "stem": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$",
    "options": ["1", "2"],
}
items = [question]
tokenizer = get_tokenizer("ast_formula")

# `key` picks the field to tokenize; `next` pulls the first token list.
tokens = tokenizer(items, key=lambda record: record["stem"])
print(next(tokens))

['已知', '集合', 'mathord_0', '=', 'mathord_1', '\\mid', 'mathord_1', 'textord', '{ }', '\\supsub', '-', 'textord', 'mathord_1', '-', 'textord', '<', 'textord', '\\{', ',', 'mathord_2', '=', '\\{', '-', 'textord', ',', 'textord', ',', 'textord', ',', 'textord', '\\}', ',', 'mathord_0', '\\cap', 'mathord_2', '=']


In [535]:
# AST-formula tokenizer on an item containing SIF markers. In the recorded
# output the formula figures appear as '[FORMULA]' and the figure, separator
# and blank as '[FIGURE]', '[SEP]' and '[MARK]'.
sif_text = "有公式$\\FormFigureID{1}$，如图$\\FigureID{088f15ea-xxx}$,若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$"
items = [sif_text]

tokenizer = get_tokenizer("ast_formula")
tokens = tokenizer(items)
print(next(tokens))

['公式', '[FORMULA]', '如图', '[FIGURE]', 'mathord_0', ',', 'mathord_1', '约束条件', '公式', '[FORMULA]', '[SEP]', 'mathord_2', '=', 'mathord_0', '+', 'textord', 'mathord_1', '最大值', '[MARK]']


# GensimWordTokenizer and GensimSegTokenizer

- GensimWordTokenizer is the standard basic Tokenizer for SIF items

- GensimSegTokenizer is the standard basic Tokenizer for SIF items

In [536]:
from EduNLP.Pretrain import GensimWordTokenizer, GensimSegTokenizer

## GensimWordTokenizer

In [537]:
item = "已知有公式$\\FormFigureID{1}$，如图$\\FigureID{088f15ea-xxx}$, 若$x,y$满足约束条件公式$\\FormFigureBase64{2}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$"

# Default mode: in the recorded output formula symbols are abstracted
# ('mathord' / 'textord').
default_tokenizer = GensimWordTokenizer(symbol="gmas")
print(default_tokenizer(item).tokens)
print()

# general=True: in the recorded output the literal symbols (x, y, z, 7) are kept.
tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
token_item = tokenizer(item)
print(token_item.tokens)

['已知', '公式', \FormFigureID{1}, '如图', '[FIGURE]', 'mathord', ',', 'mathord', '约束条件', '公式', [FORMULA], '[SEP]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '最大值', '[MARK]']

['已知', '公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']


## GensimSegTokenizer

In [538]:
item = "已知有公式$\\FormFigureID{1}$，如图$\\FigureID{088f15ea-xxx}$, 若$x,y$满足约束条件公式$\\FormFigureBase64{2}$，$\\SIFSep$则$z=x+7 y$的最大值为$\\SIFBlank$"

# Default mode: the result is a sequence of segments, each a list of tokens.
segments = GensimSegTokenizer(symbol="gmas")(item)
print(len(segments), segments)
print()

# flatten=True: materialise the result as one flat list of tokens.
flat_tokens = list(GensimSegTokenizer(symbol="gmas", flatten=True)(item))
print(len(flat_tokens), flat_tokens)
print()

# depth=2: segment at Tag and Sep; the recorded output has fewer, larger
# segments with '[TEXT_BEGIN]' / '[FORMULA_BEGIN]' markers inside them.
coarse_segments = GensimSegTokenizer(symbol="gmas", depth=2)(item)
print(len(coarse_segments), coarse_segments)
print()

# depth=2 with add_seg_mode="delimiter": the recorded output additionally
# carries closing '[TEXT_END]' / '[FORMULA_END]' markers.
tokenizer = GensimSegTokenizer(symbol="gmas", depth=2, add_seg_mode="delimiter")
token_item = tokenizer(item)
print(len(token_item), token_item)

10 [['已知', '公式'], [\FormFigureID{1}], ['如图'], ['[FIGURE]'], ['mathord', ',', 'mathord'], ['约束条件', '公式'], [[FORMULA]], ['mathord', '=', 'mathord', '+', 'textord', 'mathord'], ['最大值'], ['[MARK]']]

19 ['已知', '公式', \FormFigureID{1}, '如图', '[FIGURE]', 'mathord', ',', 'mathord', '约束条件', '公式', [FORMULA], 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '最大值', '[MARK]']

5 [['[TEXT_BEGIN]', '已知', '公式', '[FORMULA_BEGIN]', \FormFigureID{1}, '[TEXT_BEGIN]', '如图', '[FIGURE]', '[TEXT_BEGIN]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[TEXT_BEGIN]', '约束条件', '公式', '[FORMULA_BEGIN]', [FORMULA], '[TEXT_BEGIN]', '[SEP]'], ['[TEXT_BEGIN]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord'], ['[TEXT_BEGIN]', '最大值'], ['[MARK]']]

5 [['[TEXT_BEGIN]', '已知', '公式', '[TEXT_END]', '[FORMULA_BEGIN]', \FormFigureID{1}, '[FORMULA_END]', '[TEXT_BEGIN]', '如图', '[TEXT_END]', '[FIGURE]', '[TEXT_BEGIN]', '[TEXT_END]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[FORMULA_END]', '[TEX