Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
0491e8d
[fix] wrong index in edges
tswsxk May 29, 2021
383b369
[feat] automatically katex updating
tswsxk May 30, 2021
709f915
[feat] switch watex to katex
tswsxk May 30, 2021
a685368
[fix] some potential errors
tswsxk May 30, 2021
c80cb4f
[feat] enable lazy initialization
tswsxk May 30, 2021
2c754ad
[perf] accelerate tokenization
tswsxk May 30, 2021
f038db4
[feat] enable traversal strategy selection
tswsxk May 30, 2021
3ffa250
[docs] remove an old example
tswsxk May 30, 2021
2738065
[feat] add error handling
tswsxk May 30, 2021
052a03f
[feat] rename error to errors
tswsxk May 30, 2021
727ed2f
[feat] add log module
tswsxk May 30, 2021
5bfcede
[feat] export global logger
tswsxk May 30, 2021
39387eb
[feat] export global logger
tswsxk May 30, 2021
c020483
[feat] enable tagged string where \SIFTag is used
tswsxk May 30, 2021
6063f40
[feat] SIFAnn -> SIFTag
tswsxk May 30, 2021
668da6b
[feat] SIFAnn -> SIFTag
tswsxk May 30, 2021
f7da510
[docs] example for word to vector
tswsxk May 30, 2021
41cc4c7
[docs] update example
tswsxk May 30, 2021
9ba8416
[test] ignore scripts directory
tswsxk May 30, 2021
51bd85d
[chore] add dependencies
tswsxk May 30, 2021
95a9bda
[feat] add special tokens
tswsxk May 30, 2021
e5b34bf
[feat] add tools for training and using word to vector
tswsxk May 30, 2021
20f11bf
[feat] add TagSegment
tswsxk May 30, 2021
ce9a2cc
[feat] Annotation -> Tag
tswsxk May 30, 2021
ca29156
[feat] handle TagSegment
tswsxk May 30, 2021
89dab57
[feat] add annotations
tswsxk May 30, 2021
63f2e3c
[test] add tests for training and loading w2v
tswsxk May 30, 2021
1c8fd85
[fix] change test case
tswsxk May 30, 2021
74eca2f
[chore] expand dev and test dep
tswsxk May 30, 2021
c4a9b2c
[feat] add \SIFSep
tswsxk May 31, 2021
7d017f2
[test] fix test cases
tswsxk May 31, 2021
b51a4e8
[chore] change dependencies
tswsxk May 31, 2021
37335ed
[feat] add tag symbol
tswsxk May 31, 2021
58f4397
[docs] examples for item to vector
tswsxk May 31, 2021
a637806
[docs] add examples
tswsxk May 31, 2021
5d59aa9
[fix] escape \s warning
tswsxk May 31, 2021
4dfa616
[test] exception
tswsxk May 31, 2021
4653dcd
[feat] enable dict type handling
tswsxk May 31, 2021
e028e00
[feat] add D2V model
tswsxk May 31, 2021
54d80e7
[feat] add D2V model
tswsxk May 31, 2021
6b74d7d
[feat] add link_vars parameter which can be used to accelerate the pa…
tswsxk May 31, 2021
3eb6f5b
[fix] invalid SEP symbol
tswsxk May 31, 2021
ba2b787
[feat] add GensimSegTokenizer (not test yet)
tswsxk May 31, 2021
1badfa0
[feat] add GensimSegTokenizer
tswsxk May 31, 2021
2cfaf0d
[feat] enable not linking variable when var_numbering is not required
tswsxk May 31, 2021
f183c6a
[fix] ignore some unnecessary features
tswsxk May 31, 2021
381517c
[feat] ignore some unnecessary features
tswsxk May 31, 2021
7e598af
[docs] update examples
tswsxk May 31, 2021
b49f631
[docs] record changes
tswsxk May 31, 2021
c7debb5
[feat] ignore some unnecessary features
tswsxk May 31, 2021
6afc5af
[feat] ignore some developing functions
tswsxk May 31, 2021
a6bda3f
[test] coverage
tswsxk May 31, 2021
5924f2b
[test] add more test cases
tswsxk May 31, 2021
25f9b66
[feat] add seg_token example
tswsxk May 31, 2021
cba7160
[refactor] move location
tswsxk May 31, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGE.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
v0.0.3:
1. update formula AST: support more symbols and functions defined in KaTeX
2. add item to vector tools, including word2vec and doc2vec using gensim
3. sci4sif supports tokenization grouped by segments
4. add special tokens: \SIFTag and \SIFSep

v0.0.2:
1. fix potential ModuleNotFoundError

Expand Down
18 changes: 10 additions & 8 deletions EduNLP/Formula/Formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class Formula(object):
"""

def __init__(self, formula: (str, List[Dict]), variable_standardization=False, const_mathord=None,
*args, **kwargs):
init=True, *args, **kwargs):
"""

Parameters
Expand All @@ -41,16 +41,18 @@ def __init__(self, formula: (str, List[Dict]), variable_standardization=False, c
latex formula string or the parsed abstracted syntax tree
variable_standardization
const_mathord
init
args
kwargs
"""
self._formula = formula
self._ast = None
self.reset_ast(
formula_ensure_str=False,
variable_standardization=variable_standardization,
const_mathord=const_mathord, *args, **kwargs
)
if init is True:
self.reset_ast(
formula_ensure_str=False,
variable_standardization=variable_standardization,
const_mathord=const_mathord, *args, **kwargs
)

def variable_standardization(self, inplace=False, const_mathord=None, variable_connect_dict=None):
const_mathord = const_mathord if const_mathord is not None else CONST_MATHORD
Expand Down Expand Up @@ -217,13 +219,13 @@ def ast_graph(self) -> (nx.Graph, nx.DiGraph):
return tree


def link_formulas(*formula: Formula, **kwargs):
def link_formulas(*formula: Formula, link_vars=True, **kwargs):
forest = []
for form in formula:
forest += form.reset_ast(
forest_begin=len(forest),
**kwargs
)
variable_connect_dict = link_variable(forest)
variable_connect_dict = link_variable(forest) if link_vars else None
for form in formula:
form.variable_standardization(inplace=True, variable_connect_dict=variable_connect_dict, **kwargs)
38 changes: 31 additions & 7 deletions EduNLP/Formula/ast/ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# 2021/5/20 @ tongshiwei
from typing import List, Dict

from .watex import watex
from .katex import katex

__all__ = ["str2ast", "get_edges", "ast", "link_variable"]

Expand Down Expand Up @@ -33,10 +33,11 @@ def ast(formula: (str, List[Dict]), index=0, forest_begin=0, father_tree=None, i
tree: List[Dict]
the feature tree produced by re-parsing the formula

todo: finish all types
"""
tree = []
index += forest_begin
json_ast: List[Dict] = watex.katex.__parse(formula).to_list() if is_str else formula
json_ast: List[Dict] = katex.katex.__parse(formula).to_list() if is_str else formula
last_node = None

for item in json_ast:
Expand Down Expand Up @@ -127,7 +128,8 @@ def ast(formula: (str, List[Dict]), index=0, forest_begin=0, father_tree=None, i
tree += ast([item['base']], index=len(tree) + index, father_tree=tree)

elif tree_node['val']['type'] == "supsub":
item['base']['role'] = 'base'
if item['base'] is not None:
item['base']['role'] = 'base'

if 'sup' in item and item['sup']:
bp = 'sup'
Expand All @@ -143,7 +145,12 @@ def ast(formula: (str, List[Dict]), index=0, forest_begin=0, father_tree=None, i
item[bp]['role'] = bp
tree_node['structure']['child'] = [1 + private_index + index]
tree.append(tree_node)
tree += ast([item['base'], item[bp]], index=len(tree) + index, father_tree=tree)
_tree = []
if item['base'] is not None:
_tree.append(item['base'])
if item[bp] is not None:
_tree.append(item[bp])
tree += ast(_tree, index=len(tree) + index, father_tree=tree)

elif tree_node['val']['type'] == "ordgroup":
tree_node['structure']['child'] = [1 + private_index + index]
Expand All @@ -167,20 +174,37 @@ def ast(formula: (str, List[Dict]), index=0, forest_begin=0, father_tree=None, i
citem['role'] = 'body'
tree += ast(item['body'], index=len(tree) + index, father_tree=tree)

elif tree_node['val']['type'] in {"kern"}:
# \quad
tree_node['val']['text'] = tree_node['val']['type']
tree_node['val']['type'] = "ignore"
tree.append(tree_node)

elif tree_node['val']['type'] == "text":
# \text{}
tree_node['val']['text'] = "".join([e['text'] for e in item["body"]])
tree.append(tree_node)

else:
tree_node['structure']['child'] = [1 + private_index + index]

if "text" in item:
tree_node['val']['text'] = item["text"]
else:
tree_node['val']['text'] = item["type"]
tree_node['val']['type'] = "other"
tree.append(tree_node)
Role = ['body', 'base', 'sup', 'sub', 'numer', 'denom', 'index', 'blew', 'other']
childrole = []

for role_item in Role:
if role_item in item:
item[role_item]['role'] = role_item
childrole.append(item[role_item])
if role_item == "body" and isinstance(item[role_item], dict) is False:
# \text{}
childrole.extend(item[role_item])
else:
item[role_item]['role'] = role_item
childrole.append(item[role_item])
tree += ast(childrole, index=len(tree) + index, father_tree=tree)
if item:
if item != json_ast[0]:
Expand Down Expand Up @@ -247,7 +271,7 @@ def get_edges(forest):
"""
edges = []
for node in forest:
index = forest.index(node)
index = node["val"]["id"]
edges.append((index, index, 1))
if node['structure']['bro'][1] is not None:
edges.append((index, node['structure']['bro'][1], 2))
Expand Down
Loading