Merged · Changes from all commits (29 commits)
2 changes: 2 additions & 0 deletions AUTHORS.md
@@ -16,5 +16,7 @@

[Meikai Bao](https://github.com/BAOOOOOM)

[Yuting Ning](https://github.com/nnnyt)


The starred contributors are the corresponding authors.
53 changes: 25 additions & 28 deletions EduNLP/Formula/Formula.py
@@ -15,6 +15,17 @@

class Formula(object):
"""

Parameters
----------
formula: str or List[Dict]
LaTeX formula string or the parsed abstract syntax tree
variable_standardization
const_mathord
init
args
kwargs

Examples
--------
>>> f = Formula("x")
@@ -34,21 +45,8 @@ class Formula(object):
--------
the parsed abstract syntax tree
"""

def __init__(self, formula: (str, List[Dict]), variable_standardization=False, const_mathord=None,
init=True, *args, **kwargs):
"""

Parameters
----------
formula: str or List[Dict]
LaTeX formula string or the parsed abstract syntax tree
variable_standardization
const_mathord
init
args
kwargs
"""
self._formula = formula
self._ast = None
if init is True:
@@ -131,6 +129,15 @@ def resetable(self):

class FormulaGroup(object):
"""

Parameters
----------
formula_list: list, List[str], or List[Formula]
a list of LaTeX formula strings, parsed abstract syntax trees, or Formula objects
variable_standardization
const_mathord
detach

Examples
---------
>>> fg = FormulaGroup(["x + y", "y + x", "z + x"])
@@ -141,8 +148,9 @@ class FormulaGroup(object):
<FormulaGroup: <Formula: x + y>;<Formula: y + x>;<Formula: z + x>>
>>> fg = FormulaGroup(["x", Formula("y"), "x"])
>>> fg.elements
[{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, {'id': 1, 'type': 'mathord', 'text': 'y', 'role': None},\
{'id': 2, 'type': 'mathord', 'text': 'x', 'role': None}]
[{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None}, \
{'id': 1, 'type': 'mathord', 'text': 'y', 'role': None}, \
{'id': 2, 'type': 'mathord', 'text': 'x', 'role': None}]
>>> fg = FormulaGroup(["x", Formula("y"), "x"], variable_standardization=True)
>>> fg.elements
[{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}, \
@@ -153,24 +161,12 @@
--------
the parsed abstract syntax forest
"""

def __init__(self,
formula_list: (list, List[str], List[Formula]),
variable_standardization=False,
const_mathord=None,
detach=True
):
"""

Parameters
----------
formula: str or List[Dict] or List[Formula]
LaTeX formula string, the parsed abstract syntax tree, or a group of parsed abstract syntax trees
variable_standardization
const_mathord
detach

"""
forest = []
self._formulas = []
for formula in formula_list:
@@ -261,7 +257,8 @@ def link_formulas(*formula: Formula, link_vars=True, **kwargs):

Parameters
----------
formula: the parsed abstract syntax tree
formula
the parsed abstract syntax tree
link_vars
kwargs
"""
78 changes: 44 additions & 34 deletions EduNLP/I2V/i2v.py
@@ -16,7 +16,7 @@ class I2V(object):
If you want to get a vector from an item, use a concrete model such as D2V or W2V.

Parameters
----------
-----------
tokenizer: str
the tokenizer name
t2v: str
@@ -26,8 +26,11 @@ class I2V(object):
tokenizer_kwargs: dict
the parameters passed to tokenizer
pretrained_t2v: bool

True: use pretrained t2v model

False: use your own t2v model

kwargs:
the parameters passed to t2v

@@ -39,33 +42,13 @@
>>> model_path = "examples/test_model/test_gensim_luna_stem_tf_d2v_256.bin" # doctest: +ELLIPSIS
>>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) # doctest: +ELLIPSIS
>>> i2v(item) # doctest: +ELLIPSIS
([array([ ...dtype=float32)], None)
([array([...dtype=float32)], None)

Returns
-------
i2v model: I2V
"""

def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrained_t2v=False, **kwargs):
"""

Parameters
----------
tokenizer: str
the tokenizer name
t2v: str
the name of token2vector model
args:
the parameters passed to t2v
tokenizer_kwargs: dict
the parameters passed to tokenizer
pretrained_t2v: bool
True: use pretrained t2v model
False: use your own t2v model
kwargs:
the parameters passed to t2v

"""
self.tokenizer: Tokenizer = get_tokenizer(tokenizer, **tokenizer_kwargs if tokenizer_kwargs is not None else {})
if pretrained_t2v:
logger.info("Use pretrained t2v model %s" % t2v)
@@ -125,8 +108,11 @@ def vector_size(self):

class D2V(I2V):
"""

Bases: I2V

Parameters
----------
-----------
tokenizer: str
the tokenizer name
t2v: str
@@ -142,7 +128,7 @@ class D2V(I2V):
the parameters passed to t2v

Examples
--------
---------
>>> item = {"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, \
... 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,\
... 此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$"}
@@ -160,7 +146,7 @@ def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=
'''

Parameters
----------
-----------
items: str
the text of the question
tokenize: bool
@@ -175,7 +161,7 @@ def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=
the parameters passed to t2v

Returns
-------
--------
vector:list
'''
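A hypothetical sketch of the `key` parameter (editorial addition): the field name "stem" is invented for illustration only, and `i2v` is the D2V instance constructed in the class doctest above.

# Hypothetical: when an item stores its text under a field, `key` tells the
# tokenizer which part to process. The field name is assumed, not from the diff.
vec = i2v.infer_vector({"stem": "如图,在三角形 $ABC$ 中 $\\SIFChoice$"}, key=lambda x: x["stem"])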
tokens = self.tokenize(items, return_token=True, key=key) if tokenize is True else items
@@ -189,8 +175,11 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):

class W2V(I2V):
"""

Bases: I2V

Parameters
----------
-----------
tokenizer: str
the tokenizer name
t2v: str
@@ -206,19 +195,40 @@ class W2V(I2V):
the parameters passed to t2v

Examples
--------
---------
>>> i2v = get_pretrained_i2v("test_w2v", "examples/test_model/data/w2v")
>>> item_vector, token_vector = i2v(["有学者认为:‘学习’,必须适应实际"])
>>> item_vector
array([[...]], dtype=float32)
>>> item_vector # doctest: +ELLIPSIS
[array([...], dtype=float32)]

Returns
-------
--------
i2v model: W2V

"""
def infer_vector(self, items, tokenize=True, indexing=False, padding=False, key=lambda x: x, *args,
**kwargs) -> tuple:
'''

Parameters
-----------
items: str
the text of the question
tokenize: bool
True: tokenize the item
indexing: bool
padding: bool
key: lambda function
the parameter passed to the tokenizer; selects the text to be processed
args:
the parameters passed to t2v
kwargs:
the parameters passed to t2v

Returns
--------
vector:list
'''
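A hedged sketch of the two-part return (editorial addition): the return statement in the method body below yields item-level vectors from self.t2v(tokens) and per-token vectors from self.t2v.infer_tokens(tokens); the input mirrors the class doctest above, and `i2v` is the pretrained W2V model loaded there.

# Sketch: unpack item-level and token-level vectors, as the return below shows.
item_vecs, token_vecs = i2v.infer_vector(["有学者认为:‘学习’,必须适应实际"], tokenize=True)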
tokens = self.tokenize(items, return_token=True) if tokenize is True else items
tokens = [token for token in tokens]
return self.t2v(tokens, *args, **kwargs), self.t2v.infer_tokens(tokens, *args, **kwargs)
@@ -244,18 +254,18 @@ def get_pretrained_i2v(name, model_dir=MODEL_DIR):
"""

Parameters
----------
-----------
name: str
the name of the item2vector model
model_dir: str
the path of the model, default: MODEL_DIR = '~/.EduNLP/model'

Returns
-------
--------
i2v model: I2V

Examples
--------
---------
>>> item = {"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, \
... 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,\
... 此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$"}
30 changes: 14 additions & 16 deletions EduNLP/ModelZoo/rnn/rnn.py
@@ -9,6 +9,20 @@

class LM(nn.Module):
"""

Parameters
----------
rnn_type: str
Legal types include RNN, LSTM, GRU, and ELMO
vocab_size: int
embedding_dim: int
hidden_size: int
num_layers
bidirectional
embedding
model_params
kwargs

Examples
--------
>>> import torch
@@ -30,22 +44,6 @@ class LM(nn.Module):

def __init__(self, rnn_type: str, vocab_size: int, embedding_dim: int, hidden_size: int, num_layers=1,
bidirectional=False, embedding=None, model_params=None, **kwargs):
"""

Parameters
----------
rnn_type: str
Legal types include RNN, LSTM, GRU, and ELMO
vocab_size: int
embedding_dim: int
hidden_size: int
num_layers
bidirectional
embedding
model_params
kwargs

"""
super(LM, self).__init__()
rnn_type = rnn_type.upper()
self.embedding = torch.nn.Embedding(vocab_size, embedding_dim) if embedding is None else embedding
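A hedged construction sketch for LM (editorial addition): the arguments mirror the signature above; a forward pass is omitted because its signature is not shown in this hunk, and the small sizes are illustrative placeholders, not recommended settings.

# Construction only -- sizes are placeholders.
from EduNLP.ModelZoo.rnn.rnn import LM
lm = LM("LSTM", vocab_size=100, embedding_dim=16, hidden_size=32, num_layers=1, bidirectional=False)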
33 changes: 11 additions & 22 deletions EduNLP/ModelZoo/utils/masker.py
@@ -7,6 +7,13 @@

class Masker(object):
"""

Parameters
----------
mask: int, str
per
seed

Examples
-------
>>> masker = Masker(per=0.5, seed=10)
@@ -29,35 +36,17 @@ class Masker(object):
[['a', '[MASK]', 'c'], ['d', '[PAD]', '[PAD]'], ['hello', '[MASK]', '[PAD]']]
>>> mask_label
[[0, 1, 0], [0, 0, 0], [0, 1, 0]]
"""

Returns
----------
tuple: the list of masked sequences and the list of mask labels
"""
def __init__(self, mask: (int, str, ...) = 0, per=0.2, seed=None):
"""

Parameters
----------
mask: int, str
per
seed
"""
self.seed = np.random.default_rng(seed)
self.per = per
self.mask = mask

def __call__(self, seqs, length=None, *args, **kwargs) -> tuple:
"""

Parameters
----------
seqs:list
length
args
kwargs

Returns
----------
tuple: the list of masked sequences and the list of mask labels
"""
seqs = deepcopy(seqs)
masked_list = []
if length is None: