This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
/
bias_utils.py
111 lines (96 loc) · 3.71 KB
/
bias_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import torch
import json
from os import PathLike
from typing import List, Tuple, Union, Optional
from allennlp.common.file_utils import cached_path
from allennlp.data import Vocabulary
from allennlp.data.tokenizers.tokenizer import Tokenizer
def _convert_word_to_ids_tensor(word, tokenizer, vocab, namespace, all_cases):
# function does NOT strip special tokens if tokenizer adds them
if all_cases:
words_list = [word.lower(), word.title(), word.upper()]
else:
words_list = [word]
ids = []
for w in words_list:
# if vocab is None, use tokenizer vocab (only works for Huggingface PreTrainedTokenizer)
if vocab:
tokens = tokenizer.tokenize(w)
ids.append(torch.tensor([vocab.get_token_index(t.text, namespace) for t in tokens]))
else:
ids.append(torch.tensor(tokenizer.tokenizer(w)["input_ids"]))
return ids
def load_words(
    fname: Union[str, PathLike],
    tokenizer: Tokenizer,
    vocab: Optional[Vocabulary] = None,
    namespace: str = "tokens",
    all_cases: bool = True,
) -> List[torch.Tensor]:
    """
    Load a JSON list of words from a file, tokenize each word into
    subword tokens, and convert the tokens into ID tensors.

    # Parameters

    fname : `Union[str, PathLike]`
        Name of file containing list of words to load.
    tokenizer : `Tokenizer`
        Tokenizer to tokenize words in file.
    vocab : `Vocabulary`, optional (default=`None`)
        Vocabulary of tokenizer. If `None`, assumes tokenizer is of
        type `PreTrainedTokenizer` and uses tokenizer's `vocab` attribute.
    namespace : `str`
        Namespace of vocab to use when tokenizing.
    all_cases : `bool`, optional (default=`True`)
        Whether to tokenize lower, title, and upper cases of each word.

    # Returns

    word_ids : `List[torch.Tensor]`
        List of tensors containing the IDs of subword tokens for
        each word in the file.
    """
    with open(cached_path(fname)) as f:
        words = json.load(f)
    # One entry per word (times three casing variants when all_cases is True).
    return [
        ids
        for word in words
        for ids in _convert_word_to_ids_tensor(word, tokenizer, vocab, namespace, all_cases)
    ]
def load_word_pairs(
    fname: Union[str, PathLike],
    tokenizer: Tokenizer,
    vocab: Optional[Vocabulary] = None,
    namespace: str = "tokens",
    all_cases: bool = True,
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    """
    Load a JSON list of word pairs from a file, tokenize each word into
    subword tokens, and convert the tokens into ID tensors.

    # Parameters

    fname : `Union[str, PathLike]`
        Name of file containing list of pairs of words to load.
    tokenizer : `Tokenizer`
        Tokenizer to tokenize words in file.
    vocab : `Vocabulary`, optional (default=`None`)
        Vocabulary of tokenizer. If `None`, assumes tokenizer is of
        type `PreTrainedTokenizer` and uses tokenizer's `vocab` attribute.
    namespace : `str`
        Namespace of vocab to use when tokenizing. Defaults to `"tokens"`,
        matching `load_words` and AllenNLP's standard token namespace.
        (Previously defaulted to `"token"`, which silently looked tokens up
        in a wrong/empty namespace.)
    all_cases : `bool`, optional (default=`True`)
        Whether to tokenize lower, title, and upper cases of each word.

    # Returns

    word_ids : `Tuple[List[torch.Tensor], List[torch.Tensor]]`
        Pair of lists of tensors containing the IDs of subword tokens for
        words in the file. The i-th entries of the two lists correspond to
        the two members of the i-th pair (per casing variant).
    """
    word_ids1: List[torch.Tensor] = []
    word_ids2: List[torch.Tensor] = []
    with open(cached_path(fname)) as f:
        words = json.load(f)
    for w1, w2 in words:
        word_ids1.extend(
            _convert_word_to_ids_tensor(w1, tokenizer, vocab, namespace, all_cases)
        )
        word_ids2.extend(
            _convert_word_to_ids_tensor(w2, tokenizer, vocab, namespace, all_cases)
        )
    return word_ids1, word_ids2