import pdb
import sys

WORD_POS = 1
TAG_POS = 2
MASK_TAG = "__entity__"
INPUT_MASK_TAG = ":__entity__"
RESET_POS_TAG = 'RESET'
noun_tags = ['NFP','JJ','NN','FW','NNS','NNPS','JJS','JJR','NNP','POS','CD']
cap_tags = ['NFP','JJ','NN','FW','NNS','NNPS','JJS','JJR','NNP','PRP']
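
# Each entry in terms_arr is a 5-element list; only two slots are used here:
# index WORD_POS (1) holds the word and index TAG_POS (2) holds its POS tag
# (see set_POS_based_on_entities below, which builds rows as ['-']*5).
# span_arr marks, per input term, whether it lies inside a masked noun span (1) or not (0).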
def detect_masked_positions(terms_arr):
    sentence_arr, span_arr = generate_masked_sentences(terms_arr)
    new_sent_arr = []
    for i in range(len(terms_arr)):
        new_sent_arr.append(terms_arr[i][WORD_POS])
    return new_sent_arr, sentence_arr, span_arr

def generate_masked_sentences(terms_arr):
    size = len(terms_arr)
    sentence_arr = []
    span_arr = []
    i = 0
    hack_for_no_nouns_case(terms_arr)
    while (i < size):
        term_info = terms_arr[i]
        if (term_info[TAG_POS] in noun_tags):
            skip = gen_sentence(sentence_arr, terms_arr, i)
            i += skip
            for j in range(skip):
                span_arr.append(1)
        else:
            i += 1
            span_arr.append(0)
    #print(sentence_arr)
    return sentence_arr, span_arr

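# For example, given POS-tagged input "John works at Google Inc" (with John,
# Google and Inc tagged as NNP and the remaining terms as non-nouns),
# generate_masked_sentences would produce:
#   sentence_arr = [['__entity__', 'works', 'at', 'Google', 'Inc'],
#                   ['John', 'works', 'at', '__entity__']]
#   span_arr     = [1, 0, 0, 1, 1]
# i.e. one masked sentence per maximal run of noun-tagged terms.
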
def hack_for_no_nouns_case(terms_arr):
    '''
    Hack for the case where the user enters a sentence with no specifically
    tagged entity and the sentence contains no nouns at all.
    This happens for odd inputs such as a single word like "eg".
    Just treat the first term as a noun so processing can proceed.
    '''
    size = len(terms_arr)
    i = 0
    found = False
    while (i < size):
        term_info = terms_arr[i]
        if (term_info[TAG_POS] in noun_tags):
            found = True
            break
        else:
            i += 1
    if (not found and len(terms_arr) >= 1):
        term_info = terms_arr[0]
        term_info[TAG_POS] = noun_tags[0]

def gen_sentence(sentence_arr, terms_arr, index):
    size = len(terms_arr)
    new_sent = []
    for term in terms_arr[:index]:
        new_sent.append(term[WORD_POS])
    i = index
    skip = 0
    while (i < size):
        if (terms_arr[i][TAG_POS] in noun_tags):
            skip += 1
            i += 1
        else:
            break
    new_sent.append(MASK_TAG)
    i = index + skip
    while (i < size):
        new_sent.append(terms_arr[i][WORD_POS])
        i += 1
    assert(skip != 0)
    sentence_arr.append(new_sent)
    return skip

def capitalize(terms_arr):
    for i, term_tag in enumerate(terms_arr):
        #print(term_tag)
        if (term_tag[TAG_POS] in cap_tags):
            word = term_tag[WORD_POS][0].upper() + term_tag[WORD_POS][1:]
            term_tag[WORD_POS] = word
    #print(terms_arr)

def set_POS_based_on_entities(sent):
    terms_arr = []
    sent_arr = sent.split()
    for i, word in enumerate(sent_arr):
        term_tag = ['-']*5
        if (word.endswith(INPUT_MASK_TAG)):
            term_tag[TAG_POS] = noun_tags[0]
            term_tag[WORD_POS] = word.replace(INPUT_MASK_TAG, "")
        else:
            term_tag[TAG_POS] = RESET_POS_TAG
            term_tag[WORD_POS] = word
        terms_arr.append(term_tag)
    return terms_arr
    #print(terms_arr)

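# Example: set_POS_based_on_entities("Paris:__entity__ is a city") returns
#   [['-', 'Paris', 'NFP', '-', '-'],
#    ['-', 'is', 'RESET', '-', '-'],
#    ['-', 'a', 'RESET', '-', '-'],
#    ['-', 'city', 'RESET', '-', '-']]
# so only the user-marked word is treated as a noun entity downstream.
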
def filter_common_noun_spans(span_arr, masked_sent_arr, terms_arr, common_descs):
    ret_span_arr = span_arr.copy()
    ret_masked_sent_arr = []
    sent_index = 0
    loop_span_index = 0
    while (loop_span_index < len(span_arr)):
        span_val = span_arr[loop_span_index]
        orig_index = loop_span_index
        if (span_val == 1):
            curr_index = orig_index
            is_all_common = True
            while (curr_index < len(span_arr) and span_arr[curr_index] == 1):
                term = terms_arr[curr_index]
                if (term[WORD_POS].lower() not in common_descs):
                    is_all_common = False
                curr_index += 1
            loop_span_index = curr_index  # note the loop scan index is updated
            if (is_all_common):
                curr_index = orig_index
                print("Filtering common span: ", end='')
                while (curr_index < len(span_arr) and span_arr[curr_index] == 1):
                    print(terms_arr[curr_index][WORD_POS], ' ', end='')
                    ret_span_arr[curr_index] = 0
                    curr_index += 1
                print()
                sent_index += 1  # we are skipping a span
            else:
                ret_masked_sent_arr.append(masked_sent_arr[sent_index])
                sent_index += 1
        else:
            loop_span_index += 1
    return ret_masked_sent_arr, ret_span_arr

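# common_descs is expected to be a container of lowercase generic noun
# descriptions (e.g. {"city", "person"}); any masked span made up entirely of
# such words is un-masked here and its masked sentence dropped.
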
def normalize_casing(sent):
    sent_arr = sent.split()
    ret_sent_arr = []
    for i, word in enumerate(sent_arr):
        if (len(word) > 1):
            norm_word = word[0] + word[1:].lower()
        else:
            norm_word = word[0]
        ret_sent_arr.append(norm_word)
    return ' '.join(ret_sent_arr)
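

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the original module):
    # mark an entity with ":__entity__", build the POS rows, then mask it.
    sample = "Einstein:__entity__ developed the theory of relativity"
    terms = set_POS_based_on_entities(sample)
    words, masked_sentences, spans = detect_masked_positions(terms)
    print(words)             # ['Einstein', 'developed', 'the', 'theory', 'of', 'relativity']
    print(masked_sentences)  # [['__entity__', 'developed', 'the', 'theory', 'of', 'relativity']]
    print(spans)             # [1, 0, 0, 0, 0, 0]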