-
Notifications
You must be signed in to change notification settings - Fork 5
/
feature_generator.py
150 lines (123 loc) · 5.42 KB
/
feature_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import copy
from logger import Logger
# generates features for tokens in the dataset
class FeatureGenerator():
    """Builds per-token feature dicts and label sequences for CRF-style
    sequence tagging.

    A document is a list of lines; a line is a list of token tuples shaped
    (word, pos_tag, nonlocal_ner_tag, label).  Features combine lexical
    cues, POS context, corpus frequency and word-embedding dimensions.
    """

    def __init__(self, w2v_model, word2count, word2idx):
        self.__logger = Logger()
        self.__logger.println("feature generator created")
        # Word-embedding model, looked up by lowercased token
        # (gensim-style: supports __getitem__ and .similarity).
        self.__we_model = w2v_model
        # Lowercased token -> corpus frequency.
        self.word2count = word2count
        self.word2idx = word2idx

    def generate_features_docs(self, data):
        """Return one feature structure per document: a list of lines, each
        a list of per-token feature dicts.  ``data`` is deep-copied so the
        caller's structure is never mutated."""
        feature_data = copy.deepcopy(data)
        X = [self.doc2features(doc_idx, d) for doc_idx, d in enumerate(feature_data)]
        self.__logger.println("generated features")
        return X

    def generate_true_outcome(self, data):
        """Return the gold labels for ``data`` with the same nesting
        (document -> line -> label)."""
        true_outcome_data = copy.deepcopy(data)
        y = [self.doc2labels(d) for d in true_outcome_data]
        return y

    def word2features(self, line, token_idx, line_idx, doc_idx, doc_size):
        """Build the feature dict for ``line[token_idx]``.

        doc_size is the index of the LAST line in the document (callers pass
        len(doc) - 1), so ``line_idx == doc_size`` marks end-of-document.

        BUGFIX: the neighbour branches previously reused the current-token
        keys ('pos[-3:]', 'pos[-2:]', 'word.freq', 'nonlocalner'), silently
        overwriting the current token's features with the neighbour's
        values; they now use distinct '-1'/'+1' keys, matching every other
        neighbour feature in this method.
        """
        word = line[token_idx][0]
        postag = line[token_idx][1]
        nonlocalnertag = line[token_idx][2]
        features = dict()
        features["bias"] = 1.0
        features["word"] = word.lower()
        features['word[-3:]'] = word[-3:].lower()
        features['word[-2:]'] = word[-2:].lower()
        features['word.isupper'] = word.isupper()
        features['word.istitle'] = word.istitle()
        features['word.isdigit'] = word.isdigit()
        features['word.freq'] = self.word_to_count(word)
        features['word.idx'] = float(token_idx)
        features['line.idx'] = float(line_idx)
        features['line.size'] = float(len(line))
        features['pos'] = postag
        features['pos[-3:]'] = postag[-3:]
        features['pos[-2:]'] = postag[-2:]
        features['nonlocalner'] = nonlocalnertag
        if token_idx > 0:
            # Features of the previous token ('-1' suffix).
            word1 = line[token_idx-1][0]
            postag1 = line[token_idx-1][1]
            nonlocalnertag1 = line[token_idx-1][2]
            features['word-1'] = word1.lower()
            features['pos-1'] = postag1
            features['posbigram-1'] = postag1 + postag
            features['pos-1[-3:]'] = postag1[-3:]
            features['pos-1[-2:]'] = postag1[-2:]
            features['bigram-1'] = word1.lower() + word.lower()
            features['word-1.isupper'] = word1.isupper()
            features['word-1.istitle'] = word1.istitle()
            features['word-1.isdigit'] = word1.isdigit()
            features['word-1[-3:]'] = word1[-3:]
            features['word-1[-2:]'] = word1[-2:]
            features['word-1.freq'] = self.word_to_count(word1)
            features['word-1.idx'] = float(token_idx-1)
            features['nonlocalner-1'] = nonlocalnertag1
        else:
            # First token of the line: mark beginning-of-line.
            features['bigram-1'] = "BOL" + word.lower()
            features['BOL'] = 1.0
        if token_idx < len(line)-1:
            # Features of the next token ('+1' suffix).
            word1 = line[token_idx+1][0]
            postag1 = line[token_idx+1][1]
            nonlocalnertag1 = line[token_idx+1][2]
            features['word+1'] = word1.lower()
            features['pos+1'] = postag1
            features['posbigram+1'] = postag + postag1
            features['pos+1[-3:]'] = postag1[-3:]
            features['pos+1[-2:]'] = postag1[-2:]
            features['bigram+1'] = word.lower() + word1.lower()
            features['word+1.isupper'] = word1.isupper()
            features['word+1.istitle'] = word1.istitle()
            features['word+1.isdigit'] = word1.isdigit()
            features['word+1[-3:]'] = word1[-3:]
            features['word+1[-2:]'] = word1[-2:]
            features['word+1.freq'] = self.word_to_count(word1)
            features['word+1.idx'] = float(token_idx+1)
            features['nonlocalner+1'] = nonlocalnertag1
        else:
            # Last token of the line: mark end-of-line.
            features['bigram+1'] = word.lower() + "EOL"
            features['EOL'] = 1.0
        features = self.add_we_vector(features, word)
        if line_idx == 0:
            features['BOD'] = 1.0
        if line_idx == doc_size:
            features['EOD'] = 1.0
        return features

    def sent2features(self, line, line_idx, doc_idx, doc_size):
        """Feature dicts for every token of one line."""
        return [self.word2features(line, token_idx, line_idx, doc_idx, doc_size) for token_idx in range(len(line))]

    def sent2tokens(self, sent):
        """Raw token strings of one line."""
        return [token for token, pos_tag, nonlocalne, label in sent]

    def doc2features(self, doc_idx, doc):
        """Feature dicts for every line of one document.  Passes
        len(doc) - 1 so word2features can flag the last line as EOD."""
        return [self.sent2features(doc[line_idx], line_idx, doc_idx, len(doc)-1) for line_idx in range(len(doc))]

    def sent2labels(self, sent):
        """Gold labels (4th tuple element) of one line."""
        return [label for token, pos_tag, nonlocalne, label in sent]

    def doc2labels(self, doc):
        """Gold labels for every line of one document."""
        return [self.sent2labels(sent) for sent in doc]

    def word_to_count(self, word):
        """Corpus frequency of ``word`` (case-insensitive) as a float, or
        the sentinel string "UNKNOWN_WORD_COUNT" for out-of-vocabulary
        words (the string itself acts as a categorical CRF feature)."""
        try:
            return float(self.word2count[word.lower()])
        except KeyError:
            return "UNKNOWN_WORD_COUNT"

    def add_we_vector(self, features, word):
        """Add one feature per embedding dimension ("we_dimen_<i>") for
        ``word``; out-of-vocabulary words get the single sentinel feature
        we_dimen_0 = "UNKNOWN_WE"."""
        try:
            for d_idx, dimension in enumerate(self.__we_model[word.lower()]):
                features["we_dimen_"+str(d_idx)] = dimension
        except KeyError:
            features["we_dimen_"+str(0)] = "UNKNOWN_WE"
        return features

    def similarity_between(self, word1, word2):
        """Embedding cosine similarity of two words (case-insensitive), or
        the sentinel "NO_SIMILARITY" when the model cannot compute it.
        Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        are no longer swallowed."""
        try:
            return self.__we_model.similarity(word1.lower(), word2.lower())
        except Exception:
            return "NO_SIMILARITY"