__author__ = 'NLP-PC'
from sklearn.base import BaseEstimator
import numpy as np
import nltk
from load_data import load_extend_anew
from statistics import mean
from nltk import word_tokenize
class anew_vectorizer(BaseEstimator):
    """Vectorizer producing six ANEW affect features per document.

    For each document, the valence and arousal ratings of the words that
    match the (stemmed) extended ANEW lexicon are aggregated into
    [max, avg, min] valence and [max, avg, min] arousal, then scaled to
    [0, 1] by dividing by the top of the rating scale (9).
    """

    def __init__(self):
        # load_extend_anew() is unpacked as (words, arousal, valence),
        # matching the order used by strength_vectorizer below.
        self.words, self.arousal, self.valence = load_extend_anew()
        self.stemmer = nltk.stem.SnowballStemmer('english')
        self.max = 9  # top of the ANEW rating scale; used to normalize features
        self.stemmed_dict = [self.stemmer.stem(w) for w in self.words]
        # Map each stem to its FIRST index in stemmed_dict; replaces the
        # O(n) list.index() scan previously done for every matched word.
        self._stem_index = {}
        for i, s in enumerate(self.stemmed_dict):
            self._stem_index.setdefault(s, i)

    def get_feature_names(self):
        """Return the list of feature names emitted by transform()."""
        return np.array(
            ['max_valence', 'avg_valence', 'min_valence',
             'max_arousal', 'avg_arousal', 'min_arousal']
        )

    # As we are not implementing a classifier, fitting is a no-op.
    def fit(self, documents, y=None):
        return self

    def _get_VA(self, d):
        """Compute the raw (unscaled) six-feature vector for one document."""
        print('Stemming, Still working...')
        stemmed_sent = [self.stemmer.stem(w) for w in word_tokenize(d)]
        valence_value = []
        arousal_value = []
        overlapping_words = set(stemmed_sent) & set(self.stemmed_dict)
        if overlapping_words:
            for word in overlapping_words:
                ind = self._stem_index[word]
                valence_value.append(self.valence[ind])
                arousal_value.append(self.arousal[ind])
            # BUG FIX: aggregate over the matched words' ratings
            # (valence_value / arousal_value), not over the whole lexicon
            # as the original code did — that made every matching document
            # produce identical features.
            max_valence = max(valence_value)
            avg_valence = mean(valence_value)
            min_valence = min(valence_value)
            max_arousal = max(arousal_value)
            avg_arousal = mean(arousal_value)
            min_arousal = min(arousal_value)
        else:
            # If nothing matches, fall back to the scale midpoint 4.5.
            default = 4.5
            max_valence = avg_valence = min_valence = default
            max_arousal = avg_arousal = min_arousal = default
        return np.array([max_valence, avg_valence, min_valence,
                         max_arousal, avg_arousal, min_arousal])

    # Returns an array of shape (len(documents), len(get_feature_names())):
    # one row of six normalized features per document.
    def transform(self, documents):
        features = np.array([self._get_VA(d) for d in documents])
        return features / self.max
class strength_vectorizer(BaseEstimator):
    """Vectorizer producing three ANEW valence features per document.

    For each document, the valence ratings of the words that match the
    (stemmed) extended ANEW lexicon are aggregated into
    [max, avg, min] valence, then scaled to [0, 1] by dividing by the
    top of the rating scale (9).
    """

    def __init__(self):
        # load_extend_anew() is unpacked as (words, arousal, valence);
        # the arousal column is unused here.
        self.words, _, self.valence = load_extend_anew()
        self.stemmer = nltk.stem.SnowballStemmer('english')
        self.max = 9  # top of the ANEW rating scale; used to normalize features
        self.stemmed_dict = [self.stemmer.stem(w) for w in self.words]
        # Map each stem to its FIRST index in stemmed_dict; replaces the
        # O(n) list.index() scan previously done for every matched word.
        self._stem_index = {}
        for i, s in enumerate(self.stemmed_dict):
            self._stem_index.setdefault(s, i)

    def get_feature_names(self):
        """Return the list of feature names emitted by transform()."""
        return np.array(
            ['max_valence', 'avg_valence', 'min_valence']
        )

    # As we are not implementing a classifier, fitting is a no-op.
    def fit(self, documents, y=None):
        return self

    def _get_VA(self, d):
        """Compute the raw (unscaled) three-feature vector for one document."""
        print('Stemming, Still working...')
        stemmed_sent = [self.stemmer.stem(w) for w in word_tokenize(d)]
        valence_value = []
        overlapping_words = set(stemmed_sent) & set(self.stemmed_dict)
        if overlapping_words:
            for word in overlapping_words:
                ind = self._stem_index[word]
                valence_value.append(self.valence[ind])
            # BUG FIX: aggregate over the matched words' ratings
            # (valence_value), not over the whole lexicon as the original
            # code did — that made every matching document produce
            # identical features.
            max_valence = max(valence_value)
            avg_valence = mean(valence_value)
            min_valence = min(valence_value)
        else:
            # If nothing matches, fall back to the scale midpoint 4.5.
            default = 4.5
            max_valence = avg_valence = min_valence = default
        return np.array([max_valence, avg_valence, min_valence])

    # Returns an array of shape (len(documents), len(get_feature_names())):
    # one row of three normalized features per document.
    def transform(self, documents):
        features = np.array([self._get_VA(d) for d in documents])
        return features / self.max