/
avg_word2vec.py
66 lines (50 loc) · 2 KB
/
avg_word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection (available since 0.18).
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
from load_data import load_train_data, load_processed_data
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from preprocess import preprocessor as preprocess

# The following skill is useful:
# train_test_split(np.array(texts), np.array(sentiment), test_size=0.2)

# Load the pre-processed train/test splits (tokenised texts + labels).
x_train, y_train = load_processed_data(data_type='train', stem=False)
x_test, y_test = load_processed_data(data_type='test', stem=False)

n_dim = 100      # dimensionality of the word vectors
scaling = False  # if True, standardize feature vectors with sklearn's scale()

# Initialize model and build vocab; words occurring fewer than 10 times
# in the training corpus are dropped.
# NOTE(review): gensim >= 4 renames `size` to `vector_size` -- this call
# targets the gensim 3.x API; confirm against the installed version.
imdb_w2v = Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)

# Train the model over the training reviews (this may take several minutes).
# BUG FIX: since gensim 1.0, train() requires total_examples and epochs to
# be passed explicitly; the bare train(x_train) call raises a ValueError.
imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count,
               epochs=imdb_w2v.epochs)

# Alternative: load a pre-trained embedding instead of training here.
# from load_data import load_word_embedding
# imdb_w2v = load_word_embedding()
def buildWordVector(text, size, model=None):
    """Average the vectors of all in-vocabulary words in *text*.

    Parameters
    ----------
    text : iterable of str
        Tokenised document (e.g. one review/tweet).
    size : int
        Dimensionality of the word vectors.
    model : mapping from word to 1-D vector of length *size*, optional.
        Lookup must raise KeyError for out-of-vocabulary words.  Defaults
        to the module-level ``imdb_w2v`` model for backward compatibility.

    Returns
    -------
    numpy.ndarray of shape (1, size)
        The mean of the found word vectors, or all zeros when no token
        of *text* is in the vocabulary.
    """
    if model is None:
        # Fall back to the script-level word2vec model (original behavior).
        model = imdb_w2v
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += model[word].reshape((1, size))
            count += 1.
        except KeyError:
            # Out-of-vocabulary word: skip it.
            continue
    # Guard against division by zero for documents with no known words.
    if count != 0:
        vec /= count
    return vec
from sklearn.preprocessing import scale

# Build a document vector for every training text by averaging its word
# vectors, then optionally standardize.
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
if scaling:
    train_vecs = scale(train_vecs)

# Build test document vectors.  The word2vec model is deliberately NOT
# retrained on the test texts -- that would leak test information.
# imdb_w2v.train(x_test)
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
if scaling:
    test_vecs = scale(test_vecs)

# Map features into [0, 1] (Multinomial Naive Bayes requires non-negative
# input).
# BUG FIX: fit the scaler on the training data only and apply the SAME
# transform to the test data.  The original called fit_transform on the
# test set, which both leaks the test distribution and puts train and test
# features on inconsistent scales.
min_max_scaler = MinMaxScaler()
train_vecs = min_max_scaler.fit_transform(train_vecs)
test_vecs = min_max_scaler.transform(test_vecs)
# Train a classifier (Multinomial Naive Bayes) on the averaged-word-vector
# features of the training set, then assess its predictions on the test set.
from classifiers import gNB, mNB
from analysis import analysis_result

predicted = mNB(train_vecs, y_train, test_vecs)
analysis_result(predicted, y_test)