# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Load the raw SMS spam data; the CSV is decoded as latin-1.
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
# Drop the three unnamed columns and name the two remaining ones.
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]

# Turn every message into a list of tokens with gensim's simple_preprocess.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Fix the shuffle seed so the train/test split is identical on every run.
# Without it, positional lookups such as X_test.iloc[0] further down point at
# a different message each time the notebook is re-executed.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SPLIT_SEED)

# Train word2vec on the training messages only: 100-dimensional vectors,
# a context window of 5 tokens, and min_count=2.
# NOTE(review): the learned vectors may still vary slightly between runs
# (multi-threaded training) - confirm if exact reproducibility is required.
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [2]:
# Show the row labels carried by the test split
X_test.index

Index([1218, 4584, 4127, 4442, 1226, 5327, 3494, 4573,  359, 3175,
       ...
       2443, 4766, 1978, 4616, 5208, 4780,  261,  700,  592, 5201],
      dtype='int64', length=1115)

In [3]:
# Preview the first few tokenized test messages
X_test.head()

1218                                                   [also, fine, when, will, you, complete, the, course]
4584    [have, secret, admirer, who, is, looking, make, contact, with, find, out, who, they, reveal, who...
4127                                                         [dont, thnk, its, wrong, calling, between, us]
4442    [you, know, my, old, dom, told, you, about, yesterday, his, name, is, roger, he, got, in, touch,...
1226    [reply, with, your, name, and, address, and, you, will, receive, by, post, weeks, completely, fr...
Name: text_clean, dtype: object

In [4]:
# First test message by position (iloc), not by index label
X_test.iloc[0]

['also', 'fine', 'when', 'will', 'you', 'complete', 'the', 'course']

### Prep Word Vectors

In [5]:
# Generate a list of words the word2vec model learned word vectors for.
# The list is kept under the name `low` and displayed in full below.
low = w2v_model.wv.index_to_key
low
# Alternative kept for reference: build the list from the key_to_index mapping
# vocab = list(w2v_model.wv.key_to_index)

['you',
 'to',
 'the',
 'and',
 'is',
 'in',
 'my',
 'me',
 'it',
 'your',
 'for',
 'of',
 'call',
 'that',
 'have',
 'on',
 'are',
 'now',
 'can',
 'not',
 'but',
 'so',
 'or',
 'do',
 'we',
 'be',
 'at',
 'get',
 'will',
 'with',
 'if',
 'ur',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'how',
 'what',
 'up',
 'free',
 'when',
 'go',
 'ok',
 'all',
 'from',
 'know',
 'out',
 'll',
 'day',
 'like',
 'am',
 'was',
 'good',
 'there',
 'time',
 'got',
 'then',
 'he',
 'come',
 'its',
 'love',
 'only',
 'want',
 'send',
 'as',
 'text',
 'going',
 'txt',
 'one',
 'need',
 'don',
 'by',
 'today',
 'about',
 'see',
 'lor',
 'she',
 'da',
 'sorry',
 'back',
 'still',
 'home',
 'stop',
 'dont',
 'reply',
 'our',
 'they',
 'mobile',
 'hi',
 'please',
 'later',
 'any',
 'new',
 'tell',
 'phone',
 'pls',
 'think',
 'did',
 'take',
 'ì_',
 'been',
 'her',
 'dear',
 'well',
 'who',
 're',
 'week',
 'here',
 'msg',
 'happy',
 'great',
 'has',
 'where',
 'some',
 'night',
 'hey',
 'hope',
 'wat',
 'oh',


In [6]:
# Build the sentence vector for the first test message by hand.
ls = X_test.iloc[0]
# key_to_index maps token -> position, so membership is a direct lookup
# instead of a scan through the index_to_key list.
vocab = w2v_model.wv.key_to_index
# One word vector per token the model knows; unknown tokens are skipped.
lc = [w2v_model.wv[token] for token in ls if token in vocab]
# print(lc)
print('===========================')
# Average the word vectors dimension by dimension. The result is stored as
# `sentence_vec` rather than `object` so the Python builtin is not shadowed.
sentence_vec = np.array(lc).mean(axis=0)
print(sentence_vec)

[-0.09667503  0.31851265  0.06067233 -0.01405787  0.12860624 -0.67887014
  0.39268255  0.8877613  -0.31734636 -0.28899232 -0.07003361 -0.51915455
  0.02936978  0.2707298   0.1754028  -0.21361202  0.1630767  -0.2217654
 -0.25751373 -0.8273423   0.20071279  0.1772053   0.24267769 -0.32504535
 -0.08812137 -0.02688666 -0.4351803  -0.27323347 -0.3045668   0.10056137
  0.42149305  0.13609025  0.10897364 -0.36719966 -0.11383591  0.56865525
  0.20375554 -0.28818244 -0.23281324 -0.62962437 -0.0203548  -0.43748775
 -0.13363731  0.02410258  0.29158992 -0.11175839 -0.28433287 -0.07824206
  0.22016308  0.24101199  0.23552562 -0.3671031   0.00246721 -0.14694142
 -0.21819404  0.13972288  0.30457795  0.01426777 -0.36747697  0.20166264
  0.2023842  -0.02401384  0.05048126 -0.14876209 -0.41212308  0.43802682
  0.08220975  0.4900029  -0.5962104   0.43330505 -0.31938675  0.23674545
  0.5109622  -0.0822748   0.49977335  0.20767625 -0.10077619 -0.04662798
 -0.43667758  0.16026913 -0.15810634  0.09835847 -0.

In [7]:
# Row labels of the test split again (same call as earlier in the notebook)
X_test.index

Index([1218, 4584, 4127, 4442, 1226, 5327, 3494, 4573,  359, 3175,
       ...
       2443, 4766, 1978, 4616, 5208, 4780,  261,  700,  592, 5201],
      dtype='int64', length=1115)


In this test we have a text in which not a single word also occurs in any other text. This makes the logic a bit more complex.

In [8]:
# Tokens of the test message at position 219
X_test.iloc[219]

['ll', 'work', 'something', 'out']

In [9]:
# Check word by word whether any of these tokens is in the model vocabulary.
# Writing `[...] in low` would ask whether the list object itself is an element
# of `low`; since `low` holds strings, that is always False whatever the words are.
any(word in low for word in ['ultimately', 'tor', 'motive', 'tui', 'achieve', 'korli'])

False

The original code does not work because it would create an np.array in which each row has a different shape. Therefore we split this into 2 steps.
- First, we create a list with the vectors, which have different lengths.
- Second, we compute an average value over the allowed words for each of the 100 dimensions of the document.
- One special remark: as there are some texts where no word occurs in any other document, there are empty vectors in the list. For those we need to create an np.zeros(100) array and add it instead.

In [10]:
# Build one fixed-length vector per test message by averaging the vectors of
# the tokens the model has learned.
vocab = w2v_model.wv.key_to_index        # token -> position mapping; direct lookup instead of scanning index_to_key
vector_dim = w2v_model.wv.vector_size    # dimensionality of the trained vectors (100 for this model)

w2v_vect = []
for tokens in X_test:
    # Keep only tokens with a learned vector; the rest are skipped.
    known_vectors = [w2v_model.wv[token] for token in tokens if token in vocab]
    if known_vectors:
        # Average across words, leaving one value per dimension.
        w2v_vect.append(np.array(known_vectors).mean(axis=0))
    else:
        # No token of this message is in the vocabulary: fall back to a zero
        # vector so every message still gets an entry of the same length.
        w2v_vect.append(np.zeros(vector_dim))

w2v_vect

[array([-0.09667503,  0.31851265,  0.06067233, -0.01405787,  0.12860624,
        -0.67887014,  0.39268255,  0.8877613 , -0.31734636, -0.28899232,
        -0.07003361, -0.51915455,  0.02936978,  0.2707298 ,  0.1754028 ,
        -0.21361202,  0.1630767 , -0.2217654 , -0.25751373, -0.8273423 ,
         0.20071279,  0.1772053 ,  0.24267769, -0.32504535, -0.08812137,
        -0.02688666, -0.4351803 , -0.27323347, -0.3045668 ,  0.10056137,
         0.42149305,  0.13609025,  0.10897364, -0.36719966, -0.11383591,
         0.56865525,  0.20375554, -0.28818244, -0.23281324, -0.62962437,
        -0.0203548 , -0.43748775, -0.13363731,  0.02410258,  0.29158992,
        -0.11175839, -0.28433287, -0.07824206,  0.22016308,  0.24101199,
         0.23552562, -0.3671031 ,  0.00246721, -0.14694142, -0.21819404,
         0.13972288,  0.30457795,  0.01426777, -0.36747697,  0.20166264,
         0.2023842 , -0.02401384,  0.05048126, -0.14876209, -0.41212308,
         0.43802682,  0.08220975,  0.4900029 , -0.5

In [11]:
def _mean_word_vector(tokens, wv):
    """Average the word vectors of the tokens that `wv` knows.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one message.
    wv : gensim KeyedVectors
        Trained word vectors (``w2v_model.wv``).

    Returns
    -------
    numpy.ndarray
        One vector of length ``wv.vector_size``: the per-dimension mean of the
        known tokens' vectors, or all zeros when no token is known.
    """
    # Build the list of known word vectors once (the one-liner built it twice).
    known_vectors = [wv[token] for token in tokens if token in wv.key_to_index]
    if not known_vectors:
        return np.zeros(wv.vector_size)
    return np.array(known_vectors).mean(axis=0)


# Compact version of the explicit loop: one averaged vector per test message.
w2v_vect_22 = [_mean_word_vector(ls, w2v_model.wv) for ls in X_test]

In [12]:
# Confirm the explicit loop and the one-line version produced the same vectors
np.array_equal(w2v_vect, w2v_vect_22)

True

In [13]:
# Take the mean along axis 0 of every entry in w2v_vect (np.zeros(100) for an
# empty entry).
# NOTE: each entry of w2v_vect is already a single averaged vector for one
# message, so this second mean reduces that vector to one number instead of
# producing a sentence vector.
w2v_vect_avg = [
    entry.mean(axis=0) if len(entry) != 0 else np.zeros(100)
    for entry in w2v_vect
]

As there was some confusion in my head about the data structure, here are some explanations.

In [14]:
import numpy as np  # numpy is already imported in the first cell; this repeat is harmless

# Compare the per-message vectors with the result of the second averaging pass
np.array_equal(w2v_vect, w2v_vect_avg)



False

In [15]:
# Stack the per-message vectors to see the overall (messages, dimensions) shape
np.array(w2v_vect).shape

(1115, 100)

In [17]:
# First entry produced by the second averaging pass
w2v_vect_avg[0]

0.01201683

In [18]:
# First per-message vector, for comparison with w2v_vect_avg[0]
w2v_vect[0]

array([-0.09667503,  0.31851265,  0.06067233, -0.01405787,  0.12860624,
       -0.67887014,  0.39268255,  0.8877613 , -0.31734636, -0.28899232,
       -0.07003361, -0.51915455,  0.02936978,  0.2707298 ,  0.1754028 ,
       -0.21361202,  0.1630767 , -0.2217654 , -0.25751373, -0.8273423 ,
        0.20071279,  0.1772053 ,  0.24267769, -0.32504535, -0.08812137,
       -0.02688666, -0.4351803 , -0.27323347, -0.3045668 ,  0.10056137,
        0.42149305,  0.13609025,  0.10897364, -0.36719966, -0.11383591,
        0.56865525,  0.20375554, -0.28818244, -0.23281324, -0.62962437,
       -0.0203548 , -0.43748775, -0.13363731,  0.02410258,  0.29158992,
       -0.11175839, -0.28433287, -0.07824206,  0.22016308,  0.24101199,
        0.23552562, -0.3671031 ,  0.00246721, -0.14694142, -0.21819404,
        0.13972288,  0.30457795,  0.01426777, -0.36747697,  0.20166264,
        0.2023842 , -0.02401384,  0.05048126, -0.14876209, -0.41212308,
        0.43802682,  0.08220975,  0.4900029 , -0.5962104 ,  0.43

In [19]:
# Print, for every test message, its token count next to the length of its vector
for pos in range(len(w2v_vect)):
    print(len(X_test.iloc[pos]), len(w2v_vect[pos]))

8 100
23 100
7 100
30 100
24 100
4 100
22 100
4 100
16 100
10 100
11 100
7 100
7 100
22 100
9 100
27 100
23 100
5 100
8 100
47 100
7 100
3 100
16 100
5 100
15 100
9 100
10 100
17 100
5 100
8 100
23 100
8 100
2 100
31 100
7 100
13 100
21 100
6 100
28 100
10 100
2 100
23 100
16 100
17 100
0 100
5 100
21 100
10 100
18 100
23 100
11 100
23 100
22 100
26 100
15 100
22 100
5 100
9 100
8 100
4 100
7 100
21 100
7 100
8 100
6 100
4 100
8 100
26 100
9 100
7 100
20 100
36 100
4 100
18 100
12 100
3 100
3 100
19 100
11 100
8 100
7 100
4 100
5 100
18 100
8 100
7 100
30 100
8 100
20 100
30 100
12 100
9 100
6 100
9 100
16 100
24 100
15 100
21 100
15 100
11 100
17 100
8 100
6 100
7 100
9 100
25 100
20 100
10 100
9 100
25 100
11 100
27 100
4 100
7 100
4 100
6 100
29 100
7 100
5 100
31 100
11 100
22 100
10 100
9 100
27 100
19 100
14 100
0 100
3 100
18 100
16 100
23 100
22 100
10 100
7 100
9 100
24 100
4 100
25 100
6 100
14 100
5 100
22 100
20 100
8 100
3 100
18 100
20 100
26 100
6 100
6 100
6 100
11 100


In [20]:
# Number of tokens in the test message at position 2
len(X_test.iloc[2])

7