# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [2]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]


messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

### Prep Word Vectors

In [16]:
# Generate a list of words the word2vec model learned word vectors for
w2v_model.wv.index_to_key

['to',
 'you',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'for',
 'your',
 'of',
 'call',
 'that',
 'have',
 'on',
 'are',
 'so',
 'now',
 'can',
 'but',
 'not',
 'or',
 'we',
 'do',
 'be',
 'at',
 'will',
 'if',
 'ur',
 'no',
 'with',
 'get',
 'just',
 'gt',
 'this',
 'lt',
 'how',
 'go',
 'ok',
 'up',
 'what',
 'from',
 'when',
 'all',
 'out',
 'll',
 'then',
 'know',
 'free',
 'like',
 'good',
 'am',
 'he',
 'day',
 'there',
 'come',
 'was',
 'got',
 'its',
 'time',
 'only',
 'love',
 'send',
 'want',
 'text',
 'one',
 'txt',
 'as',
 'she',
 'going',
 'today',
 'don',
 'by',
 'home',
 'need',
 'see',
 'lor',
 'still',
 'sorry',
 'stop',
 'our',
 'back',
 'about',
 'reply',
 'take',
 'they',
 'think',
 'dont',
 'tell',
 'later',
 'week',
 'mobile',
 'new',
 'pls',
 'hi',
 'her',
 'da',
 'any',
 'please',
 'some',
 'did',
 'here',
 'oh',
 'phone',
 'ì_',
 'been',
 're',
 'well',
 'wat',
 'night',
 'an',
 'much',
 'where',
 'has',
 'who',
 'great',
 'dear',
 'hope',
 'hey',
 '

The following code does not work because it would create an np.array where each line has a different shape. Therefore we split this into 2 steps. 
- first we create a list with the vectors with differents lengths 
- second we create an average documents and add it. 
- one special remark: as there are some texts where no word is occuring in any other document, there are empty vectors in the list. For those we need to create a np.zeros(100) array and add it

In [17]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
# w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key])
#                      for ls in X_test])

In [4]:
w2v_vec_list = []
for ls in X_test:
    for i in ls:
        w2v_vect = [np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key])]
        w2v_vec_list.append(w2v_vect)

In [5]:
w2v_vec_list

[[array([[-0.29390493,  0.37649992,  0.13684018, ..., -0.38990715,
           0.3019739 , -0.06233209],
         [-0.20522441,  0.26842877,  0.10511894, ..., -0.28463173,
           0.21061383, -0.04791604],
         [-0.29536226,  0.39648786,  0.14376968, ..., -0.41111028,
           0.29606214, -0.07591326],
         ...,
         [-0.02947943,  0.05798964,  0.02124812, ..., -0.05407246,
           0.03764087, -0.01459569],
         [-0.35203823,  0.46037737,  0.17624779, ..., -0.474561  ,
           0.35060397, -0.07470272],
         [-0.13741486,  0.17509903,  0.07577408, ..., -0.196213  ,
           0.14113252, -0.0349516 ]], dtype=float32)],
 [array([[-0.29390493,  0.37649992,  0.13684018, ..., -0.38990715,
           0.3019739 , -0.06233209],
         [-0.20522441,  0.26842877,  0.10511894, ..., -0.28463173,
           0.21061383, -0.04791604],
         [-0.29536226,  0.39648786,  0.14376968, ..., -0.41111028,
           0.29606214, -0.07591326],
         ...,
         [-0.02947

In [20]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(w2v_vec_list):
    print(len(X_test.iloc[i]), len(v))

20 16725


In [8]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
w2v_vect_avg = []

for vect in w2v_vec_list:
    if len(vect)!=0:
        w2v_vect_avg.append(np.array(vect).mean(axis=0))
    else:
        w2v_vect_avg.append(np.zeros(100))

In [10]:
w2v_vect_avg[0]

array([[-0.29390493,  0.37649992,  0.13684018, ..., -0.38990715,
         0.3019739 , -0.06233209],
       [-0.20522441,  0.26842877,  0.10511894, ..., -0.28463173,
         0.21061383, -0.04791604],
       [-0.29536226,  0.39648786,  0.14376968, ..., -0.41111028,
         0.29606214, -0.07591326],
       ...,
       [-0.02947943,  0.05798964,  0.02124812, ..., -0.05407246,
         0.03764087, -0.01459569],
       [-0.35203823,  0.46037737,  0.17624779, ..., -0.474561  ,
         0.35060397, -0.07470272],
       [-0.13741486,  0.17509903,  0.07577408, ..., -0.196213  ,
         0.14113252, -0.0349516 ]], dtype=float32)

In [18]:
w2v_vect_avg[20].shape

(15, 100)

In [13]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(w2v_vect_avg):
    print(len(X_test.iloc[i]), len(v))

15 14
18 14
17 14
9 14
19 14
11 14
4 14
1 14
12 14
12 14
19 14
17 14
16 14
9 14
10 14
7 15
8 15
5 15
11 15
9 15
27 15
14 15
10 15
15 15
6 15
29 15
19 15
4 15
1 15
8 15
24 15
8 15
3 15
10 16
7 16
6 16
18 16
22 16
25 16
24 16
17 16
6 16
11 16
12 16
19 16
9 16
14 16
13 16
7 16
16 16
19 9
22 9
6 9
17 9
12 9
7 9
19 9
28 9
22 9
5 17
9 17
2 17
14 17
16 17
34 17
28 17
23 17
28 17
5 17
42 17
17 17
60 17
8 17
15 17
7 17
3 17
24 17
31 17
27 10
15 10
7 10
5 10
5 10
22 10
10 10
26 10
19 10
9 10
11 10
12 4
3 4
8 4
21 4
8 0
25 12
11 12
5 12
6 12
13 12
11 12
17 12
5 12
3 12
32 12
8 12
5 12
12 11
5 11
21 11
11 11
6 11
29 11
29 11
4 11
23 11
30 11
5 11
20 11
27 14
4 14
25 14
27 14
5 14
23 14
25 14
8 14
13 14
26 14
5 14
27 14
24 14
5 14
19 14
20 14
23 14
8 14
5 14
4 14
9 14
1 14
21 14
7 14
11 14
2 14
36 14
5 14
9 14
21 14
6 14
4 14
11 14
6 14
15 14
29 14
14 12
50 12
6 12
14 12
15 12
7 12
9 12
22 12
7 12
3 12
8 12
22 12
27 12
12 12
17 12
5 12
17 8
10 8
17 8
7 8
7 8
18 8
5 8
19 8
14 8
7 9
22 9
4 9
15 9
5 9

IndexError: single positional indexer is out-of-bounds