# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [3]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Let pandas display up to 100 characters per column so messages stay readable
pd.set_option('display.max_colwidth', 100)

# Load the raw dataset, discard the three unnamed columns and give the two
# remaining columns descriptive names
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ["label", "text"]

# Turn every message into a list of tokens with gensim's simple_preprocess
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages as the test set
X_train, X_test, y_train, y_test = train_test_split(
    messages['text_clean'],
    messages['label'],
    test_size=0.2,
)

# Train word2vec on the training messages only; words seen fewer than
# `min_count` times get no vector
w2v_model = gensim.models.Word2Vec(X_train, window=5, min_count=2)

### Prep Word Vectors

In [7]:
# For every tokenized test message, collect the word vectors of the tokens the
# word2vec model knows. Tokens outside the learned vocabulary are skipped, so a
# message can end up with fewer vectors than tokens (or with none at all).
# Membership is tested against a set built once: `index_to_key` is a list, so
# `word in index_to_key` would rescan the whole vocabulary for every token.
known_words = set(w2v_model.wv.index_to_key)
w2v_vect = np.array(
    [np.array([w2v_model.wv[word] for word in tokens if word in known_words], dtype=object)
     for tokens in X_test],
    dtype=object,
)

In [8]:
# Display the list of words the word2vec model learned word vectors for.
# With min_count=2 these are the words that appeared in the training data
# at LEAST twice.
w2v_model.wv.index_to_key

['to',
 'you',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'your',
 'for',
 'of',
 'call',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'or',
 'not',
 'but',
 'do',
 'get',
 'at',
 'we',
 'if',
 'ur',
 'with',
 'be',
 'will',
 'just',
 'no',
 'this',
 'what',
 'when',
 'ok',
 'up',
 'how',
 'free',
 'gt',
 'go',
 'lt',
 'all',
 'from',
 'out',
 'know',
 'll',
 'like',
 'got',
 'am',
 'good',
 'then',
 'its',
 'he',
 'day',
 'only',
 'was',
 'there',
 'come',
 'time',
 'love',
 'want',
 'text',
 'send',
 'txt',
 'need',
 'as',
 'see',
 'home',
 'one',
 'going',
 'by',
 'sorry',
 'today',
 'stop',
 'about',
 'don',
 'lor',
 'still',
 'she',
 'our',
 'mobile',
 'back',
 'da',
 'tell',
 'dont',
 'reply',
 'take',
 'they',
 'pls',
 'later',
 'did',
 'think',
 'new',
 'here',
 'phone',
 'hi',
 'well',
 'who',
 'some',
 'been',
 'please',
 'week',
 'any',
 're',
 'has',
 'ì_',
 'hope',
 'claim',
 'great',
 'dear',
 'hey',
 'oh',
 'night',
 'more',
 'msg',
 'too',
 'an',
 '

In [9]:
# Print, for each test message, its number of tokens next to the number of word
# vectors collected for it. The two differ whenever a token has no learned
# vector, because such tokens were skipped when building w2v_vect. Each word
# vector acts as a feature, so messages with differing counts do not yet give
# the consistent number of features a model needs.
for tokens, word_vectors in zip(X_test, w2v_vect):
    print(len(tokens), len(word_vectors))

5 5
24 22
4 4
16 11
13 10
4 4
15 9
16 14
7 5
8 6
5 5
25 24
7 7
38 24
6 6
8 6
5 5
23 23
5 4
23 22
15 14
14 12
5 5
26 26
14 14
6 3
11 11
4 4
9 9
17 11
5 5
8 8
2 2
8 7
18 17
24 21
14 14
1 1
14 5
6 6
8 8
16 14
17 16
6 6
6 6
20 20
16 16
5 5
6 6
12 12
16 15
8 7
23 22
37 31
10 10
10 10
30 26
12 12
9 8
16 14
4 3
3 2
7 5
22 21
18 17
25 25
23 20
11 7
22 22
14 10
3 3
4 4
176 163
5 5
16 14
19 15
9 9
33 28
4 2
3 3
6 6
6 5
6 6
12 11
4 4
5 3
8 7
11 11
9 9
10 10
17 13
8 7
19 19
11 11
12 7
13 13
25 22
6 6
22 21
7 5
28 27
4 4
10 9
13 10
5 5
15 12
17 13
8 8
14 14
26 25
14 12
24 20
20 18
25 24
5 4
8 8
5 5
20 18
14 14
27 24
19 16
21 20
9 9
1 1
19 17
19 15
14 13
7 7
6 5
28 28
18 16
16 16
60 54
8 3
16 16
14 11
13 13
5 5
3 3
6 5
13 13
17 11
9 9
22 19
9 9
6 6
6 6
10 8
10 7
3 3
7 6
6 5
28 25
4 4
26 26
6 5
7 7
18 17
8 8
8 3
15 15
26 19
12 12
7 6
49 41
10 9
6 6
8 7
13 11
26 23
6 6
19 12
5 5
15 13
7 7
14 14
28 26
8 7
5 4
5 5
21 19
22 20
15 13
4 4
25 24
11 7
6 6
11 9
7 6
26 25
18 17
27 26
58 53
25 24
17 7
7 6
17 16

In [13]:
# Compute one fixed-length sentence vector per message by averaging the word
# vectors of the words contained in the message.
w2v_vect_avg = []

for vect in w2v_vect:
    if len(vect) != 0:   # the model learned a vector for at least one word
        w2v_vect_avg.append(vect.mean(axis=0))  # element-wise average over the words
    else:
        # No known word in this message: fall back to an all-zero vector. Its
        # length is read from the model rather than hard-coded, so it still
        # matches the averaged vectors if the model's vector_size is changed.
        w2v_vect_avg.append(np.zeros(w2v_model.wv.vector_size))

In [14]:
# Sanity check: print each test message's token count next to the length of its
# averaged sentence vector, to see whether the vector lengths are consistent
for tokens, sentence_vector in zip(X_test, w2v_vect_avg):
    print(len(tokens), len(sentence_vector))

5 100
24 100
4 100
16 100
13 100
4 100
15 100
16 100
7 100
8 100
5 100
25 100
7 100
38 100
6 100
8 100
5 100
23 100
5 100
23 100
15 100
14 100
5 100
26 100
14 100
6 100
11 100
4 100
9 100
17 100
5 100
8 100
2 100
8 100
18 100
24 100
14 100
1 100
14 100
6 100
8 100
16 100
17 100
6 100
6 100
20 100
16 100
5 100
6 100
12 100
16 100
8 100
23 100
37 100
10 100
10 100
30 100
12 100
9 100
16 100
4 100
3 100
7 100
22 100
18 100
25 100
23 100
11 100
22 100
14 100
3 100
4 100
176 100
5 100
16 100
19 100
9 100
33 100
4 100
3 100
6 100
6 100
6 100
12 100
4 100
5 100
8 100
11 100
9 100
10 100
17 100
8 100
19 100
11 100
12 100
13 100
25 100
6 100
22 100
7 100
28 100
4 100
10 100
13 100
5 100
15 100
17 100
8 100
14 100
26 100
14 100
24 100
20 100
25 100
5 100
8 100
5 100
20 100
14 100
27 100
19 100
21 100
9 100
1 100
19 100
19 100
14 100
7 100
6 100
28 100
18 100
16 100
60 100
8 100
16 100
14 100
13 100
5 100
3 100
6 100
13 100
17 100
9 100
22 100
9 100
6 100
6 100
10 100
10 100
3 100
7 100
6 100
28 