# doc2vec: How To Prep Document Vectors For Modeling

### Train Our Own Model

In [1]:
# Read in data, clean it, split it into train/test, and then train a doc2vec model
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Single seed shared by the train/test split and the doc2vec model so that a
# fresh "Restart & Run All" does not silently produce a different split/model.
SEED = 42

# Load the raw spam CSV, drop the three unnamed columns, and give the two
# remaining columns descriptive names.
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
messages.columns = ["label", "text"]

# Tokenize every message; the function is passed directly instead of being
# wrapped in a redundant lambda.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages; random_state pins which rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

# doc2vec needs each training document tagged; the tag used here is the
# document's position within X_train.
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(words, [idx])
                  for idx, words in enumerate(X_train)]

# NOTE(review): `seed` fixes the model's random initialisation; per the gensim
# docs, fully deterministic training additionally requires workers=1 and a
# fixed PYTHONHASHSEED -- confirm whether that level of reproducibility is needed.
d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=50,
                                  window=2,
                                  min_count=2,
                                  seed=SEED)

In [2]:
# Reminder of what a document vector looks like: infer one for a short,
# hand-written token list and display it as the cell output.
sample_tokens = ['convert', 'words', 'to', 'vectors']
d2v_model.infer_vector(sample_tokens)

array([-0.00588657, -0.01345497, -0.0076227 ,  0.00787828, -0.00113796,
        0.01318541, -0.00376362,  0.00806833,  0.00462006, -0.01070249,
        0.00335836,  0.00533027, -0.00154249, -0.00931606, -0.01430388,
       -0.00493495,  0.00975425, -0.00676674, -0.0057654 , -0.01142108,
        0.00028353, -0.02805323, -0.00432609, -0.00346047,  0.00236725,
        0.01428846,  0.0292182 ,  0.02183636, -0.00208064, -0.00651421,
        0.02450396,  0.00241505, -0.00539691,  0.01151233, -0.01798362,
       -0.01583771, -0.01738526, -0.01745554, -0.02965207, -0.01739364,
        0.00988218, -0.00882445, -0.0020293 , -0.00030057, -0.00246003,
        0.00488268,  0.00261389, -0.01081261, -0.01558405,  0.01303649],
      dtype=float32)

In [3]:
# How do we prepare these vectors to be used in a machine learning model?
# Infer one vector per tokenized test message, in X_test order. Each entry is
# stored as a single-element list wrapping the inferred array.
# NOTE(review): most estimators expect one flat row per document -- confirm
# the extra list level is intended before feeding `vectors` to a model.
vectors = []
for tokens in X_test:
    vectors.append([d2v_model.infer_vector(tokens)])

In [5]:
# Inspect the first prepared entry: a one-element list holding the inferred
# vector for the first test message.
vectors[0]

[array([-0.00994322, -0.02535141, -0.0275706 ,  0.0372186 , -0.03277962,
         0.01409951, -0.03063007,  0.02595884,  0.01317524, -0.02486182,
        -0.00188695,  0.03218062, -0.00251159, -0.03683198, -0.02507444,
         0.00670246,  0.01665619, -0.00267003, -0.0194735 , -0.02742641,
        -0.01149104, -0.06658466,  0.00973215, -0.00953486, -0.00759479,
         0.05388751,  0.07324857,  0.06694961, -0.02278406, -0.02971641,
         0.0495635 , -0.01989276,  0.01462067,  0.04861971, -0.0159816 ,
        -0.05355471, -0.03093127, -0.05416853, -0.08206316, -0.04864466,
         0.02117483, -0.04071914, -0.00108404,  0.01389418, -0.02505872,
        -0.00552049,  0.00310575, -0.04724972, -0.03109674,  0.0138714 ],
       dtype=float32)]