# doc2vec: How To Prep Document Vectors For Modeling

### Train Our Own Model

In [3]:
# Read in the spam data, tokenize each message, split it into train/test,
# and then train a doc2vec model on the training messages only.
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# Fixed seed so the train/test split (and the model's RNG initialisation)
# is the same on every Restart & Run All.
SEED = 42

messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
# Drop the three "Unnamed" columns; the two remaining columns are renamed below.
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
# Tokenize every message; the function is passed directly, no lambda wrapper needed.
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

# Tag each training document with its position in X_train.
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]

# NOTE: per the gensim docs, a fully deterministic run additionally requires
# workers=1 and a fixed PYTHONHASHSEED; `seed` alone only pins the RNG state.
d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=100,
                                  window=2,
                                  min_count=2,
                                  seed=SEED)

In [4]:
# What does a document vector look like again?
# Infer a vector for a small, already-tokenized example document.
sample_tokens = ['convert', 'words', 'to', 'vectors']
d2v_model.infer_vector(sample_tokens)

array([-0.01066896,  0.01566895,  0.00142194, -0.00167711,  0.00701121,
       -0.0330531 ,  0.00835054,  0.04394748, -0.02874464, -0.0181323 ,
       -0.01095938, -0.02051019, -0.00598317,  0.00456753,  0.01031762,
       -0.02106267,  0.01338805, -0.01392957,  0.00055323, -0.03240522,
        0.01064136,  0.00143415,  0.02011607, -0.01608839, -0.00990354,
        0.00183104, -0.01621856, -0.00504343, -0.00943476, -0.00790128,
        0.02591329,  0.00227647,  0.0058363 , -0.00398631,  0.00540003,
        0.01886241,  0.0013208 , -0.01003852, -0.00100697, -0.02884805,
        0.00745457, -0.0171964 , -0.00014773, -0.0036658 ,  0.00315243,
       -0.00200356, -0.00989692, -0.00586368,  0.00551255,  0.0193392 ,
        0.00906557, -0.00590746,  0.00796474, -0.00130487, -0.01796295,
        0.00845748,  0.00245883,  0.00422488, -0.01658403,  0.00166272,
        0.00447718, -0.00296431, -0.00931085, -0.01106139, -0.02289169,
        0.01918342,  0.01144955,  0.01200264, -0.02178341,  0.01

In [5]:
# How do we prepare these vectors to be used in a machine learning model?
# Infer one vector per test document; each vector is stored inside its own
# single-element list, exactly as before.
# NOTE(review): that extra list level adds a nesting dimension if `vectors` is
# later converted to an array -- confirm downstream code expects it.
vectors = []
for tokens in X_test:
    doc_vector = d2v_model.infer_vector(tokens)
    vectors.append([doc_vector])

In [6]:
# Display the entry prepared for the first test document.
first_entry = vectors[0]
first_entry

[array([-0.0423329 ,  0.04753733,  0.01907691, -0.0243029 ,  0.03964424,
        -0.12553057,  0.01895039,  0.19128963, -0.12323486, -0.02571414,
        -0.02595935, -0.07850483, -0.02085474,  0.03134756,  0.04028377,
        -0.09835619,  0.03939414, -0.07696181,  0.00367253, -0.13758098,
         0.03665064,  0.00474226,  0.06262276, -0.09305781, -0.03892053,
        -0.00239084, -0.06222324, -0.02271824, -0.05013494, -0.02742428,
         0.10650801, -0.00987155,  0.01701421,  0.01003167,  0.01567598,
         0.0942995 , -0.01802528, -0.05706133, -0.02530632, -0.12638801,
         0.02343461, -0.05536461,  0.00410083,  0.00460904,  0.02120467,
        -0.02435267, -0.04407908, -0.03554843,  0.00759762,  0.06068951,
         0.04500799, -0.03221933,  0.04974123,  0.01357936, -0.09164128,
         0.06035046,  0.02462837,  0.02337646, -0.05429082,  0.02498913,
         0.02490889, -0.01165802, -0.03756445, -0.01483689, -0.1103493 ,
         0.0632799 ,  0.08386902,  0.0168625 , -0.0