# doc2vec: How To Implement doc2vec

### Train Our Own Model

In [1]:
# Read in data, clean it, and then split into train and test sets
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
# Show up to 100 characters per column so message text is readable when displayed
pd.set_option('display.max_colwidth', 100)

# Load the spam CSV and discard the three "Unnamed" columns, leaving exactly two
# columns that are then renamed to label/text
messages = (
    pd.read_csv('../../../data/spam.csv', encoding='latin-1')
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
)
messages.columns = ["label", "text"]

# Tokenize every message with gensim's simple_preprocess (one token list per row)
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the rows for testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    messages['text_clean'],
    messages['label'],
    test_size=0.2,
    random_state=42,
)

In [4]:
# Wrap each training document in a TaggedDocument so Doc2Vec can train on it.
# The tag is the document's position in X_train (from enumerate), not its DataFrame index.
tagged_docs = [
    gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(X_train)
]

In [5]:
# Inspect the first tagged document: a TaggedDocument holding the token list (`words`)
# and the list of tags assigned to it (`tags`)
tagged_docs[0]

TaggedDocument(words=['no', 'in', 'the', 'same', 'boat', 'still', 'here', 'at', 'my', 'moms', 'check', 'me', 'out', 'on', 'yo', 'half', 'naked'], tags=[0])

In [7]:
# Train a basic doc2vec model on the tagged training documents.
#   vector_size: dimensionality of each learned document vector
#   window:      maximum distance between the current and predicted word within a document
#   min_count:   ignore words that appear fewer than 2 times in the corpus
#   seed:        seeds gensim's random number generator, mirroring the random_state=42
#                used for the train/test split
# NOTE: gensim documents that a fully deterministic run additionally requires workers=1
# and a fixed PYTHONHASHSEED; the seed alone only pins the random initialization.
d2v_model = gensim.models.Doc2Vec(tagged_docs,
                                  vector_size=100,
                                  window=5,
                                  min_count=2,
                                  seed=42)

In [9]:
# Infer a document vector for a multi-word input. The document is passed as a list of
# tokens, not as a single word the way a word2vec lookup would take it.
d2v_model.infer_vector(['this', 'is', 'a', 'huge', 'list', 'of', 'text'])

array([-1.07916268e-02, -2.55744322e-03,  4.33045160e-03,  7.02752452e-03,
        1.32111087e-03, -1.99317746e-02,  3.10294633e-03,  2.55917963e-02,
       -9.92034562e-03, -2.41003837e-03, -1.61863994e-02, -3.14948037e-02,
       -3.51460208e-03,  6.84878463e-03,  3.08868848e-03, -8.30354076e-03,
       -1.23882620e-03, -2.07861215e-02,  7.59332674e-03, -2.31476687e-02,
        2.80159991e-03,  2.95096938e-03,  1.26693761e-02,  6.11723261e-03,
       -1.32910197e-03, -5.69444243e-03, -1.68613400e-02, -2.51554069e-03,
       -7.66403507e-03, -3.06299562e-03,  6.43154560e-03,  1.31174427e-04,
        8.74533318e-03, -8.14582279e-04, -1.78447030e-02,  7.03175971e-03,
        7.22076977e-03, -2.10495200e-02, -1.85112078e-02, -1.95066109e-02,
        1.45747563e-05, -1.76859144e-02, -8.77946895e-03, -4.22434974e-03,
        7.11523322e-03, -7.08890427e-03, -6.31457800e-03,  2.39501009e-03,
        4.44851164e-03,  1.23836370e-02,  6.93360576e-03, -1.17320977e-02,
        9.63179627e-04, -

In [10]:
# Infer a vector for a one-word document; the single token is still wrapped in a list
d2v_model.infer_vector(['text'])

array([-0.01700153,  0.00577108,  0.0101187 ,  0.00490633, -0.00031176,
       -0.02831088,  0.00761476,  0.04902763, -0.02448422, -0.01226932,
       -0.02015222, -0.04734956, -0.00930245,  0.01006988,  0.00294703,
       -0.01775146,  0.01644256, -0.022983  ,  0.00643468, -0.04019548,
        0.00051399,  0.00992351,  0.01574207, -0.00346417,  0.00401343,
        0.00473351, -0.02822516, -0.00351303, -0.02285006, -0.00376918,
        0.02334438,  0.00736679,  0.01160903, -0.00379197, -0.02693884,
        0.02561853,  0.0024503 , -0.02500274, -0.01698458, -0.03593631,
       -0.00236918, -0.02297402, -0.01352627, -0.00394192,  0.00231925,
       -0.01610782, -0.00692643, -0.01140777,  0.00688177,  0.0196486 ,
        0.00312065, -0.01585424, -0.00064299, -0.00290738, -0.01947271,
        0.00785106,  0.01799404, -0.00966279, -0.02353499,  0.01477516,
       -0.00295802, -0.00786071,  0.00681761, -0.00878007, -0.02471401,
        0.02451266,  0.00360113,  0.02209847, -0.02125034,  0.03

In [6]:
# Infer a vector for another short document, again passed as a list of word tokens.
# NOTE(review): this cell's execution count (In [6]) is lower than the training cell's
# (In [7]), so the output shown was presumably produced out of order — restart the
# kernel and run all cells to refresh it.
d2v_model.infer_vector(['i', 'am', 'learning', 'nlp'])

array([ 0.00079993, -0.00241487, -0.003925  , -0.00077622,  0.00487272,
       -0.00461449, -0.00794001,  0.00196816,  0.00256167, -0.00207712,
        0.00389052, -0.0015462 ,  0.00244023,  0.00461275, -0.00154543,
        0.00264698,  0.00525391, -0.0001179 , -0.00114382, -0.00311834,
        0.00311693, -0.00325406, -0.00234914, -0.00691736, -0.00262643,
        0.0018867 ,  0.00836094,  0.00718936,  0.00242413,  0.00249124,
       -0.00931214,  0.00028316, -0.00117485, -0.0015381 , -0.00051082,
       -0.0057246 , -0.00228752,  0.00489969,  0.0020843 , -0.00081044,
        0.0034014 , -0.00524214,  0.00602016,  0.00141408, -0.00834089,
       -0.00269645,  0.00258314,  0.00466208, -0.00043998,  0.00442808,
        0.00301475, -0.00173231, -0.0065721 , -0.00124466,  0.00094067,
        0.00072978,  0.00052168,  0.00407398, -0.0114823 ,  0.00668782,
        0.00651658,  0.00268903,  0.00242151,  0.00690527, -0.00234641,
        0.00398679,  0.00798854, -0.00370699, -0.00117224,  0.00

### What About Pre-trained Document Vectors?

There are fewer pre-trained options for document vectors than there are for word vectors. There is also no simple API for loading them, as there is for `word2vec`, so using them takes more work.

Pre-trained vectors from training on Wikipedia and Associated Press News can be found [here](https://github.com/jhlau/doc2vec). Feel free to explore on your own!