# Doc2vec

In [1]:
# court judgement sentences.
corpus = [" and the rest of the estate then in dispute was given to B. The operative part of the award stated inter alia that B, first party, and M, the second party, were held entitled to speci- fied shares in the properties in dispute and each had become permanent owner (Malik Mustaqil) of his or her share", 
        "A division was effected and ever sinThe dispute was referred to arbitration and an award was delivered. Under it the suit properties were given to Mce the date of the award in 1884 each branch continued in possession of the proper- ties allotted to it and each had been dealing with them as absolute owner. The defendants claimed that the plaintiffs were bound by the award and were in any event estopped from challenging it",
        "Held, that the award gave an absolute estate to M as the words 'Malik Mustaqil' were strong. clear and unambiguous and were not qualified by. other words and circumstances appearing in the same document in the present case."]


In [2]:
# import the library
import gensim

# read the corpus and convert it to tagged document.
def read_corpus():
    i = 0
    for doc_fname in corpus:
        tokens = doc_fname.split()
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
        i = i + 1

In [3]:
# get the tagged corpus
train_corpus = list(read_corpus())

In [4]:
# first tagged document
train_corpus[0]

TaggedDocument(words=['The', 'dispute', 'was', 'referred', 'to', 'arbitration', 'and', 'an', 'award', 'was', 'delivered.', 'Under', 'it', 'the', 'suit', 'properties', 'were', 'given', 'to', 'M', 'and', 'the', 'rest', 'of', 'the', 'estate', 'then', 'in', 'dispute', 'was', 'given', 'to', 'B.', 'The', 'operative', 'part', 'of', 'the', 'award', 'stated', 'inter', 'alia', 'that', 'B,', 'first', 'party,', 'and', 'M,', 'the', 'second', 'party,', 'were', 'held', 'entitled', 'to', 'speci-', 'fied', 'shares', 'in', 'the', 'properties', 'in', 'dispute', 'and', 'each', 'had', 'become', 'permanent', 'owner', '(Malik', 'Mustaqil)', 'of', 'his', 'or', 'her', 'share'], tags=[0])

In [5]:
# setup doc2vec
model = gensim.models.doc2vec.Doc2Vec(vector_size=100)

In [6]:
# build the vocabulary
model.build_vocab(train_corpus)

In [7]:
# train the doc2vec on the given corpus
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
my_new_document = train_corpus[0][0]

In [11]:
my_new_document

['The',
 'dispute',
 'was',
 'referred',
 'to',
 'arbitration',
 'and',
 'an',
 'award',
 'was',
 'delivered.',
 'Under',
 'it',
 'the',
 'suit',
 'properties',
 'were',
 'given',
 'to',
 'M',
 'and',
 'the',
 'rest',
 'of',
 'the',
 'estate',
 'then',
 'in',
 'dispute',
 'was',
 'given',
 'to',
 'B.',
 'The',
 'operative',
 'part',
 'of',
 'the',
 'award',
 'stated',
 'inter',
 'alia',
 'that',
 'B,',
 'first',
 'party,',
 'and',
 'M,',
 'the',
 'second',
 'party,',
 'were',
 'held',
 'entitled',
 'to',
 'speci-',
 'fied',
 'shares',
 'in',
 'the',
 'properties',
 'in',
 'dispute',
 'and',
 'each',
 'had',
 'become',
 'permanent',
 'owner',
 '(Malik',
 'Mustaqil)',
 'of',
 'his',
 'or',
 'her',
 'share']

In [12]:
# get the doc2vec embedding of the first document.
model.infer_vector(my_new_document)

array([ 3.3247492e-03, -4.0207054e-03, -2.1467444e-03,  1.9124809e-03,
        5.0939023e-03, -4.8542190e-03,  2.8592285e-03,  3.8582142e-03,
       -4.6573943e-03,  1.5120165e-03,  1.2299906e-03, -4.4296167e-04,
        1.0924609e-03, -2.2577890e-03, -4.5439731e-03, -3.8971163e-03,
       -3.2916775e-03,  7.8172883e-04, -1.0083753e-03, -2.0875195e-03,
        5.5897486e-04, -4.2514731e-03,  3.4082832e-03, -2.8479458e-03,
       -4.1665016e-03,  3.1323400e-03, -5.7045894e-04,  1.3864173e-03,
       -4.7230283e-03,  2.5023960e-03,  5.1227649e-04, -3.8079340e-03,
        4.7655450e-03,  3.7911988e-03,  2.5075473e-04, -1.8405446e-03,
        1.0217074e-03,  2.9051153e-03,  2.0782326e-03,  1.3887638e-03,
        2.4244341e-03, -5.6080328e-04,  3.3532672e-03,  3.1555761e-03,
        1.3592814e-03,  4.0950533e-03, -1.4228240e-03,  4.5004128e-03,
        4.0246188e-03, -4.5079566e-03, -3.7081582e-03,  3.2071138e-03,
        2.4512077e-03,  4.5169448e-03,  4.5031477e-03, -3.4413505e-03,
      