In [1]:
import os
import gensim
# Locate the Lee corpus files inside the installed gensim package's bundled
# test data: `lee_background.cor` is used for training, `lee.cor` for testing.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [2]:
import smart_open

def read_corpus(fname, tokens_only=False):
    """Stream one pre-processed document per line of ``fname``.

    The file is opened through ``smart_open`` with ISO-8859-1 decoding and
    each line is tokenized with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file; every line is treated as one document.
        tokens_only: When true, yield the bare token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            zero-based line number.

    Yields:
        A list of tokens, or a ``TaggedDocument`` wrapping that list.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as corpus_file:
        for line_no, raw_line in enumerate(corpus_file):
            words = gensim.utils.simple_preprocess(raw_line)
            # Training documents carry their line number as the tag; test
            # documents are handed back as plain token lists.
            yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [line_no])

# Materialize both generators into lists so the corpora can be indexed and
# iterated more than once. Training documents are tagged; test documents are
# plain token lists.
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [3]:
#train_corpus
#test_corpus

In [4]:
# Doc2Vec hyperparameters: 50-dimensional document vectors, words seen fewer
# than 2 times are left out of the vocabulary, and training makes 40 passes
# over the corpus.
# NOTE(review): no `seed` is passed, so vectors and similarity scores will
# presumably differ between runs — confirm whether reproducibility matters.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)


In [5]:
# Scan the training corpus to build the model's vocabulary before training.
model.build_vocab(train_corpus)

In [6]:
# Sanity-check the vocabulary by reading the stored 'count' attribute of one word.
print(f"Word 'australia' appeared {model.wv.get_vecattr('australia', 'count')} times in the training corpus.")

Word 'australia' appeared 157 times in the training corpus.


In [7]:
# Train on the tagged corpus, reusing the document count and epoch setting
# already stored on the model.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [8]:
# Infer a vector for a new, already-tokenized document and show it.
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.31976154 -0.26476884 -0.16041218  0.27349195 -0.01096832  0.05571494
  0.03510425  0.03197565 -0.24266955 -0.02463971  0.18618184 -0.12118428
 -0.05194681 -0.06369919 -0.05481533 -0.16708986  0.09989088  0.14317323
  0.16305943 -0.05766478  0.01440293  0.03208936  0.10748866  0.08495447
 -0.04660195  0.00640279 -0.22751215 -0.01198588 -0.1814076  -0.06214438
  0.41534087  0.0075723   0.09895925  0.13051635  0.25971898  0.09518899
  0.02625178 -0.12171331 -0.11473399 -0.08249989 -0.02871951  0.0420239
 -0.13584407 -0.09942883  0.10453323  0.00998436 -0.05356161 -0.04830276
  0.08323494  0.05484739]


In [9]:
# Alternative sentence embedding: infer a vector for each word on its own
# (as a one-token document) and take the element-wise mean of those vectors.
sentence = ['only', 'you', 'can', 'prevent', 'forest', 'fires']
vec = sum(model.infer_vector([word]) for word in sentence) / len(sentence)
        
    

In [10]:
# Display the averaged per-word vector.
vec

array([-0.26727238, -0.02400274, -0.11539656,  0.0710511 , -0.03848986,
       -0.09373647,  0.09259338,  0.11842092, -0.1240021 , -0.09933812,
        0.0754253 , -0.2041409 , -0.00184114, -0.02438926, -0.13542247,
       -0.04856223,  0.13091236,  0.08527001,  0.00421309, -0.09346253,
        0.13401626,  0.07026599,  0.24547905,  0.00699505,  0.10448146,
       -0.02037976, -0.13552496, -0.00907391, -0.1268513 , -0.1397889 ,
        0.10850052, -0.00817207, -0.063591  ,  0.14053357, -0.03743366,
        0.01460632,  0.07227769, -0.06326079, -0.04836645,  0.01952797,
        0.10248665,  0.01550332, -0.06729565, -0.06067989,  0.17969568,
        0.06117888,  0.06062019, -0.1065082 ,  0.13902348,  0.08265901],
      dtype=float32)

In [11]:
# Self-similarity check: re-infer a vector for every training document and
# record where the document's own tag lands in the similarity ranking.
# `doc_id`, `inferred_vector` and `sims` are deliberately left as notebook
# globals; later cells read the values from the final iteration.
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    ranked_ids = [candidate_id for candidate_id, _ in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)
    # Keep the runner-up (doc_id, similarity) pair for this document.
    second_ranks.append(sims[1])

In [12]:
# Each entry is the (doc_id, similarity) pair that ranked second for the
# training document at that list position.
second_ranks

[(48, 0.8950256109237671),
 (143, 0.7655128836631775),
 (21, 0.8799075484275818),
 (57, 0.6846529245376587),
 (33, 0.7159869074821472),
 (218, 0.6458190083503723),
 (17, 0.829204261302948),
 (25, 0.805980920791626),
 (33, 0.8963625431060791),
 (8, 0.7949551939964294),
 (264, 0.789124608039856),
 (188, 0.7263126969337463),
 (26, 0.6898002624511719),
 (292, 0.771701991558075),
 (277, 0.8778758645057678),
 (27, 0.8312704563140869),
 (59, 0.857148289680481),
 (6, 0.8438591361045837),
 (71, 0.694186270236969),
 (40, 0.7896546125411987),
 (150, 0.6165494918823242),
 (43, 0.8830140829086304),
 (179, 0.809225857257843),
 (56, 0.750128984451294),
 (62, 0.6348386406898499),
 (10, 0.8349882960319519),
 (12, 0.7767967581748962),
 (15, 0.8552736043930054),
 (56, 0.7624508738517761),
 (173, 0.8078106641769409),
 (149, 0.6805684566497803),
 (251, 0.7582486867904663),
 (21, 0.7382551431655884),
 (8, 0.8826751708984375),
 (12, 0.659411609172821),
 (127, 0.7091203927993774),
 (186, 0.5089327096939087),


In [13]:
# Inspect a single training example (a TaggedDocument).
train_corpus[4]

TaggedDocument(words=['six', 'midwives', 'have', 'been', 'suspended', 'at', 'wollongong', 'hospital', 'south', 'of', 'sydney', 'for', 'inappropriate', 'use', 'of', 'nitrous', 'oxide', 'during', 'work', 'hours', 'on', 'some', 'occasions', 'while', 'women', 'were', 'in', 'labour', 'the', 'illawarra', 'area', 'health', 'service', 'says', 'that', 'following', 'an', 'investigation', 'of', 'unprofessional', 'conduct', 'further', 'four', 'midwives', 'have', 'been', 'relocated', 'to', 'other', 'areas', 'within', 'the', 'hospital', 'the', 'service', 'chief', 'executive', 'officer', 'tony', 'sherbon', 'says', 'no', 'one', 'was', 'put', 'at', 'risk', 'because', 'other', 'staff', 'not', 'involved', 'in', 'the', 'use', 'of', 'nitrous', 'oxide', 'were', 'able', 'to', 'take', 'over', 'caring', 'for', 'women', 'in', 'labour', 'well', 'we', 're', 'very', 'concerned', 'and', 'the', 'body', 'of', 'midwives', 'to', 'the', 'hospital', 'there', 'are', 'over', 'midwives', 'that', 'work', 'in', 'our', 'servic

In [14]:
import collections

# Tally how often each training document's own tag landed at each rank when
# its re-inferred vector was compared against all trained document vectors;
# rank 0 means the document was most similar to itself.
counter = collections.Counter(ranks)
print(counter)

Counter({0: 292, 1: 8})


In [15]:
# `doc_id` is not set in this cell: it is whatever an earlier cell last left
# bound (in the recorded run, 299 — the final index of the self-similarity loop).
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [16]:
# Print the model's summary string as a header for the similarity listing.
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)


SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):



In [17]:
# `sims` is reused from the last `most_similar` call executed before this
# cell. Index into it to show the top, runner-up, middle and last entries
# together with the text of the matching training document.
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))


MOST (299, 0.957938551902771): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with hi

In [18]:
# Pick a random training document. No vector is inferred here; the
# precomputed `second_ranks` list is looked up instead.
# `random.randint` is inclusive on both ends, hence the `- 1`.
import random
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document.
# NOTE: despite its name, `sim_id` is a (doc_id, similarity) tuple taken from
# `second_ranks`, so `sim_id[0]` is the actual document index.
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (75): «us president george bush has marked the th day of the campaign against terrorism by calling on his allies to freeze the assets of two non us organisations suspected of supporting terrorism one of the groups is based in kashmir the other is alleged to have helped al qaeda develop nuclear weapons president bush says former scientist at pakistan atomic program had established group called utn after assisting osama bin laden network develop nuclear bomb utn claims to serve the hungry and needy of afghanistan but it was the utn that provided information about nuclear weapons to al qaeda he said he also linked kashmiri group to the attack on the indian parliament last week lat is an extremist group based in kashmir and is stateless sponsor of terrorism he said mr bush says the international financial crackdown has frozen million in terrorist assets»

Similar Document (160, 0.7649907469749451): «french moroccan man has been charged in the united states with conspiracy in

In [19]:
# Pick a random document from the test corpus and infer a vector from the model.
# Test documents are plain token lists, so they are passed to `infer_vector`
# directly, then ranked against every trained document vector.
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))


SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (75, 0.7155953645706177): «us president george bush has marked the th day of the campaign against terrorism by calling on his allies to freeze the assets of two non us organisations suspected of supporting terrorism one of the groups is based in kashmir the other is alleged to have helped al qaeda develop nuclear weapons president bush says former scientist at pakistan atomic program had established group called utn after assisting osama bin laden network develop nuclear bomb utn claims to serve the hungry and needy of afghanistan but it was the utn that provided information about nuclear weapons to al qaeda he said he also linked kashmiri group to the attack on the indian parliament last week lat is an extremist group based in kashmir and is stateless sponsor of terrorism he said mr bush says the international financial crackdown has frozen million in terrorist assets»

MEDIAN (181, 0.3200138807296753): «t

### Challenge: Use the senator speeches in the folder 105-extracted-date and use doc2vec to find which senator's speech is closest to Senator Biden's. Use sen105kh_fix.csv and/or Wikipedia to validate your findings (i.e., check whether the most similar speeches come from senators of the same state and/or party). Describe your findings. Compare with the outcome you got/will get using cosine similarity.

In [20]:
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html