In [1]:
import os
import gensim
# Set file names for train and test data.
# Both corpus files are looked up inside the installed gensim package directory,
# under its test/test_data sub-folder.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [2]:
import smart_open

def read_corpus(fname, tokens_only=False):
    """Lazily read a corpus file, producing one item per line.

    The file is opened through ``smart_open`` with ISO-8859-1 decoding and each
    line is tokenised with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file to read.
        tokens_only: When true, yield the bare token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            zero-based index of the line in the file.

    Yields:
        A list of tokens, or a ``TaggedDocument``, per line of the file.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as corpus_file:
        for line_no, raw_line in enumerate(corpus_file):
            words = gensim.utils.simple_preprocess(raw_line)
            # Training data needs a tag per document; test data is left as plain tokens.
            yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [line_no])

# Materialise both generators into lists so the corpora can be indexed and iterated repeatedly.
# Training documents are tagged with their line index; test documents are plain token lists.
train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [3]:
#train_corpus
#test_corpus

In [4]:
# Create an untrained Doc2Vec model; the vocabulary is built and the model is trained in the next cells.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)


In [5]:
# Build the model's vocabulary from the tagged training documents (must happen before training).
model.build_vocab(train_corpus)

In [6]:
# Sanity check on the vocabulary: look up the stored 'count' attribute of one word.
print(f"Word 'australia' appeared {model.wv.get_vecattr('australia', 'count')} times in the training corpus.")

Word 'australia' appeared 157 times in the training corpus.


In [7]:
# Train on the tagged corpus, reusing the document count and epoch setting already stored on the model.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [8]:
# Infer a vector for an unseen, already-tokenised sentence treated as a single document.
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.20641747 -0.23493986 -0.08681685  0.22472623  0.03386387 -0.01482389
  0.01273163  0.05833967 -0.22107083 -0.08874122  0.20304887 -0.13378382
 -0.08444473 -0.00545625 -0.17427847 -0.12953363  0.13086383  0.32183775
  0.18566275 -0.12808643  0.02608993 -0.06972271  0.13313249 -0.10909095
  0.08539014 -0.06988473 -0.2691232  -0.07086591 -0.18284194 -0.12230893
  0.33700952 -0.0725226   0.2326886   0.09040855  0.21610518  0.00127633
 -0.03189853 -0.26475048 -0.0666502  -0.00374227 -0.14070167 -0.06916639
 -0.01841691 -0.06405579  0.1410443  -0.09293656 -0.01960086 -0.07635256
  0.13503839  0.02712352]


In [9]:
sentence = ['only', 'you', 'can', 'prevent', 'forest', 'fires']
# Infer a vector for each word on its own (as a one-token document), in sentence order,
# then take the element-wise mean of those per-word vectors.
word_vectors = [model.infer_vector([word]) for word in sentence]
vec = sum(word_vectors) / len(sentence)
        
    

In [10]:
# Display the averaged per-word vector computed in the previous cell.
vec

array([-0.2801692 , -0.04768342, -0.10730311, -0.00195926, -0.05536876,
       -0.05625186,  0.04048744,  0.08254982, -0.14010793, -0.09819213,
        0.05985992, -0.16058113, -0.00455775, -0.03691265, -0.11699343,
       -0.00666013,  0.14785573,  0.13959555, -0.00341086, -0.1347454 ,
        0.13135533,  0.04664059,  0.21051414, -0.07372337,  0.16820979,
       -0.03975403, -0.18914346, -0.03071375, -0.11522766, -0.12680624,
        0.09716287, -0.04009215, -0.01156241,  0.09220479, -0.02174257,
        0.07317208,  0.1020329 , -0.07566821,  0.03784954,  0.01147579,
        0.08291984,  0.02483455, -0.09749395, -0.04941022,  0.20798393,
        0.05433756,  0.09135882, -0.10221431,  0.13630413,  0.01286941],
      dtype=float32)

In [11]:
# Self-similarity check: re-infer a vector for every training document and see
# where that document's own tag lands among all trained document vectors.
ranks, second_ranks = [], []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    # Position of the document's own tag in the similarity ordering (0 = best match).
    ranked_tags = [tag for tag, _ in sims]
    rank = ranked_tags.index(doc_id)
    ranks.append(rank)

    # Keep the (tag, similarity) pair that came second for this document.
    second_ranks.append(sims[1])

In [12]:
# One (tag, similarity) pair per training document: the entry ranked second for that document.
second_ranks

[(48, 0.8894424438476562),
 (143, 0.7728396058082581),
 (21, 0.83701491355896),
 (66, 0.662865161895752),
 (33, 0.7096188068389893),
 (84, 0.6342315077781677),
 (17, 0.7935008406639099),
 (15, 0.8390064239501953),
 (48, 0.8982731699943542),
 (8, 0.8013573884963989),
 (264, 0.8030199408531189),
 (188, 0.7259587049484253),
 (26, 0.6875147819519043),
 (292, 0.7641969323158264),
 (277, 0.8702235817909241),
 (39, 0.8304497599601746),
 (55, 0.8592981100082397),
 (6, 0.8544087409973145),
 (2, 0.6973508596420288),
 (40, 0.7800583839416504),
 (150, 0.6638279557228088),
 (2, 0.8808373212814331),
 (179, 0.7735890746116638),
 (56, 0.7653254866600037),
 (188, 0.6759280562400818),
 (10, 0.8551639914512634),
 (12, 0.7728879451751709),
 (15, 0.8890668153762817),
 (56, 0.745710015296936),
 (173, 0.7765018939971924),
 (121, 0.7068313360214233),
 (251, 0.7056076526641846),
 (258, 0.7453994750976562),
 (8, 0.871019184589386),
 (12, 0.6446264982223511),
 (127, 0.7404513359069824),
 (224, 0.4976302683353424

In [13]:
# Inspect a single training example (a TaggedDocument of tokens plus its tag).
train_corpus[4]

TaggedDocument(words=['six', 'midwives', 'have', 'been', 'suspended', 'at', 'wollongong', 'hospital', 'south', 'of', 'sydney', 'for', 'inappropriate', 'use', 'of', 'nitrous', 'oxide', 'during', 'work', 'hours', 'on', 'some', 'occasions', 'while', 'women', 'were', 'in', 'labour', 'the', 'illawarra', 'area', 'health', 'service', 'says', 'that', 'following', 'an', 'investigation', 'of', 'unprofessional', 'conduct', 'further', 'four', 'midwives', 'have', 'been', 'relocated', 'to', 'other', 'areas', 'within', 'the', 'hospital', 'the', 'service', 'chief', 'executive', 'officer', 'tony', 'sherbon', 'says', 'no', 'one', 'was', 'put', 'at', 'risk', 'because', 'other', 'staff', 'not', 'involved', 'in', 'the', 'use', 'of', 'nitrous', 'oxide', 'were', 'able', 'to', 'take', 'over', 'caring', 'for', 'women', 'in', 'labour', 'well', 'we', 're', 'very', 'concerned', 'and', 'the', 'body', 'of', 'midwives', 'to', 'the', 'hospital', 'there', 'are', 'over', 'midwives', 'that', 'work', 'in', 'our', 'servic

In [14]:
import collections

# Tally how often each rank occurred. Rank 0 means a document's own tag was the
# top match for its re-inferred vector.
counter = collections.Counter(ranks)
print(counter)

Counter({0: 291, 1: 9})


In [15]:
# NOTE: doc_id is not set in this cell; it still holds the value left by the last
# iteration of the ranking loop above, so this prints the final training document.
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [16]:
# Header line for the similarity report; %s renders the model's string representation.
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)


SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):



In [17]:
# `sims` still holds the ranking computed for the last document of the ranking loop above.
# Report the entries at the top, second place, middle and bottom of that ranking.
checkpoints = [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in checkpoints:
    matched = sims[index]  # (tag, similarity) pair
    matched_text = ' '.join(train_corpus[matched[0]].words)
    print(u'%s %s: «%s»\n' % (label, matched, matched_text))


MOST (299, 0.9433698058128357): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with h

In [18]:
# Choose one training document uniformly at random
import random
doc_id = random.randrange(len(train_corpus))

# Show it next to the runner-up (second-most-similar) entry recorded for it earlier
train_text = ' '.join(train_corpus[doc_id].words)
print('Train Document ({}): «{}»\n'.format(doc_id, train_text))
sim_id = second_ranks[doc_id]  # a (tag, similarity) pair, not a bare id
similar_text = ' '.join(train_corpus[sim_id[0]].words)
print('Similar Document {}: «{}»\n'.format(sim_id, similar_text))

Train Document (92): «federal labor mp carmen lawrence says there is lot of momentum within the party for the alp to change its policy on asylum seekers dr lawrence says maintaining the policy will lose the sympathy of some sections of the community who have thought very carefully about the issues she says it will also annoy others who supported the coalition stance and see the alp as compromised the member for fremantle says labor did not suffer in the polls after it differentiated itself from the coalition in and we committed to native title we refused the extinguishment options that howard put forward she said we indicated our willingness to give an official apology on behalf of the nation to the stolen generations and we didn lose single vote in fact we came the nearest to winning an election after having been nearly obliterated in»

Similar Document (64, 0.7649794220924377): «high profile church leader says the governor general must clarify his statement defending his handling of 

In [19]:
# Draw a random test document and infer its vector with the trained model
doc_id = random.randrange(len(test_corpus))
test_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(test_tokens)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Print the test document, then the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_tokens)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    tag_and_score = sims[index]
    neighbour_text = ' '.join(train_corpus[tag_and_score[0]].words)
    print(u'%s %s: «%s»\n' % (label, tag_and_score, neighbour_text))

Test Document (5): «gay former student of melbourne christian school is taking legal action under equal opportunity legislation claiming the school discriminated against him because of his sexuality tim alleged staff member at hillcrest christian college in berwick told him he had the devil in him and constant bullying by students prompted the principal to tell him to hide his sexuality he left the school several weeks ago and is continuing year by distance education after he said homophobic bullies threw rocks at his head spat on him called him names and slashed his belongings»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (23, 0.5700494050979614): «americans fears about airplane security continue to increase after man made it through two separate flights with loaded gun in his carry on luggage the man was finally stopped before boarding third plane in memphis the man had travelled from florida to atlanta and then atlanta to memphis he was attempting 

### Challenge: Use the senator speeches in the folder 105-extracted-date and use cosine similarity to find which senator's speech is closest to Senator Biden's. Use sen105kh_fix.csv and/or Wikipedia to validate your findings (i.e., check whether the senators with the most similar speeches are from the same state and/or party). Describe your findings. (Compare with the outcome you got using cosine similarity.)

In [20]:
# Reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html