In [1]:
import pickle
import os

import doc2vec
import scipy.spatial.distance

import rank_metrics
import numpy as np

In [2]:
# Cancel optimization complaints by tensorflow.
# NOTE(review): doc2vec is already imported in the cell above; if that import
# pulls in TensorFlow, this setting presumably comes too late — confirm.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '2'})

In [3]:
# Folder with the raw articles; its name is also embedded in the file name of
# the pickled relevance labels.
raw_data_folder = 'original_articles'
rel_labels_fname = 'relevance_labels_{}.p'.format(raw_data_folder)

# Location used when saving / restoring the Doc2Vec model.
doc2vec_model_folder = 'd2v.model/'

# Metric name used when comparing document embeddings.
distance_measure = 'cosine'

In [42]:
"""
Get relevance labels
"""

# Compute the labels only when the pickle file is missing; otherwise the file
# already on disk is reused as-is.
labels_cached = os.path.isfile(rel_labels_fname)
if not labels_cached:
    print('Computing relevance labels using bm25... (this takes a while, but should only be done once..)')
    # Imported lazily: only needed on the run that has to compute the labels.
    import prepare_relevance_labels

    prepare_relevance_labels.prepare_relevance_labels(folder=raw_data_folder,
                                                      output_fname=rel_labels_fname)

# NOTE: pickle.load can execute arbitrary code; only load label files that
# were produced locally by the step above.
with open(rel_labels_fname, 'rb') as f:
    (source_dict,
     docs,
     doc_names,
     tokenized,
     bm25_scores,
     sorted_bm25_indices) = pickle.load(f)

In [55]:
"""
Train paragraph vectors
"""

# Restoring a saved model does not work, so the model is retrained on every
# run. Once restoring is fixed, set this flag to False to reuse a model that
# was previously saved in `doc2vec_model_folder`.
force_retrain = True

if force_retrain or not os.path.exists(doc2vec_model_folder):
    print('Initializing and training paragraph vectors')
    d2v = doc2vec.Doc2Vec(batch_size=128,
                          window_size=8,
                          concat=True,
                          architecture='pvdm',
                          embedding_size_w=128,  # word embedding size
                          embedding_size_d=128,  # document embedding size
                          vocabulary_size=50000,
                          document_size=len(docs),  # one embedding per document
                          loss_type='sampled_softmax_loss',
                          n_neg_samples=64,
                          optimize='Adagrad',
                          learning_rate=1.0,
                          n_steps=100001
                          )

    d2v.fit(tokenized)
    # Saved even though restoring is currently broken, so the model folder
    # exists once restoring is repaired.
    d2v.save(doc2vec_model_folder)
else:
    d2v = doc2vec.Doc2Vec.restore(doc2vec_model_folder)

Initializing and training paragraph vectors
Initialized
Average loss at step 0: 8.167775
Average loss at step 2000: 46.125775
Average loss at step 4000: 16.784407
Average loss at step 6000: 8.508686
Average loss at step 8000: 5.286444
Average loss at step 10000: 4.072209
Average loss at step 12000: 2.957042
Average loss at step 14000: 2.510528
Average loss at step 16000: 2.194513
Average loss at step 18000: 1.925601
Average loss at step 20000: 1.723574
Average loss at step 22000: 1.474024
Average loss at step 24000: 1.322927
Average loss at step 26000: 1.250167
Average loss at step 28000: 1.187439
Average loss at step 30000: 1.101452
Average loss at step 32000: 0.994075
Average loss at step 34000: 0.939118
Average loss at step 36000: 0.908325
Average loss at step 38000: 0.884198
Average loss at step 40000: 0.804825
Average loss at step 42000: 0.757928
Average loss at step 44000: 0.738213
Average loss at step 46000: 0.738690
Average loss at step 48000: 0.703040
Average loss at step 5000

In [56]:
"""
Evaluate
"""
# Number of top-ranked documents compared between the BM25 and Doc2Vec rankings.
top_k = 10

# Square matrix of pairwise distances between all document embeddings;
# row i holds the distances from document i to every document.
distances = scipy.spatial.distance.squareform(
    scipy.spatial.distance.pdist(d2v.doc_embeddings, metric=distance_measure))

results = []
hits = []
sorted_d2v_distance_indices = []
for doc_index, distance in enumerate(distances):
    # Document indices ordered from closest to farthest. A stable sort keeps
    # the lower index first on ties (as the previous sorted() call did); NaN
    # distances, if any, end up last.
    # NOTE(review): the diagonal of the squareform matrix is 0, so a document
    # normally ranks itself first — confirm the BM25 ranking treats the query
    # document the same way.
    sorted_distance_indices = np.argsort(distance, kind='mergesort').tolist()

    # 1 where a top-k Doc2Vec neighbour is also a top-k BM25 neighbour, else 0.
    relevance_set = set(sorted_bm25_indices[doc_index][:top_k])
    hit = np.array([ix in relevance_set for ix in sorted_distance_indices[:top_k]], dtype=int)

    sorted_d2v_distance_indices.append(sorted_distance_indices)
    hits.append(hit)
    results.append({
        'average_precision': rank_metrics.average_precision(hit),
        # Key name kept as-is for the cells that read `results`; it matches
        # the default top_k of 10.
        'ndcg_at_10': rank_metrics.ndcg_at_k(hit, top_k),
    })

print("MAP: ", rank_metrics.mean_average_precision(hits))
print("MRR: ", rank_metrics.mean_reciprocal_rank(hits))

MAP:  0.650890293018
MRR:  0.707144396718


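Recorded results per embedding size (NDIMS = value used for `embedding_size_w` and `embedding_size_d`):

NDIMS = 64

MAP:  0.646
MRR:  0.711

NDIMS = 128

MAP:  0.651
MRR:  0.707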

In [13]:
import pandas as pd

In [47]:
# One row per evaluated document, one column per metric stored in `results`.
df = pd.DataFrame(data=results)
print(df)

      average_precision  ndcg_at_10
0              1.000000    1.000000
1              1.000000    1.000000
2              0.000000    0.000000
3              0.835000    0.916443
4              1.000000    1.000000
5              1.000000    1.000000
6              1.000000    1.000000
7              1.000000    1.000000
8              1.000000    1.000000
9              0.700000    0.715338
10             0.000000    0.000000
11             0.600000    0.650515
12             1.000000    1.000000
13             1.000000    1.000000
14             0.666667    0.717181
15             0.755556    0.783604
16             0.833333    0.815465
17             0.750000    0.750000
18             0.750000    0.750000
19             0.750000    0.868811
20             1.000000    1.000000
21             1.000000    1.000000
22             0.750000    0.750000
23             0.142857    0.356207
24             0.750000    0.750000
25             1.000000    1.000000
26             0.622024    0

In [57]:
def _print_doc_previews(indices, preview_chars):
    """Print each document's index, a preview of its text and a separator line.

    The preview is cut at `preview_chars` characters; '...' is appended only
    when the document text was actually shortened.
    """
    for i in indices:
        text = docs[i]
        preview = text[:preview_chars]
        if len(text) > preview_chars:
            preview += '...'
        print(i)
        print(preview)
        print(25 * '-')


def print_result_difference(ix, number_of_articles=2, preview_chars=300):
    """Print a source document next to its top BM25 and Doc2Vec neighbours.

    Reads the notebook-level `docs`, `sorted_bm25_indices` and
    `sorted_d2v_distance_indices`.

    Parameters
    ----------
    ix : int
        Index of the source document; its text is printed in full.
    number_of_articles : int
        How many entries to show from the head of each ranking.
    preview_chars : int
        Maximum number of characters shown for each listed document.
    """
    print("Source document:")
    print(docs[ix], '\nMost similar (BM25) documents:\n' + 50 * '-')
    _print_doc_previews(sorted_bm25_indices[ix][:number_of_articles], preview_chars)
    print('\nClosest Doc2Vec documents:\n' + 50 * '-')
    _print_doc_previews(sorted_d2v_distance_indices[ix][:number_of_articles], preview_chars)
# Show the source text and top-ranked neighbours for the document at index 300.
print_result_difference(ix=300)

Source document:
Article 37
Deduction of intangible assets
Institutions shall determine the amount of intangible assets to
be deducted in accordance with the following:
(a) the amount to be deducted shall be reduced by the amount
of associated deferred tax liabilities that would be extinguished if the intangible assets became impaired or were
derecognised under the applicable accounting framework;
(b)	the amount to be deducted shall include goodwill included
in the valuation of significant investments of the institution.
J1) OJ L 331, 15.12.2010, p. 48.
27.6.2013
Official Journal of the European Union
 
Most similar (BM25) documents:
--------------------------------------------------
333
﻿Article 4
Definitions
4.1 For the purposes of this Regulation, the following definitions shall apply:
4.1.1 'credit institution' means an undertaking the business of which is to take deposits or other repayable funds from the public and to grant credits for its own account;
4.1.2 'investment firm' ...