In [1]:
import logging
# Show timestamped INFO-level log records (gensim reports its progress this way)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
import os
import gensim
# Resolve the train and test corpus files inside gensim's test/test_data directory
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file, lee_test_file = (
    os.path.join(test_data_dir, corpus_name)
    for corpus_name in ('lee_background.cor', 'lee.cor')
)

In [5]:
import smart_open

def read_corpus(fname, tokens_only=False):
    """Lazily tokenise a one-document-per-line corpus file.

    The file is opened through ``smart_open`` with ISO-8859-1 decoding and
    every line is passed through ``gensim.utils.simple_preprocess``.

    Args:
        fname: Path of the corpus file to read.
        tokens_only: When true, yield the bare token list for each line.
            Otherwise yield a ``TaggedDocument`` whose single tag is the
            zero-based index of the line in the file.

    Yields:
        One item per line of ``fname``, in file order.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as corpus_file:
        for doc_id, raw_line in enumerate(corpus_file):
            words = gensim.utils.simple_preprocess(raw_line)
            # Training documents carry a tag; test documents are plain token lists
            yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [doc_id])

# Materialise the generators into lists: train_corpus is iterated again by
# later cells, and a generator would be exhausted after a single pass.
train_corpus = [*read_corpus(lee_train_file)]
test_corpus = [*read_corpus(lee_test_file, tokens_only=True)]

In [6]:
# Sanity check: show the first two tagged training documents
print(train_corpus[:2])

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [3]:
# Untrained Doc2Vec model: 50-dimensional vectors, min_count of 2, 40 epochs
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=40,
)

2021-05-18 16:51:23,614 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3)', 'datetime': '2021-05-18T16:51:23.599281', 'gensim': '4.0.1', 'python': '3.9.5 (default, May 12 2021, 17:14:51) \n[GCC 10.2.0]', 'platform': 'Linux-5.10.36-1-lts-x86_64-with-glibc2.33', 'event': 'created'}


In [7]:
# Scan the training corpus once to collect the word vocabulary and document tags
model.build_vocab(train_corpus)

2021-05-18 16:52:21,906 : INFO : collecting all words and their counts
2021-05-18 16:52:21,909 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-05-18 16:52:21,937 : INFO : collected 6981 word types and 300 unique tags from a corpus of 300 examples and 58152 words
2021-05-18 16:52:21,938 : INFO : Creating a fresh vocabulary
2021-05-18 16:52:21,953 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 3955 unique words (56.653774530869505%% of original 6981, drops 3026)', 'datetime': '2021-05-18T16:52:21.953840', 'gensim': '4.0.1', 'python': '3.9.5 (default, May 12 2021, 17:14:51) \n[GCC 10.2.0]', 'platform': 'Linux-5.10.36-1-lts-x86_64-with-glibc2.33', 'event': 'prepare_vocab'}
2021-05-18 16:52:21,954 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 55126 word corpus (94.79639565277205%% of original 58152, drops 3026)', 'datetime': '2021-05-18T16:52:21.954418', 'gensim': '4.0.1', 'python': '3.9.5 (default, May 1

In [8]:
# Train on the tagged corpus; the example count and epoch count are read back
# from the model itself rather than repeated here.
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2021-05-18 17:16:59,471 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 3955 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5', 'datetime': '2021-05-18T17:16:59.471609', 'gensim': '4.0.1', 'python': '3.9.5 (default, May 12 2021, 17:14:51) \n[GCC 10.2.0]', 'platform': 'Linux-5.10.36-1-lts-x86_64-with-glibc2.33', 'event': 'train'}
2021-05-18 17:16:59,503 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-18 17:16:59,505 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-18 17:16:59,506 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-18 17:16:59,506 : INFO : EPOCH - 1 : training on 58152 raw words (42641 effective words) took 0.0s, 1298071 effective words/s
2021-05-18 17:16:59,536 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-18 17:16:59,537 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-18 17:16:59,540 :

2021-05-18 17:17:00,179 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-18 17:17:00,182 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-18 17:17:00,182 : INFO : EPOCH - 20 : training on 58152 raw words (42716 effective words) took 0.0s, 1113213 effective words/s
2021-05-18 17:17:00,211 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-18 17:17:00,211 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-18 17:17:00,212 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-18 17:17:00,212 : INFO : EPOCH - 21 : training on 58152 raw words (42664 effective words) took 0.0s, 1508276 effective words/s
2021-05-18 17:17:00,241 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-18 17:17:00,242 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-18 17:17:00,243 : INFO : worker thread finished; awaiting finish of 0 more threads
20

2021-05-18 17:17:00,902 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-18 17:17:00,903 : INFO : EPOCH - 40 : training on 58152 raw words (42810 effective words) took 0.0s, 1259764 effective words/s
2021-05-18 17:17:00,903 : INFO : Doc2Vec lifecycle event {'msg': 'training on 2326080 raw words (1707942 effective words) took 1.4s, 1193430 effective words/s', 'datetime': '2021-05-18T17:17:00.903429', 'gensim': '4.0.1', 'python': '3.9.5 (default, May 12 2021, 17:14:51) \n[GCC 10.2.0]', 'platform': 'Linux-5.10.36-1-lts-x86_64-with-glibc2.33', 'event': 'train'}


In [9]:
# Infer a vector for an already-tokenised example document and print it
vector = model.infer_vector(
    ['only', 'you', 'can', 'prevent', 'forest', 'fires']
)
print(vector)

[-0.06254477 -0.36724    -0.1641068   0.2995792  -0.02758917  0.00927021
  0.03412568  0.06726772 -0.24907838 -0.19675067  0.1764454  -0.05799392
  0.14722575 -0.01547862 -0.15040639 -0.12969983  0.20082991  0.1883212
  0.15750109 -0.11916596  0.07106131  0.00078256  0.14093363  0.06893362
 -0.04631678  0.00163593 -0.26148     0.00263071 -0.08587924 -0.06802563
  0.32831028  0.01964421  0.0878148   0.08372569  0.20954423  0.02907191
  0.02235077 -0.33345962 -0.1809128   0.00408299  0.04636257  0.11377327
 -0.01499281 -0.08249711  0.19684558  0.01742872 -0.0421395  -0.07634279
  0.14602993 -0.01136561]


In [22]:
def calculate_section_vecs(paper):
    """Lazily yield a ``(heading, vector)`` pair for each section of a paper.

    Args:
        paper: Mapping with a ``'metadata'`` entry whose ``'sections'`` value
            is an iterable of mappings, each providing ``'text'`` and
            ``'heading'``.

    Yields:
        Tuples of the section's ``'heading'`` value and the vector that the
        notebook-level ``model`` infers from the section's preprocessed text.
    """
    for entry in paper['metadata']['sections']:
        # Tokenise the section text and embed it with the trained global model
        section_vec = model.infer_vector(gensim.utils.simple_preprocess(entry['text']))
        yield entry['heading'], section_vec

In [26]:
import os
import json

# requires json.zip extracted here
workdir = 'data/json/'

# Demonstration: compute section vectors for a single paper from workdir.
# Entries are sorted because os.listdir() returns names in arbitrary order,
# which would otherwise let the chosen paper differ between runs or machines.
for file in sorted(os.listdir(workdir)):
    if not file.endswith('.json'):
        continue

    # Open the entry currently being iterated. The path must be built from the
    # loop variable; any other name only resolves if it happens to be left over
    # in the kernel, and fails with NameError after Restart & Run All.
    with open(os.path.join(workdir, file), 'r') as f:
        paper = json.load(f)

    # The file is closed before inference; only the parsed dict is needed
    results = list(calculate_section_vecs(paper))
    print(results)

    # One paper is enough for this demonstration
    break

[(None, array([-0.38778764, -0.12247083, -0.29450744,  0.19985287, -0.5121317 ,
       -0.24861136,  0.27444324,  0.50417316, -0.5094555 , -0.42838907,
        0.3081453 ,  0.05583055,  0.06028468,  0.22066891, -0.2727069 ,
        0.27892485,  0.50654536,  0.47816873, -0.1375556 , -0.39735243,
        0.2050223 ,  0.08945337,  0.45610517,  0.0589187 ,  0.29514197,
       -0.14677612, -0.17572084,  0.11952075, -0.40394396, -0.04899392,
        0.64434224,  0.14482602,  0.13992088,  0.20503019,  0.15900853,
        0.12069197,  0.36746716, -0.37362576,  0.11920392, -0.2283158 ,
       -0.11095075,  0.10898604, -0.14514354, -0.3348022 ,  0.2153846 ,
        0.07488101,  0.06118841, -0.37967762,  0.59673464,  0.3301335 ],
      dtype=float32)), ('1 Introduction', array([-2.4160764 , -0.74481183, -0.77510464,  0.09194495, -0.03807896,
        1.0588412 ,  0.6663507 , -0.6561951 ,  0.20642145,  1.0805727 ,
       -0.83305454, -1.8051146 ,  0.6624961 , -0.7236418 ,  0.31298396,
        0.048