In [1]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Training
- copy-paste D2V code here. Try other models later.

In [2]:
import pandas as pd
# Read every column as text; reset_index() adds an 'index' column, which
# later cells use as the per-document id.
df = pd.read_csv('data/train.csv', dtype=str).reset_index()
df.head()

Unnamed: 0,index,venue,title,abstract,year,articledate,doi,pmid,pii,pmc,tiabs,covid,pid,authors,journal,journal-short,pubdate,issns,publisher
0,0,BMC research notes,The prevalence and correlates of social phobia...,\nSocial phobia is highly prevalent among univ...,2019,2019-07-19,10.1186/s13104-019-4482-y,31324266,10.1186/s13104-019-4482-y,PMC6642571,The prevalence and correlates of social phobia...,0,,,,,,,
1,1,Journal of chemical information and modeling,"Off-pocket Activity Cliffs, a Puzzling Facet o...",\nWhile accurate quantitative prediction of li...,2019,2019-12-02,10.1021/acs.jcim.9b00731,31790251,,,"Off-pocket Activity Cliffs, a Puzzling Facet o...",0,,,,,,,
2,2,"Methods in molecular biology (Clifton, N.J.)",Isolation and Characterization of Plant Metabo...,\nPseudomonas syringae is a bacterium that can...,2019,,10.1007/978-1-4939-9458-8_13,31041769,,,Isolation and Characterization of Plant Metabo...,0,,,,,,,
3,3,Pharmacogenomics,Pharmacogenetic testing in primary care practi...,\nAim: Although health authorities have set ph...,2019,,10.2217/pgs-2019-0004,31190623,,,Pharmacogenetic testing in primary care practi...,0,,,,,,,
4,4,Journal of cancer education : the official jou...,Correction to: Educational Opportunities for D...,\nThe original version of this article unfortu...,2019,,10.1007/s13187-019-01616-0,31515717,10.1007/s13187-019-01616-0,,Correction to: Educational Opportunities for D...,0,,,,,,,


# Pre-process
Different algorithms and different text corpora call for different kinds of pre-processing. What we are doing here is trying to make the text as easy as possible for the computer to 'understand' without destroying any valuable features in the data.

In [3]:
from gensim.models import TfidfModel, Phrases
from gensim.models.phrases import Phraser
from gensim.utils import deaccent
from gensim.parsing.preprocessing import strip_short, strip_punctuation, remove_stopwords, strip_multiple_whitespaces, stem_text
from gensim.parsing import preprocess_string
from gensim import corpora
from gensim import utils

## Ngramming
We will start by building Ngrammers. These are very simple models which learn when words tend to appear together. The typical example here is 'New York', which is 2 words, but which we treat as one; it is therefore a 'bigram'. We might also be interested in 'New York City', which is 3 words that often appear together, so that's a 'trigram'.

We will run 2 processes where we first search our text for bigrams and then trigrams. Essentially, it's just the same process run twice.

In [4]:
# Where the trained phrase detectors are written.
bigram_phraserpath = 'models/bigram_d2v_pubmed'
trigram_phraserpath = 'models/trigram_d2v_pubmed'

# Text-cleaning steps handed to preprocess_string.
# strip_short is currently disabled.
# TODO: decide whether to stem or lemmatize.
CUSTOM_FILTERS = [
    lambda text: text.lower(),
    deaccent,
    strip_punctuation,
    strip_multiple_whitespaces,
]

# define Iterator

# retrieve each text entry
def extract_text():
    """Yield ``(doi, tokens)`` for each row of the module-level ``df``.

    ``tokens`` is the row's 'tiabs' text passed through ``preprocess_string``
    with ``CUSTOM_FILTERS``. Both the DOI and the text are coerced with
    ``str()`` before use.
    """
    for _, record in df.iterrows():
        doi = str(record['doi'])
        tokens = preprocess_string(str(record['tiabs']), CUSTOM_FILTERS)
        yield doi, tokens


# Pass 1: learn bigrams from the pre-processed token lists.
print()
print('Training bigram detection')
documents = extract_text()
phrases = Phrases(
    (tokens for _doi, tokens in documents),
    threshold=10,
    common_terms=["of", "with", "without", "and", "or", "the", "a"],
)
bigram = Phraser(phrases)
bigram.save(bigram_phraserpath)


# Pass 2: same procedure, but each document is first run through the bigram
# model, so the phrases learned here are built on top of the bigram output.
# NOTE(review): unlike pass 1, no common_terms are given here - confirm that is intended.
print()
print('Training trigram detection')
documents = extract_text()
phrases = Phrases(
    (bigram[tokens] for _doi, tokens in documents),
    threshold=10,
)
trigram = Phraser(phrases)
trigram.save(trigram_phraserpath)
print('trigrams trained and saved')


Training bigram detection

Training trigram detection
trigrams trained and saved


In [5]:
# pre process for d2v
from gensim.utils import deaccent
from gensim.parsing.preprocessing import strip_short, strip_punctuation, remove_stopwords, strip_multiple_whitespaces, stem_text
from gensim.parsing import preprocess_string
from gensim.models import Phrases


# The files at these paths were written with Phraser.save() in the n-gram
# training cell, so read them back with the matching class instead of Phrases.
from gensim.models.phrases import Phraser

bigram = Phraser.load(bigram_phraserpath)
trigram = Phraser.load(trigram_phraserpath)

def pre_d2v_search(s, bigram, trigram):
    """Pre-process one text for doc2vec and merge detected phrases.

    The text is passed through ``preprocess_string`` with four filters
    (lower-casing, ``deaccent``, ``strip_punctuation``,
    ``strip_multiple_whitespaces``); the result is then indexed into
    ``bigram`` and that output into ``trigram``.

    Parameters
    ----------
    s : text to process.
    bigram, trigram : phrase models, applied in that order.

    Returns
    -------
    Whatever ``trigram[...]`` returns for the processed text.
    """
    filters = [
        lambda text: text.lower(),
        deaccent,
        strip_punctuation,
        strip_multiple_whitespaces,
    ]
    tokens = preprocess_string(s, filters=filters)
    with_bigrams = bigram[tokens]
    return trigram[with_bigrams]

# Train

In [6]:
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.summarization.textcleaner import get_sentences
from gensim.utils import tokenize
import multiprocessing

# Preparing the corpus

In [7]:
# df = df.reset_index()

In [8]:
df.shape

(1022452, 20)

This class will allow us to load our data into the doc2vec model.

In [9]:
class ADTaggedDocument(object):
    """Iterable view of the module-level ``df`` as ``TaggedDocument`` objects.

    Every call to ``__iter__`` walks ``df`` from the start, so one instance
    can be iterated more than once. Each document is tagged with the string
    form of the row's 'index' column; its words come from running the row's
    'tiabs' text through ``pre_d2v_search`` with the module-level ``bigram``
    and ``trigram`` models.
    """

    def __iter__(self):
        for _, record in df.iterrows():
            tag = str(record['index'])
            words = pre_d2v_search(str(record['tiabs']), bigram, trigram)
            yield TaggedDocument(words, [tag])

This is where we train the doc2vec model. 

Estimate ~4hrs for training with 7 workers

In [10]:
# COMMENT/UNCOMMENT FOR TRAINING

cores = multiprocessing.cpu_count()

n_features = 300

# Build the model WITHOUT a corpus. Passing `documents=` to the constructor
# makes gensim build the vocabulary and run a full training pass straight
# away; the explicit train() call that followed then trained a second time.
# Constructing empty and calling build_vocab() + train() once gives exactly
# `epochs` passes over the data.
model = Doc2Vec(dm=0,
                dbow_words=1,
                vector_size=n_features,
                window=10,
                min_count=7,
                epochs=8,
                # leave one core free, but never ask for zero workers
                workers=max(1, cores - 1)
#                 alpha=0.06,  # comment these lines to use default alpha
#                 min_alpha=0.04
                )

# ADTaggedDocument defines __iter__, so the same object can be scanned once
# for the vocabulary and then again on each training epoch.
documents = ADTaggedDocument()
model.build_vocab(documents)

# Now train Doc2Vec on the corpus
model.train(documents,
            total_examples=model.corpus_count,
            epochs=model.epochs)


model.save(os.path.join('models', 'd2v_pubmed.model'), separately=None)

In [11]:
# Load the Doc2Vec model from disk (the training cell saves it under models/d2v_pubmed.model).
model = Doc2Vec.load('models/d2v_pubmed.model')
model

<gensim.models.doc2vec.Doc2Vec at 0x1c0f1580548>

Now we transform our corpus of text data using the doc2vec model to give us a 'd2v_corpus'.  We can visualise this data to get a feel for the dataset.

In [12]:
import numpy as np
from gensim.matutils import any2sparse


# This should be done with batch/vector process. It's SLOW!

def iter_df(df):
    """Yield ``(id, ngrams)`` for each row of ``df``.

    ``id`` is the string form of the row's 'index' column; ``ngrams`` is the
    row's 'tiabs' text processed by ``pre_d2v_search`` using the module-level
    ``bigram`` and ``trigram`` models.
    """
    for _, record in df.iterrows():
        tag = str(record['index'])
        tokens = pre_d2v_search(str(record['tiabs']), bigram, trigram)
        yield tag, tokens


def iter_arts(df):
    """Yield one inferred vector per row of ``df``, in row order.

    Each row's tokens (from ``iter_df``) are passed to the module-level
    ``model``'s ``infer_vector``; the result is converted with
    ``any2sparse`` (eps=1e-09) before being yielded. Row ids are not
    included in the output.
    """
    for _tag, tokens in iter_df(df):
        dense = model.infer_vector(tokens)
        yield any2sparse(dense, eps=1e-09)
        



In [13]:
from gensim import corpora, similarities
# Infer a vector for every row of df and write them to data/d2v_corpus_pubmed.mm.
# NOTE(review): serialize() is called for its on-disk side effect; `transformed`
# probably ends up as None - confirm before relying on it.
transformed = corpora.MmCorpus.serialize('data/d2v_corpus_pubmed.mm', iter_arts(df))

In [14]:
# Use this to check the model
# Builds a similarity index over the serialized training vectors.
# num_features=300 is the same value as n_features / vector_size used for training.
index_d2v = similarities.Similarity('data/d2v_sims_pubmed.index',
                                      corpora.MmCorpus('data/d2v_corpus_pubmed.mm'),
                                      num_features=300)
index_d2v.save()

In [70]:
# NOTE(review): `sims` is only assigned further down (in the most_similar query cell),
# so this cell raises NameError when the notebook is run top-to-bottom on a fresh kernel.
sims

[('54', 1.0),
 ('1013896', 0.6919227838516235),
 ('415028', 0.6909884214401245),
 ('1014397', 0.6852828860282898),
 ('59599', 0.6844850778579712),
 ('544497', 0.6833879947662354),
 ('65363', 0.6744802594184875),
 ('864855', 0.6735906600952148),
 ('769358', 0.6709409356117249),
 ('71265', 0.6705315113067627)]

In [75]:
# Positional row of df to use as the query document; show its title.
row_number = 84
df.iloc[row_number]['title']

'A novel dataset on a culture of cooperation and inclusive political institutions in 90 European historical regions observed between 1000 and 1600.'

In [76]:
query_vec = model.docvecs[row_number]
# similar articles
sims = model.docvecs.most_similar([query_vec])
# Documents were tagged with str(row['index']), so most_similar returns the
# tags as strings. .iloc only accepts integer positions, so convert each tag
# back to int before using it to look rows up.
sims_inds = [int(tag) for tag, _score in sims]
cols = ['doi','title']
df[cols].iloc[sims_inds]['title'].tolist()

['A novel dataset on a culture of cooperation and inclusive political institutions in 90 European historical regions observed between 1000 and 1600.',
 'Inclusive growth: A dataset on key and institutional foundations for inclusive development of Russian regions.',
 'Feeling the Squeeze: Nonmarket Institutional Pressures and Firm Nonmarket Strategies.',
 'Navigating through the Mundellian Trilemma: A dataset of four decades.',
 'The interdependence of corporate reputation and ownership: a network approach to quantify reputation.',
 'The glocalization of antimicrobial stewardship.',
 'Macroeconomic dataset for comparative studies on coastal and inland regions in innovation space of Russia.',
 'The credibility of scientific communication sources regarding climate change: A population-based survey experiment.',
 'Investigating intergroup attitudes in Europe: Cross-national data on news media, attitudes towards newcomers, and socio-psychological indicators.',
 'Measuring sustainable develo

In [19]:
# corpora.MmCorpus.serialize('data/d2v_corpus_pubmed.mm', iter_arts(df))

# vectorise test and dev sets

In [17]:
# Test split: load, add the 'index' id column, write inferred vectors to disk.
test = pd.read_csv('data/test.csv', dtype=str)
test = test.reset_index()
corpora.MmCorpus.serialize('data/d2v_test_corpus_pubmed.mm', iter_arts(test))

# Dev split: same treatment.
dev = pd.read_csv('data/dev.csv', dtype=str)
dev = dev.reset_index()
corpora.MmCorpus.serialize('data/d2v_dev_corpus_pubmed.mm', iter_arts(dev))

In [44]:
# Full dataset: load, add the 'index' id column, write inferred vectors to disk.
all_ = pd.read_csv('data/all_s2_pubmed.csv', dtype=str)
all_ = all_.reset_index()
corpora.MmCorpus.serialize('data/d2v_all_corpus_pubmed.mm', iter_arts(all_))

# Dimensional reduction for visualisation

In [54]:
# Open the serialized vectors for the full dataset (written by the cell above).
high_dim = corpora.MmCorpus('data/d2v_all_corpus_pubmed.mm')
high_dim

<gensim.corpora.mmcorpus.MmCorpus at 0x1bfd03b0ee8>

In [55]:
# Number of columns in the dense matrix; same value as the vector_size used for training.
n_features = 300

In [56]:
%%time
from gensim.corpora.mmcorpus import MmCorpus
from gensim.matutils import corpus2dense

# Densify the corpus with num_terms = n_features, then transpose so each row is one document.
X = corpus2dense(high_dim, num_terms = n_features).T
X.shape

(1032546, 300)

# pre-uMap
This is a fast way to shrink our dataset to make it more manageable for UMAP.

In [57]:
%%time
# pick an intermediate dimensionality to start off dimensional reduction, then let uMap do the rest
from sklearn.decomposition import TruncatedSVD
# Reduce X to 50 components with truncated SVD; random_state is fixed at 0.
X_reduced = TruncatedSVD(n_components = 50,
                            random_state = 0).fit_transform(X)
X_reduced.shape

(1032546, 50)

# uMap

In [58]:
%%time
import umap
import warnings
warnings.filterwarnings('ignore')

# Fit UMAP on the SVD-reduced matrix using the library defaults
# (the overrides below are commented out).
# NOTE(review): no random_state is passed here, so the layout may differ
# between runs - confirm whether reproducibility matters for the saved embedding.
trans = umap.UMAP(
#                 n_neighbors=5,
#                   min_dist=0.1,
#                   metric='correlation'
                 ).fit(X_reduced)

Wall time: 31min 8s


In [59]:
trans.embedding_.shape

(1032546, 2)

In [60]:
# Write the fitted UMAP coordinates (trans.embedding_) to a .npy file.
np.save('data/2d_s2_pubmed_d2v_corpus.npy',trans.embedding_)