# Vectorisation (turning text into numbers)
There are a lot of ways to turn text into numbers. Here, we will use Doc2Vec, but other algorithms do this well including BERT, LDA, LSI etc.

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# read in our data
df = pd.read_csv('data/s2_cr_data.csv', dtype=str)
df.shape

(12617, 13)

In [3]:
# df.sample(10)

Unnamed: 0,pid,doi,title,abstract,authors,venue,year,tiabs,journal,journal-short,pubdate,issns,publisher
8439,98e6190d06a8d8774c690be39634a1d64a926dc3,10.3390/cells8040377,Genetic Diversity and Differentiation at Struc...,Investigating adaptive potential and understan...,"Lan, Hong; Zhou, Tong; Wan, Qiu-Hong; Fang, Sh...",Cells,2019.0,Genetic Diversity and Differentiation at Struc...,Cells,Cells,2019-04-25,2073-4409,MDPI AG
9557,27c190c3d4fc0884a3b9d79774ee61a0c305c540,,"In Vitro Membrane Assembly of a Polytopic, Tra...","In vitro integration of the polytopic, transme...","Ahrem, Barbara; Hoffschulte, Hedda; Miiller, M...",,,"In Vitro Membrane Assembly of a Polytopic, Tra...",,,,,
884,2e2de8d6ebd368c062fed5770aa26a780bdc25b8,10.1186/1471-2334-6-82,The interferon gamma gene polymorphism +874 A/...,BackgroundCytokines play important roles in an...,"Po, Wai; †1, Chong; Ip, Wk; Hoi, Gloria; Tso, ...",BMC infectious diseases,2006.0,The interferon gamma gene polymorphism +874 A/...,BMC Infectious Diseases,BMC Infect Dis,2006-05-04,1471-2334,Springer Science and Business Media LLC
10970,1cfcdcf2a3783b9adba2b091bd50c5d8ac3252ff,,International Journal of Nanomedicine Dovepres...,Influenza virus infections are a major public ...,"Jack Hu, Che-Ming; Chen, You-Ting; Fang, Zih-S...",,,International Journal of Nanomedicine Dovepres...,,,,,
12072,504684906ab86a5130b59c2557ec73e2226f67e7,,,B ats are a major source of zoonotic viruses w...,,,,B ats are a major source of zoonotic viruses w...,,,,,
8906,04510f144938a17bf2c2a9683ce3479ff0246b94,10.1186/s12967-019-2077-y,The protective role of microRNA-21 against cox...,BackgroundThe P38 mitogen-activated protein ki...,"He, Feng; Xiao, Zonghui; Yao, Hailan; Li, Sen;...",Journal of Translational Medicine,2019.0,The protective role of microRNA-21 against cox...,Journal of Translational Medicine,J Transl Med,2019-10-04,1479-5876,Springer Science and Business Media LLC
8000,350bf76f2ecfc9322c9c8b59af3b7a14a6e51c0f,10.1007/s11250-018-1668-6,Analysis of pig trading networks and practices...,East Africa is undergoing rapid expansion of p...,"Atherstone, C; Galiwango, R; Grace, &amp;; Alo...",Tropical Animal Health and Production,2018.0,Analysis of pig trading networks and practices...,Tropical Animal Health and Production,Trop Anim Health Prod,2018-08-02,0049-4747; 1573-7438,Springer Science and Business Media LLC
10612,9d12f3a201435c9d24dc9b356283edf8bd3a234d,10.4110/in.2017.17.3.192,Exacerbation of Japanese Encephalitis by CD11c...,Japanese encephalitis (JE) is neuroinflammatio...,"Choi, Jin; Kim, Jin; Patil, Ajit; Bum, Seong; ...",Immune network,2017.0,Exacerbation of Japanese Encephalitis by CD11c...,Immune Network,Immune Netw,2017-06-26,1598-2629; 2092-6685,The Korean Association of Immunobiologists (KA...
4856,5f7b5b1f4748ede29130df5606fd51f4820edc20,10.1038/srep20918,Conformational Flexibility of a Short Loop nea...,"The SARS 3C-like proteinase (SARS-3CLpro), whi...","Li, Chunmei; Teng, Xin; Qi, Yifei; Tang, Bo; S...",Scientific reports,2016.0,Conformational Flexibility of a Short Loop nea...,,,,,
3736,e49106784cd05d905c10780cc5dbe2a10a6badb7,10.1371/journal.ppat.1004431,Correction: Infection with MERS-CoV Causes Let...,The incorrect version of Figure 6 is published...,,,2014.0,Correction: Infection with MERS-CoV Causes Let...,PLoS Pathogens,PLoS Pathog,2014-09-09,1553-7374,Public Library of Science (PLoS)


# Pre-process
Different algorithms and different text corpora call for different kinds of pre-processing. What we are doing here is trying to make the text as easy as possible for the computer to 'understand' without destroying any valuable features in the data.

In [4]:
from gensim.models import TfidfModel, Phrases
from gensim.models.phrases import Phraser
from gensim.utils import deaccent
from gensim.parsing.preprocessing import strip_short, strip_punctuation, remove_stopwords, strip_multiple_whitespaces, stem_text
from gensim.parsing import preprocess_string
from gensim import corpora
from gensim import utils

## Ngramming
We will start by building Ngrammers. These are very simple models which learn when words tend to appear together. The typical example here is 'New York', which is 2 words, but which we treat as one. Is therefore a 'bigram'. We might also be interested in 'New York City', which is 3 words that often appear together, so that's a "trigram".

We will run 2 processes where we first search our text for bigrams and then trigrams. Essentially, it's just the same process run twice.

In [4]:
bigram_phraserpath = 'models/bigram_d2v'
trigram_phraserpath = 'models/trigram_d2v'

CUSTOM_FILTERS = [lambda x: x.lower(),
                    deaccent,
                    strip_punctuation,
                    remove_stopwords, # REMOVE THIS IF YOU ARE DOING D2V BIGRAMMING!
                    strip_multiple_whitespaces,
                    # strip_short, #(minsize=2),
                    stem_text # REMOVE THIS IF YOU ARE DOING D2V BIGRAMMING!
                    ] # stem or lemmatize?

# define Iterator

# retrieve each text entry
def extract_text():

    for i, row in df.iterrows():
        doi = str(row['doi'])
        textdata = str(row['tiabs'])
        textdata = preprocess_string(textdata, CUSTOM_FILTERS)
        yield doi, textdata
        if i%1000==0:
            print(i, 'iterations done')


# iteration 1
print()
print('Training bigram detection')
documents = extract_text()
phrases = Phrases(
                (document[1]
                 for document in documents),
                    # min_count = 10,
                    threshold = 10,
                    common_terms = ["of", "with", "without",
                                "and", "or", "the", "a"]
                    # max_vocab_size = 1000000
                    )
bigram = Phraser(phrases)
bigram.save(bigram_phraserpath)


# iteration 2
print()
print('Training trigram detection')
documents = extract_text()
phrases = Phrases((bigram[document[1]]
                    for document in documents),
                    # min_count = 10,
                    threshold = 10,
                    # max_vocab_size = 2000000
                    )
trigram = Phraser(phrases)
# save model
trigram.save(trigram_phraserpath)
print('trigrams trained and saved')


Training bigram detection
0 iterations done
10000 iterations done

Training trigram detection
0 iterations done
10000 iterations done
trigrams trained and saved


In [5]:
# pre process for d2v
from gensim.utils import deaccent
from gensim.parsing.preprocessing import strip_short, strip_punctuation, remove_stopwords, strip_multiple_whitespaces, stem_text
from gensim.parsing import preprocess_string
from gensim.models import Phrases


bigram = Phrases.load(bigram_phraserpath)
trigram = Phrases.load(trigram_phraserpath)

def pre_d2v_search(s, bigram, trigram):
    
    CUSTOM_FILTERS = [lambda x: x.lower(),
                    deaccent,
                    strip_punctuation,
                    strip_multiple_whitespaces] 
    
    return trigram[bigram[preprocess_string(s,filters= CUSTOM_FILTERS)]]

# Train

In [6]:
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.summarization.textcleaner import get_sentences
from gensim.utils import tokenize
import multiprocessing

# Preparing the corpus

In [7]:
df = df.reset_index()

In [8]:
df.shape

(12617, 14)

This class will allow us to load our data into the doc2vec model.

In [9]:
class ADTaggedDocument(object):
    def __iter__(self):
        
        for i,row in df.iterrows():
            id_ = str(row['index'])
#             doi = str(row['doi'])
            text_data = str(row['tiabs'])
            tokenized = pre_d2v_search(text_data, bigram, trigram)
            yield TaggedDocument(tokenized, [id_])        

This is where we train the doc2vec model. 

In [10]:
# COMMENT/UNCOMMENT FOR TRAINING

cores = multiprocessing.cpu_count()

documents = ADTaggedDocument()
# model.build_vocab(documents)

model = Doc2Vec(documents = documents,
                    dm=0, 
                    dbow_words=1, 
                    vector_size=300, 
                    window=8, 
                    min_count=6, 
                    epochs=8, 
                    workers= cores-1
#                     alpha=0.06,  # comment these lines to use default alpha
#                     min_alpha=0.04
                   )

# re-initialise generator
documents = ADTaggedDocument()
# Now train Doc2Vec on the corpus
model.train(documents, 
            total_examples=model.corpus_count, 
            epochs=model.epochs)


model.save(os.path.join('models', 'd2v.model'), separately=None)

In [11]:
model = Doc2Vec.load('models/d2v.model')
model

<gensim.models.doc2vec.Doc2Vec at 0x2a1242dac88>

Now we transform our corpus of text data using the doc2vec model to give us a 'd2v_corpus'.  We can visualise this data to get a feel for the dataset.

In [12]:
import numpy as np
from gensim.matutils import any2sparse

def iter_df(df):
    for i, row in df.iterrows():
#         doi = str(row['doi'])
        id_ = str(row['index'])
#             doi = str(row['doi'])
        textdata = str(row['tiabs'])
        ngrams = pre_d2v_search(textdata, bigram, trigram)
        yield id_, ngrams


def iter_arts(df):
    articles = iter_df(df)
    for article in articles:
        textdata = article[1]
        vec = model.infer_vector(textdata) # can't this be done as a batch/vector process? SLOW!
        vec = any2sparse(vec, eps=1e-09)
#         print(np.shape(vec))
        yield vec
        



In [13]:
from gensim import corpora, similarities
transformed = corpora.MmCorpus.serialize('data/d2v_corpus.mm', iter_arts(df))
index_d2v = similarities.Similarity('data/d2v_sims.index',
                                      corpora.MmCorpus('data/d2v_train_corpus.mm'),
                                      num_features=300)
index_d2v.save()

ValueError: need more than 0 values to unpack

In [None]:
corpora.MmCorpus.serialize('data/d2v_corpus.mm', iter_arts(df))