## Imports

In [1]:
import pandas as pd
import numpy as np

import regex as re
from nltk.tokenize import RegexpTokenizer
from gensim.models.word2vec import Word2Vec

## Read in data

In [2]:
articles = pd.read_csv('NLT_data/merged_data.csv')
articles.head(2)

Unnamed: 0,paper_id,source,title,abstract,publish_time,authors,journal,url,discussion,text_body
0,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,http://europepmc.org/articles/pmc125340?pdf=re...,,The genetic information of RNA viruses is orga...
1,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,http://europepmc.org/articles/pmc125375?pdf=re...,,Carcinoembryonic antigen (CEA; CD66e) was init...


## Tokenize text data

In [3]:
# Function that strips text of leading and trailing punctuation
import string

def strip_punc(list_of_words):
    return [word.strip(string.punctuation) for word in list_of_words]

In [4]:
# Function that lowercases, tokenizes, and strips the text of any trailing punctuation

def clean_text(text):
    # Cast text as string (floats in body text)
    text = str(text)
    
    # Lowercase all words
    lower = text.lower()
    
    # Tokenizes by whitespace
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    tokens = tokenizer.tokenize(lower)
    
    # Strip leading and trailing punctuation
    tokens_stripped = strip_punc(tokens)
    
    # Return tokens stripped of trailing punctuation
    return tokens_stripped

In [5]:
# Tokenize titles
articles['title'] = articles['title'].map(clean_text)

# Tokenize abstract
articles['abstract'] = articles['abstract'].map(clean_text)

# Tokenize body text
articles['text_body'] = articles['text_body'].map(clean_text)

In [6]:
articles.head(2)

Unnamed: 0,paper_id,source,title,abstract,publish_time,authors,journal,url,discussion,text_body
0,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,"[sequence, requirements, for, rna, strand, tra...","[nidovirus, subgenomic, mrnas, contain, a, lea...",2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,http://europepmc.org/articles/pmc125340?pdf=re...,,"[the, genetic, information, of, rna, viruses, ..."
1,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"[crystal, structure, of, murine, sceacam1a[1,4...","[ceacam1, is, a, member, of, the, carcinoembry...",2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,http://europepmc.org/articles/pmc125375?pdf=re...,,"[carcinoembryonic, antigen, cea, cd66e, was, i..."


## Word2Vec Model - CBOW

In [7]:
# Train a Word2Vec model on body text using bag of words

model_cbow = Word2Vec(articles['text_body'],
                      min_count=1,
                      workers=4)

In [8]:
model_cbow.wv.most_similar('coronavirus', topn=25)

[('cov', 0.8723200559616089),
 ('coronaviruses', 0.7552334070205688),
 ('betacoronavirus', 0.7502703666687012),
 ('sars-coronavirus', 0.7349566221237183),
 ('hcov-nl63', 0.7132015228271484),
 ('sars-cov', 0.6978799104690552),
 ('beta-coronavirus', 0.6974042057991028),
 ('paramyxovirus', 0.686292827129364),
 ('ncov', 0.675050675868988),
 ('covs', 0.6744565963745117),
 ('corona-virus', 0.674293041229248),
 ('alphacoronavirus', 0.6740212440490723),
 ('hcov-229e', 0.6699510216712952),
 ('torovirus', 0.6651463508605957),
 ('hcov-hku1', 0.6644760966300964),
 ('hcov', 0.6621612906455994),
 ('astrovirus', 0.6608228087425232),
 ('coronaviral', 0.6587368249893188),
 ('mers-cov', 0.6571252346038818),
 ('2019-ncov', 0.6556006073951721),
 ('picornavirus', 0.6457030177116394),
 ('hcov-emc', 0.642787754535675),
 ('sars-cov-2', 0.6416059732437134),
 ('coronavirus.the', 0.6406640410423279),
 ('calicivirus', 0.6383765339851379)]

In [9]:
# Update model with words from abstract
# https://www.machinelearningplus.com/nlp/gensim-tutorial/#15howtoupdateanexistingword2vecmodelwithnewdata

model_cbow.build_vocab(articles['abstract'], update=True)
model_cbow.train(articles['abstract'],
                 total_examples = model_cbow.corpus_count,
                 epochs = model_cbow.epochs)

(25451948, 31781460)

In [10]:
model_cbow.wv.most_similar('coronavirus', topn=25)

[('cov', 0.8179519176483154),
 ('betacoronavirus', 0.6815667748451233),
 ('coronaviruses', 0.6796875596046448),
 ('hcov-nl63', 0.6774649024009705),
 ('corona-virus', 0.6750600337982178),
 ('sars-coronavirus', 0.6544812917709351),
 ('beta-coronavirus', 0.6539711952209473),
 ('alphacoronavirus', 0.6444243788719177),
 ('coronaviral', 0.637790322303772),
 ('hcov-emc', 0.6352381110191345),
 ('coronovirus', 0.6346707940101624),
 ('eoronavirus', 0.6243880987167358),
 ('coronavirus.the', 0.6218816041946411),
 ('hcov', 0.6198548674583435),
 ('torovirus', 0.6195451617240906),
 ('astrovirus', 0.6194137334823608),
 ('hcov-229e', 0.6192142963409424),
 ('paramyxovirus', 0.6100278496742249),
 ('ncov', 0.6040126085281372),
 ('calicivirus', 0.60213702917099),
 ('sads-cov', 0.5989593267440796),
 ('sars-cov', 0.5982458591461182),
 ('hcov-oc43', 0.5957435965538025),
 ('seacov', 0.591794490814209),
 ('β-coronavirus', 0.591096818447113)]

## Word2Vec Model - SkipGram

In [11]:
# Train Word2Vec model on body text using SkipGram

model_sg = Word2Vec(articles['text_body'],
                    min_count=1,
                    sg=1,
                    workers=4)

In [12]:
model_sg.wv.most_similar('coronavirus', topn=25)

[('cov', 0.867962658405304),
 ('coronaviruses', 0.8325676918029785),
 ('sars-associated', 0.812114953994751),
 ('sarsassociated', 0.8041493892669678),
 ('coronovirus', 0.8034523725509644),
 ('corona', 0.8019854426383972),
 ('hecv', 0.7925291061401367),
 ('sars-cov', 0.7923038005828857),
 ('navirus', 0.790370523929596),
 ('229e', 0.7846354246139526),
 ('hcov', 0.7732962369918823),
 ('hcov-229e', 0.7673474550247192),
 ('nl-63', 0.763729453086853),
 ('beta-coronavirus', 0.7631218433380127),
 ('ncov', 0.7621225118637085),
 ('sars-coronavirus', 0.7605894207954407),
 ('co-v', 0.7580375075340271),
 ('sarscov-2', 0.7563889026641846),
 ('hcov-nl63', 0.7547838687896729),
 ('bat-hku2-like', 0.7542920112609863),
 ('sars-cov-1', 0.7542612552642822),
 ('betacoronavirus', 0.754182755947113),
 ('alphacoronavirus', 0.7532768249511719),
 ('sars-hcov', 0.7530182600021362),
 ('oc43', 0.7524147033691406)]

In [13]:
# Update model with words from abstract

model_sg.build_vocab(articles['abstract'], update=True)
model_sg.train(articles['abstract'],
               total_examples = model_sg.corpus_count,
               epochs = model_sg.epochs)

(25451807, 31781460)

In [14]:
model_sg.wv.most_similar('coronavirus', topn=25)

[('cov', 0.8884381055831909),
 ('coronovirus', 0.831635594367981),
 ('sars-cov', 0.8238817453384399),
 ('sar-cov-2', 0.8229975700378418),
 ('sarsassociated', 0.8216931819915771),
 ('coronaviruses', 0.8208870887756348),
 ('sars-associated', 0.8177621364593506),
 ('co-v', 0.8140263557434082),
 ('sarscov-2', 0.8135048747062683),
 ('navirus', 0.8124586343765259),
 ('corona-virus', 0.8109472990036011),
 ('cov2', 0.8016222715377808),
 ('sars-co', 0.8007255792617798),
 ('corona', 0.7987117767333984),
 ('sars-cov-2', 0.7936427593231201),
 ('sars-hcov', 0.7934179306030273),
 ('sars-coronavirus', 0.7905791997909546),
 ('oc43', 0.7894622087478638),
 ('ronavirus', 0.7834591865539551),
 ('sars-coa', 0.7817895412445068),
 ('sarscov2', 0.7810839414596558),
 ('sads', 0.7786059379577637),
 ('229e', 0.7775371670722961),
 ('coro-navirus', 0.7773590087890625),
 ('beta-coronavirus', 0.77623450756073)]