In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Fetch the NLTK resources this notebook relies on:
#   'punkt'     - tokenizer models used by sent_tokenize / word_tokenize
#   'stopwords' - word lists read via stopwords.words('english')
# The second call is the cell's last expression, so its return value is
# what the notebook displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Load the article table; the preprocessing cell reads its 'abstract' column.
# NOTE(review): the name `all` shadows Python's builtin all(); it is kept
# because the following cell refers to it by this name.
csv_path = 'PubMed10000_withID.csv'
all = pd.read_csv(csv_path)

In [3]:
# English stop words and the stemmer used to normalise every token.
# `stemmer` is also used by most_similar() to stem query words.
all_stopwords = stopwords.words('english')
stopword_set = set(all_stopwords)  # set => O(1) membership test per token
stemmer = PorterStemmer()
corpus = []  # one list of stemmed tokens per sentence

# Raw string: '\[', '\]' and '\|' are invalid escapes in a plain string literal.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')


def preprocess_sentence(sentence):
  """Turn one sentence into a list of cleaned, stemmed tokens.

  Steps, in order: tokenize with word_tokenize, skip tokens that are
  punctuation, lower-case, delete every character matched by
  REPLACE_BY_SPACE_RE and BAD_SYMBOLS_RE, skip empty tokens and English
  stop words, and Porter-stem whatever is left.

  Args:
    sentence: a single sentence as a string.

  Returns:
    List of stemmed token strings (possibly empty).
  """
  tokens = []
  for token in word_tokenize(sentence):
    if token in string.punctuation:
      continue
    # Matched characters are deleted (substituted with ''), so every token
    # stays a single word without embedded spaces.
    token = BAD_SYMBOLS_RE.sub('', REPLACE_BY_SPACE_RE.sub('', token.lower()))
    # Multi-character punctuation such as '...' passes the check above but is
    # stripped to '' here; keep empty strings out of the training corpus.
    if token and token not in stopword_set:
      tokens.append(stemmer.stem(token))
  return tokens


# Only the 'abstract' column is needed, so iterate over it directly.
for abstract in all['abstract']:
  # pd.isna catches every missing-value marker, not just the np.nan singleton.
  if pd.isna(abstract):
    continue
  for sentence in sent_tokenize(abstract):
    corpus.append(preprocess_sentence(sentence))


In [4]:
def most_similar(w2v_model, words, topn=10, stem_fn=None):
  """Tabulate the nearest neighbours of several query words side by side.

  Args:
    w2v_model: object exposing `wv.most_similar(key, topn=...)` that returns
      (neighbour, score) pairs.
    words: iterable of query words. Each one is passed through `stem_fn`
      before it is looked up in the model.
    topn: number of neighbours requested per query word.
    stem_fn: callable mapping a raw query word to the key used in the model.
      Defaults to the notebook-level `stemmer.stem`, so that queries are
      normalised with the same stemmer object used when building the corpus.

  Returns:
    DataFrame with two columns per word that was found: one named after the
    original (unstemmed) word holding the neighbours, and one named 'cos'
    holding the score returned by the model. Words missing from the model
    are reported with print() and skipped. If no word is found the result
    is an empty DataFrame.
  """
  if stem_fn is None:
    stem_fn = stemmer.stem

  frames = []
  for word in words:
    try:
      neighbours = w2v_model.wv.most_similar(stem_fn(word), topn=topn)
    except KeyError:
      # Only an out-of-vocabulary lookup is expected here; any other error
      # is a real problem and must not be reported as "not found".
      print(word, "not found in Word2Vec model!")
      continue
    frames.append(pd.DataFrame(neighbours, columns=[word, 'cos']))

  if not frames:
    # pd.concat raises on an empty list; keep returning an empty frame.
    return pd.DataFrame()
  # Concatenate once instead of growing the frame inside the loop.
  return pd.concat(frames, axis=1)

# CBOW

In [5]:
# Train the CBOW variant (sg=0) on the tokenised sentences in `corpus`.
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs`.
cbow_params = dict(size=100, iter=25, window=5, min_count=10, workers=4, sg=0)
model = Word2Vec(sentences=corpus, **cbow_params)

In [6]:
# Persist the model trained in the previous cell, before `model` is
# reassigned by the Skip-Gram cell further down.
model_path = 'word2vec.model'
model.save(model_path)

In [7]:
# Show the 10 nearest neighbours of each probe word under the CBOW model.
cbow_query_words = ['disease', 'covid19', 'sarscov2', 'flu', 'test', 'treatment', 'china']
most_similar(model, cbow_query_words, topn=10)

Unnamed: 0,disease,cos,covid19,cos.1,sarscov2,cos.2,flu,cos.3,test,cos.4,treatment,cos.5,china,cos.6
0,ill,0.574167,sarscov2,0.565474,viral,0.5951,influenza,0.763036,screen,0.624909,therapi,0.736698,hubei,0.669511
1,covid19,0.560992,diseas,0.560992,viru,0.583009,h1n1,0.676379,assay,0.543638,treat,0.717736,wuhan,0.65828
2,infect,0.508633,infect,0.420284,covid19,0.565474,season,0.672135,pcr,0.489468,therapeut,0.618498,provinc,0.653295
3,complic,0.485931,cancer,0.419085,virus,0.51681,1918,0.661612,retest,0.4802,immunotherapi,0.533298,citi,0.581038
4,pneumonia,0.482671,delay,0.395376,2019ncov,0.494816,pneumococc,0.646737,detect,0.472241,cure,0.519354,broke,0.579993
5,disease19,0.423135,clinic,0.371175,pathogen,0.457603,20192020,0.638161,result,0.447381,manag,0.505705,hunan,0.554832
6,tuberculosi,0.416178,particularli,0.365138,hcov,0.450039,2009,0.61992,routin,0.445164,antivir,0.462902,seafood,0.552367
7,sepsi,0.414096,covid,0.359695,coronavirus,0.445461,scov,0.594529,laboratorybas,0.443333,prophylaxi,0.452735,guangzhou,0.51124
8,pandem,0.412906,initi,0.358112,sever,0.402717,ebola,0.522246,rdt,0.433388,antibiot,0.45072,japan,0.503318
9,injuri,0.399516,prompt,0.353557,sar,0.397162,unsur,0.516951,diagnosi,0.421011,cur,0.448405,guangdong,0.492671


# Skip-Gram

In [8]:
# Train the Skip-Gram variant (sg=1) with otherwise identical settings.
# This rebinds `model`, replacing the CBOW model in the notebook namespace.
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs`.
skipgram_params = dict(size=100, iter=25, window=5, min_count=10, workers=4, sg=1)
model = Word2Vec(sentences=corpus, **skipgram_params)

In [9]:
# Show the 10 nearest neighbours of each probe word under the Skip-Gram model.
skipgram_query_words = ['disease', 'covid19', 'sarscov2', 'flu', 'test', 'treatment', 'china']
most_similar(model, skipgram_query_words, topn=10)

Unnamed: 0,disease,cos,covid19,cos.1,sarscov2,cos.2,flu,cos.3,test,cos.4,treatment,cos.5,china,cos.6
0,covid19,0.648194,diseas,0.648194,viru,0.652558,influenza,0.623158,pcr,0.637808,treat,0.771058,wuhan,0.761605
1,sever,0.588645,2019,0.596328,respiratori,0.625136,pneumococc,0.606324,rtpcr,0.615145,therapi,0.761461,provinc,0.758402
2,infect,0.578867,coronaviru,0.551827,coronaviru,0.612779,h1n1,0.603164,antigendetect,0.614318,therapeut,0.68813,hubei,0.709351
3,2019,0.57848,background,0.531955,acut,0.611123,ebola,0.601957,laboratorybas,0.609261,tocilizumab,0.640551,broke,0.669835
4,broke,0.571766,ongo,0.531874,syndromerel,0.602576,season,0.601805,naat,0.598825,antivir,0.62268,hunan,0.665822
5,ill,0.571049,sarscov2,0.523482,cov2,0.598766,1918,0.592285,assay,0.592898,cur,0.594354,citi,0.639003
6,disease2019,0.570718,panic,0.515027,viral,0.591345,2009,0.584251,screen,0.591612,manag,0.580537,decemb,0.629656
7,pandem,0.548934,novel,0.503809,sever,0.588902,zika,0.58104,serolog,0.583019,dexamethason,0.57022,seafood,0.610342
8,prognosi,0.542397,sever,0.502365,sar,0.583802,20192020,0.531445,indicaid,0.55857,anticanc,0.567271,aetiolog,0.600164
9,acut,0.539912,broke,0.501209,syndromecoronaviru,0.576115,syncyti,0.500086,rtrtpcr,0.550246,ici,0.562868,mainland,0.574294
