In [1]:
import logging
import gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
def read_input(input_file):
    with open (input_file, 'r', encoding='utf8') as f:
        for i,line in enumerate(f):
            if (i%10000==0):
                logging.info ("read {0} reviews".format (i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess (line)
            
documents = list (read_input ('data/cord19_kaggle/custom-pdf_cord19_raw_data_v2.txt'))
logging.info ("Done reading data file")

2020-05-27 03:49:43,448 : INFO : read 0 reviews
2020-05-27 03:49:44,970 : INFO : read 10000 reviews
2020-05-27 03:49:46,401 : INFO : read 20000 reviews
2020-05-27 03:49:47,789 : INFO : read 30000 reviews
2020-05-27 03:49:49,219 : INFO : read 40000 reviews
2020-05-27 03:49:50,530 : INFO : read 50000 reviews
2020-05-27 03:49:51,943 : INFO : read 60000 reviews
2020-05-27 03:49:53,361 : INFO : read 70000 reviews
2020-05-27 03:49:54,641 : INFO : read 80000 reviews
2020-05-27 03:49:56,915 : INFO : read 90000 reviews
2020-05-27 03:49:58,498 : INFO : read 100000 reviews
2020-05-27 03:49:59,928 : INFO : read 110000 reviews
2020-05-27 03:50:01,377 : INFO : read 120000 reviews
2020-05-27 03:50:02,828 : INFO : read 130000 reviews
2020-05-27 03:50:05,476 : INFO : read 140000 reviews
2020-05-27 03:50:06,826 : INFO : read 150000 reviews
2020-05-27 03:50:08,232 : INFO : read 160000 reviews
2020-05-27 03:50:10,879 : INFO : read 170000 reviews
2020-05-27 03:50:12,224 : INFO : read 180000 reviews
2020-05

In [None]:
import time
start_time = time.time()   # calculate total training time

model = gensim.models.Word2Vec (documents,
                                size=200, 
                                window=10, 
                                min_count=10, 
                                sg=1, 
                                workers=7)
model.train(documents,total_examples=len(documents),epochs=5)
print("Training completed in %s seconds." % (time.time() - start_time))

In [None]:
# save model for convenience and prevent long training times
model.save("word2vec_sg_custom-pdf_cord19_raw_data_v2_837mb.bin")

In [7]:
from gensim.test.utils import datapath
# evaluate against wordsim-353 dataset, returns pearson and spearman coefficients
model.evaluate_word_pairs(datapath('wordsim353.tsv'))

2020-05-27 07:57:57,007 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-05-27 07:57:57,007 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
  This is separate from the ipykernel package so we can avoid doing imports until
2020-05-27 07:57:58,793 : INFO : Pearson correlation coefficient against /home/ccidecio/anaconda3/lib/python3.7/site-packages/gensim/test/test_data/wordsim353.tsv: 0.4697
2020-05-27 07:57:58,794 : INFO : Spearman rank-order correlation coefficient against /home/ccidecio/anaconda3/lib/python3.7/site-packages/gensim/test/test_data/wordsim353.tsv: 0.4567
2020-05-27 07:57:58,794 : INFO : Pairs with unknown words ratio: 4.2%


((0.4697024958058125, 5.97079244331236e-20),
 SpearmanrResult(correlation=0.45671514102812144, pvalue=8.046952154950104e-19),
 4.2492917847025495)

In [11]:
word = ["coronavirus"]
model.wv.most_similar(positive=word, topn = 10)

[('coronaviruses', 0.7892695665359497),
 ('corona', 0.7099049687385559),
 ('cov', 0.7075159549713135),
 ('betacoronavirus', 0.6912431716918945),
 ('covs', 0.6769834756851196),
 ('coronaviral', 0.6627583503723145),
 ('coronaviridae', 0.6545959711074829),
 ('virus', 0.6510963439941406),
 ('hcov', 0.6498087644577026),
 ('sars', 0.6487791538238525)]

In [10]:
word = ["corona"]
model.wv.most_similar(positive=word, topn = 10)

[('coronavirus', 0.7099050283432007),
 ('coronaviruses', 0.5861067771911621),
 ('rota', 0.5791054964065552),
 ('coronaviridae', 0.5378038287162781),
 ('togavirus', 0.5368736386299133),
 ('syncytial', 0.5273870229721069),
 ('calici', 0.5220611095428467),
 ('petal', 0.5193419456481934),
 ('plcorna', 0.5183882713317871),
 ('toro', 0.5107802152633667)]

In [12]:
word = ["spread"]
model.wv.most_similar(positive=word, topn = 10)

[('spreading', 0.7944375276565552),
 ('spreads', 0.7183796763420105),
 ('transmission', 0.7025080919265747),
 ('dissemination', 0.6902173757553101),
 ('dispersal', 0.6286017894744873),
 ('transmitted', 0.5987448692321777),
 ('emergence', 0.5906115770339966),
 ('outbreaks', 0.5720475912094116),
 ('transmitting', 0.5716133117675781),
 ('toperson', 0.5693148374557495)]

In [13]:
word = ["disease"]
model.wv.most_similar(positive=word, topn = 10)

[('diseases', 0.7756211161613464),
 ('illness', 0.6377783417701721),
 ('illnesses', 0.575861930847168),
 ('fatal', 0.566985547542572),
 ('pathology', 0.5628936290740967),
 ('debilitating', 0.5621656179428101),
 ('morbidity', 0.5568110346794128),
 ('infections', 0.5560518503189087),
 ('chronic', 0.5524715185165405),
 ('contagious', 0.5468723177909851)]

In [14]:
word = ["fever"]
model.wv.most_similar(positive=word, topn = 10)

[('fevers', 0.8199060559272766),
 ('myalgia', 0.795329213142395),
 ('chills', 0.7770258188247681),
 ('myalgias', 0.7716023921966553),
 ('arthralgia', 0.7509876489639282),
 ('headache', 0.7490516304969788),
 ('febrile', 0.7348302006721497),
 ('rigors', 0.713218092918396),
 ('sore', 0.7037040591239929),
 ('malaise', 0.6998950242996216)]

In [15]:
word = ["pneumonia"]
model.wv.most_similar(positive=word, topn = 10)

[('pneumonias', 0.858900785446167),
 ('pneumonitis', 0.7347062826156616),
 ('lobar', 0.686647891998291),
 ('bronchiolitis', 0.6854633092880249),
 ('lrti', 0.6762850880622864),
 ('necrotizing', 0.6573134660720825),
 ('monia', 0.6445252895355225),
 ('tracheitis', 0.6438243389129639),
 ('radiologically', 0.629547655582428),
 ('atypical', 0.6254950165748596)]

In [16]:
word = ["sars"]
model.wv.most_similar(positive=word, topn = 10)

[('cov', 0.893616795539856),
 ('mers', 0.7883569002151489),
 ('ncov', 0.7213670015335083),
 ('urbani', 0.6776567101478577),
 ('coronavirus', 0.6487791538238525),
 ('sarsassociated', 0.6247857213020325),
 ('drosten', 0.6160549521446228),
 ('scov', 0.6074730157852173),
 ('covinfected', 0.5945044755935669),
 ('ksiazek', 0.5894771814346313)]

In [20]:
word = ["cov"]
model.wv.most_similar(positive=word, topn = 10)

[('mers', 0.9025729894638062),
 ('sars', 0.893616795539856),
 ('emc', 0.7122606635093689),
 ('coronavirus', 0.7075159549713135),
 ('ncov', 0.7063145637512207),
 ('hcov', 0.6867600083351135),
 ('scov', 0.6807666420936584),
 ('covs', 0.6754277944564819),
 ('hku', 0.6661197543144226),
 ('urbani', 0.6451287269592285)]

In [21]:
word = ["mers"]
model.wv.most_similar(positive=word, topn = 10)

[('cov', 0.9025730490684509),
 ('sars', 0.7883569002151489),
 ('emc', 0.7174680233001709),
 ('dromedary', 0.677073061466217),
 ('arabia', 0.6653499603271484),
 ('saudi', 0.6644468903541565),
 ('ncov', 0.6634178161621094),
 ('camels', 0.6282460689544678),
 ('dromedaries', 0.6174394488334656),
 ('rbd', 0.6096728444099426)]

In [22]:
word = ["pandemic"]
model.wv.most_similar(positive=word, topn = 10)

[('pandemics', 0.8425107002258301),
 ('influenza', 0.7091113924980164),
 ('interpandemic', 0.7005625367164612),
 ('pdm', 0.6938039064407349),
 ('flu', 0.6925145387649536),
 ('outbreak', 0.683133602142334),
 ('uenza', 0.6830485463142395),
 ('epidemic', 0.6809103488922119),
 ('preparedness', 0.6794367432594299),
 ('epidemics', 0.6663550138473511)]

In [24]:
word = ["covid"]
model.wv.most_similar(positive=word, topn = 10)

[('ncov', 0.7544008493423462),
 ('wuhan', 0.6692982912063599),
 ('hubei', 0.6026048064231873),
 ('feb', 0.5619672536849976),
 ('sars', 0.5490542054176331),
 ('lockdown', 0.5469571352005005),
 ('ncip', 0.5465250015258789),
 ('jinyintan', 0.5342410206794739),
 ('pandemic', 0.5337203741073608),
 ('lombardy', 0.5331996083259583)]

In [25]:
word = ["wuhan"]
model.wv.most_similar(positive=word, topn = 10)

[('hubei', 0.8862814903259277),
 ('guangdong', 0.7769137024879456),
 ('guangzhou', 0.7485203742980957),
 ('ncov', 0.741247296333313),
 ('province', 0.7402205467224121),
 ('china', 0.7297977209091187),
 ('beijing', 0.72397780418396),
 ('shenzhen', 0.7019610404968262),
 ('guandong', 0.6946756839752197),
 ('huanan', 0.683606743812561)]

In [26]:
word = ["lung"]
model.wv.most_similar(positive=word, topn = 10)

[('lungs', 0.813745379447937),
 ('pulmonary', 0.7852080464363098),
 ('bronchial', 0.7084548473358154),
 ('alveolar', 0.6931854486465454),
 ('emphysema', 0.6539890170097351),
 ('airways', 0.6460864543914795),
 ('tissue', 0.6410072445869446),
 ('fibrotic', 0.6365219354629517),
 ('airway', 0.6251440644264221),
 ('bronchiolar', 0.6244347095489502)]

In [None]:
# similarity between two different words
model.wv.similarity(w1="recep",w2="tayyip")

In [None]:
# Which one is the odd one out in this list?
model.wv.doesnt_match(["vladimir","rusya","ekonomi"])

In [None]:
def saveModel(model,name):
    model.save(name)

def loadModel():
    model = Word2Vec.load('word2vec.bin')
    return model

In [None]:
def findCosineDistance(vector1, vector2):
    # similarity between two word vectors
    print("vec1:" + np.float64(np.linalg.norm(vector1)).astype(str) + "| vec2:" + np.float64(np.linalg.norm(vector2)).astype(str))
    return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

In [None]:
def findSimilarity(model, word1, word2):
    # similarity between two words
    return model.wv.similarity(w1=word1, w2=word2)

In [None]:
def printTestResults(words):
    for i in range(0,len(words)):
        print("\n"  + '\033[1m' + words[i] + '\033[0m')
        similar_words = model.wv.most_similar(positive=words[i], topn=5)
        for j in range(0,5):
            print(similar_words[j])

selected_words["coronavirus","disease"]
# word_count = 5      # most similar n words
print("window = " + str(window_size) + ", min_count = " + str(min_count) + ", sg = " + str(sg) + ", hs = " + str(hs) + ", epochs = " + str(epochs) )
print("training time: " + str(int(training_time)) + " seconds")

printTestResults(selected_words)

In [None]:
def getWordVector(token):
    #aggregate word vectors of a named entity
    words = token.split()
    wordVector = np.zeros(W2V_SIZE)
    for word in words:
        try:
            vector = model.wv.get_vector(word)
            wordVector = wordVector + vector
        except KeyError:
            print("KeyError: " + word)

    return wordVector

for i in range(0,len(tokens)-1):
    for j in range (i+1, len(tokens)):
        v1 = getWordVector(tokens[i])
        v2 = getWordVector(tokens[j])
        similarity = findCosineDistance(v1, v2)
        if similarity != 'nan' and similarity > SIMILARITY_THRESHOLD:
            print("hey" + "%s\t%s\t%s\n" % (i+1, j+1, similarity))
            file.write("%s\t%s\t%s\n" % (i+1, j+1, similarity))
            #file.write("%s\t%s\t%s\n"% (tokens[i], tokens[j], similarity))

file.close()