In [1]:
from keras.preprocessing import text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras import backend as K

from sklearn.metrics.pairwise import euclidean_distances

import numpy as np
import pandas as pd

In [2]:
data = """The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 

The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult. """

In [3]:
dl_data = data.split()

In [4]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dl_data)

words2id = tokenizer.word_index
words2id['PAD']=0

id2words = {v:k for k,v in words2id.items()}
wid = [[words2id[w] for w in text.text_to_word_sequence(doc)]for doc in dl_data]

In [5]:
vocab_size = len(words2id)
window_size = 2
embed_size = 10

In [6]:
def func(corpus, vocab_size,window_size):
    context_length = window_size*2
    for words in corpus:
        sent_length = len(words)
        for index,word in enumerate(words):
            context_words = []
            label_word = []
            start = index-window_size
            end = index+window_size+1
            
            context_words.append([words[i]
                                 for i in range(start,end)
                                 if 0<=i<sent_length
                                 and i!=index])
            label_word.append(word)
            
            x = pad_sequences(context_words,maxlen=context_length)
            y = to_categorical(label_word,vocab_size)
            yield(x,y)

In [7]:
model = Sequential([
    Embedding(input_dim=vocab_size,output_dim=embed_size,input_length=window_size*2),
    Lambda(lambda x: K.mean(x,axis=1),output_shape=(embed_size,)),
    Dense(vocab_size,activation="softmax")
])

In [8]:
model.compile(loss="categorical_crossentropy", optimizer="rmsprop",metrics=["accuracy"])

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 10)             1030      
                                                                 
 lambda (Lambda)             (None, 10)                0         
                                                                 
 dense (Dense)               (None, 103)               1133      
                                                                 
Total params: 2163 (8.45 KB)
Trainable params: 2163 (8.45 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
for epoch in range(1,6):
    i=0
    loss=0
    for x,y in func(corpus=wid, window_size=window_size, vocab_size=vocab_size):
        i+=1
        loss+=model.train_on_batch(x,y)[0]
        if i%100000 ==0:
            print('Processes',i,'(corpus,word) terms')
    print('Epoch: ',epoch,'\tLoss: ',loss)
    print()

Epoch:  1 	Loss:  917.4237174987793

Epoch:  2 	Loss:  908.8883762359619

Epoch:  3 	Loss:  899.744562625885

Epoch:  4 	Loss:  889.3274736404419

Epoch:  5 	Loss:  879.5410194396973



In [11]:
weights = model.get_weights()[0]
weights = weights[1:]

In [12]:
pd.DataFrame(weights, index=list(id2words.values())[1:]).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
of,0.040247,0.018244,-0.031608,0.032664,0.035354,0.012696,0.003693,0.006675,0.003767,-0.032633
influenza,0.036419,-0.029791,0.028688,-0.035375,0.046487,0.004625,-0.020219,-0.021932,-0.045724,-0.048984
covid,0.028127,-0.015217,-0.000557,-0.025198,-0.0373,0.033265,-0.00489,-0.024451,0.044504,0.04161
19,-0.093995,0.124334,0.008336,-0.100496,-0.107128,-0.055646,-0.060306,-0.033038,0.048533,-0.066946
virus,0.137034,-0.036427,-0.106467,-0.059263,0.050309,-0.092068,-0.07993,-0.040478,0.081816,-0.045498


In [13]:
distance_matrix=euclidean_distances(weights)

In [14]:
pd.DataFrame(distance_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,0.000000,0.121995,0.139622,0.287105,0.230398,0.132086,0.110336,0.131178,0.100076,0.079794,...,0.143184,0.098783,0.126662,0.102462,0.155672,0.092803,0.113730,0.093594,0.080319,0.097222
1,0.121995,0.000000,0.160219,0.289110,0.241986,0.126486,0.126845,0.097858,0.122168,0.128257,...,0.162343,0.159406,0.140426,0.134431,0.123095,0.155384,0.151018,0.155899,0.159124,0.134757
2,0.139622,0.160219,0.000000,0.260507,0.250877,0.105514,0.153222,0.104277,0.121656,0.110565,...,0.109175,0.160090,0.089850,0.149345,0.160755,0.112692,0.110171,0.105143,0.127725,0.141624
3,0.287105,0.289110,0.260507,0.000000,0.349599,0.209047,0.274133,0.259305,0.301806,0.252397,...,0.269723,0.243819,0.246628,0.240878,0.214189,0.256928,0.267217,0.271887,0.241281,0.276452
4,0.230398,0.241986,0.250877,0.349599,0.000000,0.249216,0.264272,0.257066,0.238838,0.239481,...,0.294136,0.241369,0.271369,0.279032,0.253863,0.250164,0.201481,0.204477,0.236040,0.249650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.092803,0.155384,0.112692,0.256928,0.250164,0.124273,0.109066,0.129569,0.137328,0.093210,...,0.114175,0.116023,0.115519,0.090313,0.162687,0.000000,0.122414,0.074165,0.075877,0.093062
98,0.113730,0.151018,0.110171,0.267217,0.201481,0.102300,0.135293,0.133110,0.116261,0.076514,...,0.129636,0.089962,0.096612,0.132202,0.144178,0.122414,0.000000,0.082391,0.110509,0.106392
99,0.093594,0.155899,0.105143,0.271887,0.204477,0.119704,0.132959,0.117725,0.122726,0.098329,...,0.135910,0.112446,0.107456,0.127629,0.159924,0.074165,0.082391,0.000000,0.100078,0.081282
100,0.080319,0.159124,0.127725,0.241281,0.236040,0.121194,0.098789,0.151689,0.133700,0.083750,...,0.128709,0.081974,0.126402,0.095530,0.147118,0.075877,0.110509,0.100078,0.000000,0.112625


In [15]:
inwords = input()

similar_words = {search_term: [id2words[idx] for idx in distance_matrix[words2id[search_term]-1].argsort()[0:6]]
                   for search_term in {inwords}}

similar_words

covid


{'covid': ['influenza', 'means', '19', '6', 'hours', 'viruses']}