In [45]:
import numpy as np
import keras.backend as K
from keras.layers import Dense, Embedding, Lambda
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from gensim.models import KeyedVectors

In [46]:
with open('text.txt','w') as file:
    file.write("""The speed of transmission is an important point of difference between the two viruses.
Influenza has a shorter median incubation period (the time from infection to appearance of symptoms)
and a shorter serial interval (the time between successive cases) than COVID-19 virus.
The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days.
This means that influenza can spread faster than COVID-19.

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –
transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza.
In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset,
at present, this does not appear to be a major driver of transmission.

The reproductive number – the number of secondary infections generated from one infected individual –
is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza.
However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult.
""")

In [47]:
#reading data
data = open('text.txt','r')
#extracting sentence
sentence = [text for text in data if text.count(' ')>=2]
vectorize = Tokenizer()

In [48]:
#training for sentence
vectorize.fit_on_texts(sentence)
#tokenization on sentence
sentence = vectorize.texts_to_sequences(sentence)

In [49]:
#find total no. of sentence and words
total_vocab = sum(len(s) for s in sentence)
total_words = len(vectorize.word_index)+1
window_size = 2

In [50]:
def cbow_model(data, window_size, total_vocab):
    total_length = window_size*2
    for text in data:
        text_len = len(text)
        for idx, word in enumerate(text):
            context_word = []
            target   = []
            begin = idx - window_size
            end = idx + window_size + 1
            context_word.append([text[i] for i in range(begin, end) if 0 <= i < text_len and i != idx])
            target.append(word)
            contextual = sequence.pad_sequences(context_word, total_length=total_length)
            final_target = utils.to_categorical(target, total_vocab)
            yield(contextual, final_target)

In [51]:
model = Sequential()
model.add(Embedding(input_dim=total_vocab, output_dim=100, input_length=window_size*2))
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)))
model.add(Dense(total_vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
for i in range(10):
    cost = 0
    for x, y in cbow_model(data, window_size, total_vocab):
        cost += model.train_on_batch(contextual, final_target)
    print(i, cost)

0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0


In [59]:
# Create vector file of some word for testing
dimensions=100
vect_file = open('../content/vectors.txt' ,'w')
vect_file.write('{} {}\n'.format(total_vocab,dimensions))

8

In [60]:
# Assign weights to your trained model
weights = model.get_weights()[0]
for text, i in vectorize.word_index.items():
    final_vec = ' '.join(map(str, list(weights[i, :])))
    vect_file.write('{} {}\n'.format(text, final_vec))
vect_file.close()

In [62]:
cbow_output = KeyedVectors.load_word2vec_format('../content/vectors.txt', binary=False, limit=100, encoding='latin-1')

In [63]:
cbow_output.most_similar(positive=['virus'])

[('comparisons', 0.33667847514152527),
 ('median', 0.27564844489097595),
 ('specific', 0.21048730611801147),
 ('understood', 0.17160819470882416),
 ('however', 0.1676071286201477),
 ('and', 0.16637516021728516),
 ('incubation', 0.1275118887424469),
 ('to', 0.12228561192750931),
 ('who', 0.12196187674999237),
 ('2', 0.11743484437465668)]