<a href="https://colab.research.google.com/github/adityakangune/IT-LP-4-Codes-/blob/main/Chit_6_CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Chit 6**

### *Name: Aditya Kangune*
### *Roll number: 43321*
### *Batch: P11*

*Problem Statement:*

    Implement the Continuous Bag of Words (CBOW) Model for the given (textual document 1) using the below steps:
    a. Data preparation
    b. Generate training data
    c. Train model
    d. Output



Note: Before running the notebook, save the following text in a file named corona.txt (the code below reads it from /content/corona.txt).

In [None]:
"""
The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 

The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult.  
"""

# Importing libraries

In [None]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import gensim

In [None]:
# Read the corpus and keep only lines containing at least two spaces
# (i.e. at least three whitespace-separated tokens); this drops blank lines.
# The context manager closes the file as soon as the lines have been read,
# and the explicit encoding keeps the non-ASCII dashes in the text intact.
with open('/content/corona.txt', 'r', encoding='utf-8') as data:
    corona_data = [text for text in data if text.count(' ') >= 2]

# Tokenizer that will map every word of the corpus to an integer index.
vectorize = Tokenizer()

## Fit data to tokenizer


In [None]:
# Build the word -> integer index mapping from the filtered corpus lines.
vectorize.fit_on_texts(corona_data)
# Replace every line of text by its list of word indices.
# NOTE: this rebinds corona_data from strings to integer sequences, so the
# cell is not idempotent -- re-run the loading cell before re-running this one.
corona_data = vectorize.texts_to_sequences(corona_data)

In [None]:
# Corpus statistics and hyper-parameters.

# Total number of tokens in the corpus (informational only).
total_tokens = sum(len(s) for s in corona_data)

# Vocabulary size. The +1 accounts for index 0, which word_index never
# assigns to a word and which pad_sequences uses as the padding value.
word_count = len(vectorize.word_index) + 1

# total_vocab is used downstream as the number of embedding rows and of
# softmax / one-hot classes, so it must be the vocabulary size. It used to be
# the token count, which over-sizes the model and is too small by one when
# every word in the corpus is unique.
total_vocab = word_count

# Number of context words taken on each side of the target word.
window_size = 2

In [None]:
# Generate the pairs of context words and target words
def cbow_model(data, window_size, total_vocab):
    """Yield one (context, target) training pair per token in ``data``.

    Parameters
    ----------
    data : iterable of sequences of int
        Tokenised sentences; each sentence is a sequence of word indices.
    window_size : int
        Number of neighbouring words taken on each side of the target word.
    total_vocab : int
        Number of classes used for the one-hot encoding of the target word.

    Yields
    ------
    tuple
        ``(contextual, final_target)`` where ``contextual`` is the batch of
        one context window padded to ``2 * window_size`` entries and
        ``final_target`` is the one-hot encoding of the target word.
    """
    total_length = window_size * 2
    for text in data:
        text_len = len(text)
        for idx, word in enumerate(text):
            # Clamp the window to the sentence boundaries.
            begin = max(idx - window_size, 0)
            end = min(idx + window_size + 1, text_len)
            # Single-sample batch: every neighbour in the window except the
            # target word itself.
            context_word = [[text[i] for i in range(begin, end) if i != idx]]
            target = [word]
            # pad_sequences takes `maxlen` (there is no `total_length`
            # keyword); windows cut short at a sentence edge are padded so
            # that every sample has the same length.
            contextual = sequence.pad_sequences(context_word, maxlen=total_length)
            final_target = np_utils.to_categorical(target, total_vocab)
            yield (contextual, final_target)

Create the neural network model with the following parameters:

    Model type: Sequential
    
    Layers: Embedding, Lambda, Dense

    Compile options: (loss='categorical_crossentropy', optimizer='adam')

In [None]:
# CBOW network: embed the context words, average their embeddings, then
# predict the target word with a softmax over the vocabulary.
model = Sequential()
model.add(Embedding(input_dim=total_vocab, output_dim=100, input_length=window_size*2))
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)))
model.add(Dense(total_vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train for 10 epochs, one (context, target) pair per batch, and report the
# loss accumulated over each epoch.
for i in range(10):
    cost = 0
    # Iterate over the tokenised sentences in corona_data. The previous code
    # passed `data`, the already-consumed file handle, so the generator
    # yielded nothing and the loss stayed at 0.
    for x, y in cbow_model(corona_data, window_size, total_vocab):
        # Train on the pair produced by the generator (x, y).
        cost += model.train_on_batch(x, y)
    print(i, cost)

0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0


In [None]:
# Create the vector file (word2vec text format) used for testing.
# The first line of that format is "<number of vectors> <vector size>".
dimensions=100
# The handle is intentionally left open: the next cell writes the vectors
# and closes it.
vect_file = open('/content/vectors.txt' ,'w')
# The header must announce the number of rows actually written, i.e. one per
# entry of vectorize.word_index, not total_vocab.
vect_file.write('{} {}\n'.format(len(vectorize.word_index),dimensions))

8

In [None]:

# Export the trained weights: one line per vocabulary word, holding the
# word followed by the space-separated values of its row in the first
# weight array of the model.
weights = model.get_weights()[0]
for token, token_id in vectorize.word_index.items():
    row_values = (str(value) for value in weights[token_id, :])
    vect_file.write('{} {}\n'.format(token, ' '.join(row_values)))
# Close the file opened in the previous cell so the data is flushed to disk.
vect_file.close()

In [None]:
# Load the exported vectors into gensim (text format, at most 100 vectors).
cbow_output = gensim.models.KeyedVectors.load_word2vec_format(
    '/content/vectors.txt',
    binary=False,
    limit=100,
)

In [None]:
# Choose the query word and display the words whose vectors are most similar.
query_words = ['virus']
cbow_output.most_similar(positive=query_words)

[('19', 0.2752428650856018),
 ('24', 0.2098691165447235),
 ('of', 0.1796838790178299),
 ('between', 0.16798871755599976),
 ('in', 0.14854368567466736),
 ('period', 0.14530067145824432),
 ('symptomatic', 0.14341437816619873),
 ('further', 0.12186173349618912),
 ('influenza', 0.11704269051551819),
 ('appearance', 0.1145254373550415)]

In [None]:
# #tokenization
# tokenizer = text.Tokenizer()
# tokenizer.fit_on_texts(dl_data)
# word2id = tokenizer.word_index

# word2id['PAD'] = 0
# id2word = {v:k for k, v in word2id.items()}
# wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

# vocab_size = len(word2id)
# embed_size = 100
# window_size = 2 

# print('Vocabulary Size:', vocab_size)
# print('Vocabulary Sample:', list(word2id.items())[:10])

In [None]:
# #generating (context word, target/label word) pairs
# def generate_context_word_pairs(corpus, window_size, vocab_size):
#     context_length = window_size*2
#     for words in corpus:
#         sentence_length = len(words)
#         for index, word in enumerate(words):
#             context_words = []
#             label_word   = []            
#             start = index - window_size
#             end = index + window_size + 1
            
#             context_words.append([words[i] 
#                                  for i in range(start, end) 
#                                  if 0 <= i < sentence_length 
#                                  and i != index])
#             label_word.append(word)

#             x = pad_sequences(context_words, maxlen=context_length)
#             y = np_utils.to_categorical(label_word, vocab_size)
#             yield (x, y)
            
# i = 0
# for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
#     if 0 not in x[0]:
#         # print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
#         if i == 10:
#             break
#         i += 1

In [None]:
# #model building
# import keras.backend as K
# from keras.models import Sequential
# from keras.layers import Dense, Embedding, Lambda

# cbow = Sequential()
# cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
# cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
# cbow.add(Dense(vocab_size, activation='softmax'))
# cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# print(cbow.summary())

# # from IPython.display import SVG
# # from keras.utils.vis_utils import model_to_dot

# # SVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False, rankdir='TB').create(prog='dot', format='svg'))

In [None]:
# for epoch in range(1, 6):
#     loss = 0.
#     i = 0
#     for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
#         i += 1
#         loss += cbow.train_on_batch(x, y)
#         if i % 100000 == 0:
#             print('Processed {} (context, word) pairs'.format(i))

#     print('Epoch:', epoch, '\tLoss:', loss)
#     print()

In [None]:
# weights = cbow.get_weights()[0]
# weights = weights[1:]
# print(weights.shape)

# pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

In [None]:
# from sklearn.metrics.pairwise import euclidean_distances

# distance_matrix = euclidean_distances(weights)
# print(distance_matrix.shape)

# similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
#                    for search_term in ['disease']}

# similar_words