#Initializations

In [None]:
!pip install nltk
!pip install autocorrect
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
from nltk.corpus import gutenberg
import numpy as np
import pickle as pkl
from google.colab import drive


In [None]:
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/CS772/Assg1/'

#Dataset Creation

In [None]:
book_names = gutenberg.fileids()
sentences_list = []
for i in range(len(book_names)):
  book_sents = gutenberg.sents(book_names[i])
  sentences_list+= list(book_sents)
print(len(sentences_list))


In [None]:
from nltk import tokenize
import re
with open(data_path + f'Analogy_dataset.txt', 'r') as f:
  analogy_words = f.read().split()
analogy_words = [*set(analogy_words)]
#print(len(analogy_lines))

num_sent_max = 300

import requests
from bs4 import BeautifulSoup

for i in range(len(analogy_words)):
  wordi = analogy_words[i]
  # Making a GET request
  r = requests.get('https://en.wikipedia.org/wiki/' + wordi)
  # Parsing the HTML
  soup = BeautifulSoup(r.content, 'html.parser')
  #print(soup.prettify())
  ss = soup.find_all('div', class_ ='mw-parser-output')
  counter = 0
  indic = 0
  for s in ss:
    content = s.find_all('p')

    for line in content:
      thisline = line.text
      thissents = tokenize.sent_tokenize(thisline)
      for sentence in thissents: #checking all sentences of a particular wiki page
        if wordi in sentence:
          while not sentence[0].isalpha():  #to get a clean sentence without [21] etc from wikipedia.
            sentence = sentence[1:]
          counter = counter+1
          sentences_list.append(sentence)
          sentence = "".join(re.split("\(|\)|\[|\]", sentence)[::2])
          print(sentence)
          if counter > num_sent_max:
            indic=1
            break
      if indic == 1:
        break

In [None]:
len(sentences_list)

Data Pre-processing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import words
from autocorrect import Speller

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('omw-1.4')
  

stop_words = set(stopwords.words("english"))
english_words = set(words.words())

spell = Speller()

def preprocess_sentences(sentences):
    preprocessed_sentences = []
    for words in sentences:
        # words = word_tokenize(sentence)
        # words = [spell(word) for word in words]
        words = [word.lower() for word in words]
        words = [word for word in words if word.isalpha() and word not in stop_words and word in english_words]
        words = list(set(words))
        preprocessed_sentence = " ".join(words)
        preprocessed_sentences.append(preprocessed_sentence)
    return list(set(preprocessed_sentences))

# sentences = [
#     "This is a sentence for spell corection.",
#     "Another sentence for spell correctiong",
#     "Stop words are removed in this sentence",
#     "This sentence is lemmatized",
#     "And this sentence is deduplicated"
# ]

# preprocessed_sentences = preprocess_sentences(sentences)
# print(preprocessed_sentences)
processed_sentences = preprocess_sentences(sentences_list)

In [None]:
with open(data_path + f"processed_sentences.pkl","wb") as f:
  pkl.dump(processed_sentences, f)

#Unimportant/can be deleted

In [None]:

from tqdm import tqdm

# activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# gradient
def sigmoid_derivative(out):
    return out * (1 - out)

# Define the training function for the CBOW model
def train_cbow(sentences_list, learning_rate=0.1, epochs=100):
    # Initialize the word vectors for each word in the vocabulary
    vocab = set()
    for sentence in sentences_list:
        for word in sentence.split():
            vocab.add(word)
    vocab = list(vocab)
    print(vocab)
    vocab_size = len(vocab)
    l2_neurons = 300
    batch_size = 50



    # Context size for neighbouring words in sentence
    context = 2

    # Initialize the weight matrices for the hidden and output layers
    W1 = np.random.randn(vocab_size, l2_neurons)
    W2 = np.random.randn(l2_neurons, vocab_size)

    for epoch in range(epochs):
        # Loop through each sentence in the training data
        numtotal = 0
        for sentence in tqdm(sentences_list):
            # Initialize the hidden layer activations
            hidden_layer = np.zeros((1, l2_neurons))
            words = sentence.split()
            # Loop through each word in the sentence
            for i, word in enumerate(words):
                numtotal += 1
                # Look up the word vector for the current word
                list_of_onehot_vecs = []
                j = 1
                while(i-j>=0 and j<context):
                    target_word_vector = np.zeros((1, vocab_size))
                    target_word_vector[0, vocab.index(words[i - j])] = 1
                    list_of_onehot_vecs.append(target_word_vector)
                    j +=1
                j = 1
                while(i +j < len(words) and j<context):
                    target_word_vector = np.zeros((1, vocab_size))
                    target_word_vector[0, vocab.index(words[i + j])] = 1
                    list_of_onehot_vecs.append(target_word_vector) 
                    j +=1

                word_input_vector = np.mean(list_of_onehot_vecs, axis = 0)
                
                # Compute the input to the hidden layer
                hidden_layer_input = np.dot(word_input_vector, W1)

                # Compute the activations in the hidden layer
                hidden_layer = sigmoid(hidden_layer_input)

                # Compute the input to the output layer
                output_layer_input = np.dot(hidden_layer, W2)

                # Compute the activations in the output layer
                output_layer = sigmoid(output_layer_input)

                # Compute the target word vector
                target_word_vector = np.zeros((1, vocab_size))
                target_word_vector[0, vocab.index(words[i])] = 1
                

                # Compute the error in the output layer
                error = target_word_vector - output_layer

                # Compute the gradients in the output layer
                output_layer_delta = -error * sigmoid_derivative(output_layer)

                # Compute the error in the hidden layer
                hidden_layer_error = np.dot(output_layer_delta, W2.T)

                # Compute the gradients in the hidden layer
                hidden_layer_delta = hidden_layer_error * sigmoid_derivative(hidden_layer)

                # Update the weights
                W2 -= learning_rate * np.dot(hidden_layer.T, output_layer_delta)
                W1 -= learning_rate * np.dot(word_input_vector.T, hidden_layer_delta)


    # Return the trained word vectors and weight matrices
    return W1, W2

In [None]:

from tqdm import tqdm

# activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# gradient
def sigmoid_derivative(out):
    return out * (1 - out)

# Define the training function for the CBOW model
def train_cbow(sentences_list, learning_rate=0.1, epochs=100):
    # Initialize the word vectors for each word in the vocabulary
    vocab = set()
    for sentence in sentences_list:
        for word in sentence.split():
            vocab.add(word)
    vocab = list(vocab)
    print(vocab)
    vocab_size = len(vocab)
    l2_neurons = 300
    batch_size = 50



    # Context size for neighbouring words in sentence
    context = 2

    # Initialize the weight matrices for the hidden and output layers
    W1 = np.random.randn(vocab_size, l2_neurons)
    W2 = np.random.randn(l2_neurons, vocab_size)

    for epoch in range(epochs):
        # Loop through each sentence in the training data
        numtotal = 0
        for sentence in tqdm(sentences_list):
            # Initialize the hidden layer activations
            hidden_layer = np.zeros((1, l2_neurons))
            words = sentence.split()
            # Loop through each word in the sentence
            for i, word in enumerate(words):
                numtotal += 1
                # Look up the word vector for the current word
                list_of_onehot_vecs = []
                j = 1
                while(i-j>=0 and j<context):
                    target_word_vector = np.zeros((1, vocab_size))
                    target_word_vector[0, vocab.index(words[i - j])] = 1
                    list_of_onehot_vecs.append(target_word_vector)
                    j +=1
                j = 1
                while(i +j < len(words) and j<context):
                    target_word_vector = np.zeros((1, vocab_size))
                    target_word_vector[0, vocab.index(words[i + j])] = 1
                    list_of_onehot_vecs.append(target_word_vector) 
                    j +=1

                word_input_vector = np.mean(list_of_onehot_vecs, axis = 0)
                
                
                # Compute the input to the hidden layer
                hidden_layer_input = np.dot(word_input_vector, W1)

                if(len(list_of_onehot_vecs)==0):
                  continue
              word_input_vector = np.mean(list_of_onehot_vecs, axis = 0)
              #print("pre_vect",word_input_vector.shape)
              total_batch_vecsin.append(word_input_vector)
              target_word_vector = np.zeros((1, vocab_size))
              target_word_vector[0, vocab.index(words[i])] = 1
              total_batch_vecsout.append(target_word_vector)
              #print("total_batch",np.array(total_batch_vecsin).shape)

          if numtotal % batch_size == 0:
            # Initialize the hidden layer activations
                word_input_vector = np.array(total_batch_vecsin)
                target_word_vector = np.array(total_batch_vecsout)

                print('word_input shape', word_input_vector.shape)

                total_batch_vecsin=[]
                total_batch_vecsout=[]

                print("word_vect",word_input_vector.shape)



                # Compute the activations in the hidden layer
                hidden_layer = sigmoid(hidden_layer_input)

                # Compute the input to the output layer
                output_layer_input = np.dot(hidden_layer, W2)

                # Compute the activations in the output layer
                output_layer = sigmoid(output_layer_input)

                # Compute the target word vector
                target_word_vector = np.zeros((1, vocab_size))
                target_word_vector[0, vocab.index(words[i])] = 1
                

                # Compute the error in the output layer
                error = target_word_vector - output_layer

                # Compute the gradients in the output layer
                output_layer_delta = -error * sigmoid_derivative(output_layer)

                # Compute the error in the hidden layer
                hidden_layer_error = np.dot(output_layer_delta, W2.T)

                # Compute the gradients in the hidden layer
                hidden_layer_delta = hidden_layer_error * sigmoid_derivative(hidden_layer)

                # Update the weights
                W2 -= learning_rate * np.dot(hidden_layer.T, output_layer_delta)
                W1 -= learning_rate * np.dot(word_input_vector.T, hidden_layer_delta)


    # Return the trained word vectors and weight matrices
    return W1, W2

In [None]:
# activation function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# gradient
def sigmoid_derivative(out):
    return out * (1 - out)

# Define the training function for the CBOW model
def train_cbow_batched(sentences_list, learning_rate=0.1, epochs=100):
    # Initialize the word vectors for each word in the vocabulary
    vocab = set()
    for sentence in sentences_list:
        for word in sentence.split():
            vocab.add(word)
    vocab = list(vocab)
    print(vocab)
    vocab_size = len(vocab)
    l2_neurons = 300
    



    # Context size for neighbouring words in sentence
    context = 2

    # Initialize the weight matrices for the hidden and output layers
    W1 = np.random.randn(vocab_size, l2_neurons)
    W2 = np.random.randn(l2_neurons, vocab_size)

    batch_size = 20
    numtotal = 0

    for epoch in range(epochs):
        total_batch_vecsin=[]
        total_batch_vecsout=[]
        
        # Loop through each sentence in the training data
        for sentence in tqdm(sentences_list):
          numtotal = numtotal+1
          
          hidden_layer = np.zeros((1, l2_neurons))
          words = sentence.split()
          # Loop through each word in the sentence
          list_of_onehot_vecs = []

          for i, word in enumerate(words):
              # Look up the word vector for the current word
              j = 1
              while(i-j>=0 and j<context):
                  target_word_vector = np.zeros((1, vocab_size))
                  target_word_vector[0, vocab.index(words[i - j])] = 1
                  list_of_onehot_vecs.append(target_word_vector)
                  j +=1
              j = 1
              while(i +j < len(words) and j<context):
                  target_word_vector = np.zeros((1, vocab_size))
                  target_word_vector[0, vocab.index(words[i + j])] = 1
                  list_of_onehot_vecs.append(target_word_vector) 
                  j +=1
              
              if(len(list_of_onehot_vecs)==0):
                  continue
              word_input_vector = np.mean(list_of_onehot_vecs, axis = 0)
              #print("pre_vect",word_input_vector.shape)
              total_batch_vecsin.append(word_input_vector)
              target_word_vector = np.zeros((1, vocab_size))
              target_word_vector[0, vocab.index(words[i])] = 1
              total_batch_vecsout.append(target_word_vector)
              #print("total_batch",np.array(total_batch_vecsin).shape)

          if numtotal % batch_size == 0:
            # Initialize the hidden layer activations
                word_input_vector = np.array(total_batch_vecsin)
                target_word_vector = np.array(total_batch_vecsout)

                #print('word_input shape', word_input_vector.shape)

                total_batch_vecsin=[]
                total_batch_vecsout=[]

                #print("word_vect",word_input_vector.shape)

                # Compute the input to the hidden layer
                hidden_layer_input = np.dot(word_input_vector, W1)

                # Compute the activations in the hidden layer
                hidden_layer = sigmoid(hidden_layer_input)

                # Compute the input to the output layer
                output_layer_input = np.dot(hidden_layer, W2)

                # Compute the activations in the output layer
                output_layer = sigmoid(output_layer_input)

                # Compute the error in the output layer
                error = target_word_vector - output_layer

                # Compute the gradients in the output layer
                output_layer_delta = -error * sigmoid_derivative(output_layer)

                # Compute the error in the hidden layer
                hidden_layer_error = np.dot(output_layer_delta, W2.T)

                # Compute the gradients in the hidden layer
                hidden_layer_delta = hidden_layer_error * sigmoid_derivative(hidden_layer)

                # Update the weights
                out1 = np.zeros((hidden_layer.shape[0], hidden_layer.shape[2], output_layer_delta.shape[2]))
                out2 = np.zeros((hidden_layer.shape[0], word_input_vector.shape[2], hidden_layer_delta.shape[2]))

                def parallelize(hidden_layer, hidden_layer_delta, word_input_vcetor, output_layer_delta):
                    out1 = np.dot(hidden_layer.T, output_layer_delta)
                    out2 = np.dot(word_input_vector.T, hidden_layer_delta)

                print('hidden: ', hidden_layer.shape, 'output delta: ', output_layer_delta.shape, "input: ", word_input_vector.shape, 'hidden: ', hidden_layer_delta.shape)
                #result = functorch.vmap(parallelize)(hidden_layer, hidden_layer_delta, word_input_vector, output_layer_delta)
                
                W2 -= learning_rate * np.dot(hidden_layer.squeeze().T, output_layer_delta.squeeze())
                W1 -= learning_rate * np.dot(word_input_vector.squeeze().T, hidden_layer_delta.squeeze())
                
                # for ind in range(hidden_layer.shape[0]):
                #    out1[ind] = np.dot(hidden_layer[i].T, output_layer_delta[i])
                #    out2[ind] = np.dot(word_input_vector[i].T, hidden_layer_delta[i])
                # out1 = np.einsum('ijk,ijk->ij', hidden_layer, output_layer_delta)
                # out2 = np.einsum('ijk,ijk->ij', word_input_vector, hidden_layer_delta)

                # print("out 1 shape",np.dot(hidden_layer.reshape((hidden_layer.shape[0],hidden_layer.shape[2], hidden_layer.shape[1])), output_layer_delta).shape)
                # print("out 2 shape",np.dot(word_input_vector.reshape((word_input_vector.shape[0],word_input_vector.shape[2], word_input_vector.shape[1])), hidden_layer_delta).shape)

                # W2 -= learning_rate * np.sum(out1, axis = 0)
                # W1 -= learning_rate * np.sum(out2, axis = 0)

        break

    # Return the trained word vectors and weight matrices
    return W1, W2

In [None]:
#import numpy as np
a = [np.array([0,1,0]), np.array([1,0,0])]
print(np.mean(a, axis = 0))

In [None]:
W1, W2 = train_cbow_batched(processed_sentences, learning_rate=0.1, epochs=5)
processed_sentences[0]

with open('cbow.npy', 'wb') as f:
    np.save(f, W1)
    np.save(f, W2)

#Important stuff starts here

In [1]:
import torch
import torch.nn as nn
from tqdm import tqdm
import functorch
import pickle as pkl
from google.colab import drive
drive.mount('/content/drive')
data_path = '/content/drive/My Drive/CS772/Assg1/'



# activation function
def softmax(x):
    e_x = torch.exp(x - torch.max(x))
    return e_x / e_x.sum()

# gradient
def softmax_derivative(out, truth):
    indices = torch.argmax(truth, 1, keepdim=True)
    return  -(torch.where(truth>0, out, 0)-(torch.gather(out, 1, indices)*out))*1/(torch.gather(out, 1, indices))

# Define the training function for the CBOW model
def train_cbow_torch(sentences_list, learning_rate=0.1, epochs=100):
    # Initialize the word vectors for each word in the vocabulary
    vocab = set()
    for sentence in sentences_list:
        for word in sentence.split():
            vocab.add(word)
    vocab = list(vocab)
    print(vocab)
    vocab_size = len(vocab)
    l2_neurons = 300

    # Context size for neighbouring words in sentence
    context = 2

    # Initialize the weight matrices for the hidden and output layers
    W1 = torch.randn(vocab_size, l2_neurons, requires_grad=True)
    W2 = torch.randn(l2_neurons, vocab_size, requires_grad=True)

    batch_size = 100
    numtotal = 0

    for epoch in range(epochs):
        indic_batch = 0
        total_batch_vecsin=[]
        total_batch_vecsout=[]
        
        # Loop through each sentence in the training data
        for inum, sentence in enumerate(tqdm(sentences_list)):
          
          #Saving checkpoints to deal with colab bullshit
          if (inum+1)%(len(sentences_list)/4) == 0:
            with open(data_path + f"W1.pkl","wb") as f:
              a = [inum, epoch, W1]
              pkl.dump(a, f)
            with open(data_path + f"W2.pkl","wb") as f:
              a = [inum, epoch, W2]
              pkl.dump(a, f)
          
          hidden_layer = torch.zeros((1, l2_neurons))
          words = sentence.split()
          # Loop through each word in the sentence
          list_of_onehot_vecs = []

          for i, word in enumerate(words):
              numtotal = numtotal+1
              if ((numtotal+1) % batch_size) == 0:
                indic_batch = 1
              # Look up the word vector for the current word
              j = 1
              while(i-j>=0 and j<context):
                  target_word_vector = torch.zeros((1, vocab_size))
                  target_word_vector[0, vocab.index(words[i - j])] = 1
                  list_of_onehot_vecs.append(target_word_vector)
                  j +=1
              j = 1
              while(i +j < len(words) and j<context):
                  target_word_vector = torch.zeros((1, vocab_size))
                  target_word_vector[0, vocab.index(words[i + j])] = 1
                  list_of_onehot_vecs.append(target_word_vector) 
                  j +=1
              
              if(len(list_of_onehot_vecs)==0):
                  continue
              word_input_vector = torch.mean(torch.cat(list_of_onehot_vecs, dim=0), dim=0).unsqueeze(0)
              total_batch_vecsin.append(word_input_vector)
              target_word_vector = torch.zeros((1, vocab_size))
              target_word_vector[0, vocab.index(words[i])] = 1
              total_batch_vecsout.append(target_word_vector)
              #print(target_word_vector.shape)

          if indic_batch == 1:
              # Initialize the hidden layer activations
              if (numtotal+1)//batch_size % 20 == 0:
                print(" Batch number : ", (numtotal+1)//batch_size, " of Epoch number : ", epoch) 

              word_input_vector = torch.cat(total_batch_vecsin, dim=0)
              target_word_vector = torch.cat(total_batch_vecsout, dim=0)
              
              print("clean", inum)
              total_batch_vecsin=[]
              total_batch_vecsout=[]

              # Compute the input to the hidden layer
              hidden_layer = torch.matmul(word_input_vector, W1)

              # Compute the input to the output layer
              output_layer_input = torch.matmul(hidden_layer, W2)

              # Compute the activations in the output layer
              output_layer = softmax(output_layer_input)

              # Compute the gradients in the output layer
              output_layer_delta = softmax_derivative(output_layer, target_word_vector)   #this fn includes both dL/dO and dO/dNet

              # Compute the error in the hidden layer
              hidden_layer_error = torch.matmul(output_layer_delta, W2.t())

              W2 = W2 - learning_rate * torch.matmul(torch.squeeze(hidden_layer).t(), torch.squeeze(output_layer_delta))
              W1 = W1 - learning_rate * torch.matmul(torch.squeeze(word_input_vector).t(), torch.squeeze(hidden_layer_error))

              indic_batch = 0

        break

    # Return the trained word vectors and weight matrices
    return W1, W2


Mounted at /content/drive


In [None]:
with open(data_path + f"processed_sentences.pkl","rb") as f:
  processed_sentences = pkl.load(f)

W1, W2 = train_cbow_torch(processed_sentences, learning_rate=0.1, epochs=5)
processed_sentences[0]

with open('cbow.npy', 'wb') as f:
    np.save(f, W1)
    np.save(f, W2)



  0%|          | 11/92385 [00:00<14:27, 106.43it/s]

clean 20


  0%|          | 22/92385 [00:00<1:06:04, 23.30it/s]

clean 31


  0%|          | 32/92385 [00:01<1:11:01, 21.67it/s]

clean 42


  0%|          | 43/92385 [00:01<1:07:06, 22.94it/s]

clean 53


  0%|          | 54/92385 [00:02<1:05:58, 23.32it/s]

clean 65


  0%|          | 66/92385 [00:02<1:02:24, 24.65it/s]

clean 71


  0%|          | 72/92385 [00:03<1:16:10, 20.20it/s]

clean 81


  0%|          | 82/92385 [00:03<1:17:41, 19.80it/s]

clean 92


  0%|          | 93/92385 [00:04<1:06:19, 23.19it/s]

clean 105


  0%|          | 106/92385 [00:04<59:35, 25.81it/s] 

clean 115


  0%|          | 116/92385 [00:04<1:01:45, 24.90it/s]

clean 127


  0%|          | 128/92385 [00:05<1:00:49, 25.28it/s]

clean 142


  0%|          | 143/92385 [00:05<55:28, 27.71it/s]  

clean 154


  0%|          | 155/92385 [00:06<54:46, 28.06it/s]

clean 167


  0%|          | 168/92385 [00:06<53:21, 28.81it/s]

clean 180


  0%|          | 181/92385 [00:07<52:39, 29.18it/s]

clean 192


  0%|          | 193/92385 [00:07<54:35, 28.14it/s]

clean 202


  0%|          | 203/92385 [00:07<56:26, 27.22it/s]

clean 217


  0%|          | 218/92385 [00:08<51:36, 29.76it/s]

clean 229


  0%|          | 230/92385 [00:08<52:45, 29.12it/s]

 Batch number :  20  of Epoch number :  0
clean 237


  0%|          | 238/92385 [00:09<55:56, 27.46it/s]

clean 249


  0%|          | 250/92385 [00:09<54:15, 28.30it/s]

clean 263


  0%|          | 270/92385 [00:10<53:49, 28.52it/s]

clean 272


  0%|          | 274/92385 [00:10<1:24:42, 18.12it/s]

clean 283


  0%|          | 284/92385 [00:11<1:36:55, 15.84it/s]

clean 297


  0%|          | 308/92385 [00:12<1:02:34, 24.52it/s]

clean 308


  0%|          | 313/92385 [00:12<1:19:50, 19.22it/s]

clean 322


  0%|          | 323/92385 [00:13<1:09:33, 22.06it/s]

clean 334


  0%|          | 335/92385 [00:13<1:06:37, 23.03it/s]

clean 348


  0%|          | 349/92385 [00:14<58:53, 26.05it/s]  

clean 363


  0%|          | 364/92385 [00:14<51:21, 29.87it/s]

clean 371


  0%|          | 372/92385 [00:15<1:00:53, 25.18it/s]

clean 386


  0%|          | 387/92385 [00:15<51:09, 29.97it/s]  

clean 398


  0%|          | 399/92385 [00:15<56:12, 27.28it/s]

clean 407


  0%|          | 408/92385 [00:16<1:05:24, 23.44it/s]

clean 417


  0%|          | 418/92385 [00:16<1:04:49, 23.65it/s]

clean 430


  0%|          | 431/92385 [00:17<1:06:55, 22.90it/s]

clean 442


  0%|          | 443/92385 [00:18<1:11:14, 21.51it/s]

clean 455


  0%|          | 456/92385 [00:18<1:03:16, 24.21it/s]

clean 468


  1%|          | 469/92385 [00:18<59:55, 25.57it/s]  

 Batch number :  40  of Epoch number :  0
clean 479


  1%|          | 480/92385 [00:19<1:00:37, 25.27it/s]

clean 491


  1%|          | 492/92385 [00:19<59:58, 25.54it/s]  

clean 505


  1%|          | 506/92385 [00:20<57:36, 26.58it/s]

clean 516


  1%|          | 517/92385 [00:20<55:33, 27.56it/s]

clean 530


  1%|          | 531/92385 [00:21<53:15, 28.75it/s]

clean 539


  1%|          | 540/92385 [00:21<59:17, 25.82it/s]

clean 556


  1%|          | 557/92385 [00:22<51:33, 29.68it/s]

clean 566


  1%|          | 567/92385 [00:22<52:33, 29.12it/s]

clean 576


  1%|          | 577/92385 [00:22<1:01:13, 24.99it/s]

clean 587


  1%|          | 588/92385 [00:23<1:08:04, 22.47it/s]

clean 595


  1%|          | 596/92385 [00:24<1:23:43, 18.27it/s]

clean 609


  1%|          | 610/92385 [00:25<1:23:01, 18.42it/s]

clean 625


  1%|          | 626/92385 [00:25<1:15:59, 20.13it/s]

clean 636


  1%|          | 637/92385 [00:26<1:08:50, 22.21it/s]

clean 648


  1%|          | 649/92385 [00:26<1:04:13, 23.80it/s]

clean 661


  1%|          | 662/92385 [00:26<1:00:43, 25.18it/s]

clean 670


  1%|          | 671/92385 [00:27<1:05:58, 23.17it/s]

clean 682


  1%|          | 683/92385 [00:27<58:05, 26.31it/s]  

clean 688


  1%|          | 689/92385 [00:28<1:15:37, 20.21it/s]

clean 694


  1%|          | 695/92385 [00:28<1:18:10, 19.55it/s]

 Batch number :  60  of Epoch number :  0
clean 706


  1%|          | 707/92385 [00:29<1:05:35, 23.29it/s]

clean 720


  1%|          | 732/92385 [00:29<46:56, 32.54it/s]  

clean 732


  1%|          | 738/92385 [00:30<1:08:57, 22.15it/s]

clean 744


  1%|          | 745/92385 [00:30<1:27:03, 17.54it/s]

clean 754


  1%|          | 755/92385 [00:31<1:33:52, 16.27it/s]

clean 766


  1%|          | 767/92385 [00:32<1:33:27, 16.34it/s]

clean 780


  1%|          | 781/92385 [00:32<1:16:29, 19.96it/s]

clean 797


  1%|          | 798/92385 [00:33<1:00:05, 25.40it/s]

clean 808


  1%|          | 809/92385 [00:33<59:21, 25.72it/s]  

clean 817


  1%|          | 818/92385 [00:34<1:04:11, 23.78it/s]

clean 824


  1%|          | 825/92385 [00:34<1:07:45, 22.52it/s]

clean 833


  1%|          | 834/92385 [00:34<1:12:14, 21.12it/s]

clean 841


  1%|          | 842/92385 [00:35<1:15:35, 20.18it/s]

clean 852


  1%|          | 853/92385 [00:36<1:20:48, 18.88it/s]

clean 865


  1%|          | 866/92385 [00:36<1:14:51, 20.38it/s]

clean 876


  1%|          | 888/92385 [00:37<58:31, 26.06it/s]  

clean 888
clean 892


  1%|          | 893/92385 [00:38<1:56:24, 13.10it/s]

clean 902


  1%|          | 903/92385 [00:39<1:40:07, 15.23it/s]

clean 915


  1%|          | 916/92385 [00:39<1:21:43, 18.65it/s]

 Batch number :  80  of Epoch number :  0
clean 926


  1%|          | 927/92385 [00:39<1:13:15, 20.81it/s]

clean 941


  1%|          | 942/92385 [00:40<1:03:23, 24.04it/s]

clean 957


  1%|          | 958/92385 [00:40<55:18, 27.55it/s]  

clean 969


  1%|          | 970/92385 [00:41<56:36, 26.92it/s]

clean 982


  1%|          | 983/92385 [00:41<53:33, 28.45it/s]

clean 994


  1%|          | 995/92385 [00:42<55:12, 27.59it/s]

clean 1009


  1%|          | 1010/92385 [00:42<50:27, 30.18it/s]

clean 1017


  1%|          | 1018/92385 [00:42<55:26, 27.47it/s]

clean 1027


  1%|          | 1028/92385 [00:43<1:01:22, 24.81it/s]

clean 1039


  1%|          | 1040/92385 [00:44<1:04:12, 23.71it/s]

clean 1046


  1%|          | 1047/92385 [00:44<1:03:38, 23.92it/s]

clean 1058


  1%|          | 1059/92385 [00:44<58:42, 25.93it/s]  

clean 1070


  1%|          | 1071/92385 [00:45<56:39, 26.86it/s]

clean 1082


  1%|          | 1083/92385 [00:45<1:10:52, 21.47it/s]

clean 1094


  1%|          | 1095/92385 [00:46<1:07:15, 22.62it/s]

clean 1103


  1%|          | 1104/92385 [00:46<1:09:10, 21.99it/s]

clean 1115


  1%|          | 1116/92385 [00:47<1:02:45, 24.24it/s]

clean 1131


  1%|          | 1132/92385 [00:47<51:19, 29.64it/s]  

clean 1139


  1%|          | 1140/92385 [00:47<56:53, 26.73it/s]

clean 1149


  1%|          | 1150/92385 [00:48<57:59, 26.22it/s]

 Batch number :  100  of Epoch number :  0
clean 1160


  1%|▏         | 1161/92385 [00:48<59:47, 25.43it/s]

clean 1171


  1%|▏         | 1172/92385 [00:49<1:06:00, 23.03it/s]

clean 1183


  1%|▏         | 1195/92385 [00:50<53:14, 28.54it/s]  

clean 1196


  1%|▏         | 1200/92385 [00:50<1:10:10, 21.66it/s]

clean 1208


  1%|▏         | 1209/92385 [00:51<1:22:32, 18.41it/s]

clean 1224


  1%|▏         | 1225/92385 [00:51<1:11:43, 21.18it/s]

clean 1234


  1%|▏         | 1235/92385 [00:52<1:05:41, 23.12it/s]

clean 1245


  1%|▏         | 1246/92385 [00:52<1:03:02, 24.10it/s]

clean 1258


  1%|▏         | 1259/92385 [00:52<56:08, 27.05it/s]  

clean 1272


  1%|▏         | 1273/92385 [00:53<50:32, 30.04it/s]

clean 1285


  1%|▏         | 1286/92385 [00:53<48:16, 31.45it/s]

clean 1297


  1%|▏         | 1298/92385 [00:54<49:40, 30.56it/s]

clean 1307


  1%|▏         | 1308/92385 [00:54<50:23, 30.13it/s]

clean 1314


  1%|▏         | 1315/92385 [00:55<1:06:31, 22.81it/s]

clean 1323


  1%|▏         | 1324/92385 [00:55<1:01:08, 24.82it/s]

clean 1336


  1%|▏         | 1337/92385 [00:55<55:09, 27.51it/s]  

clean 1348


  1%|▏         | 1349/92385 [00:56<52:59, 28.63it/s]

clean 1364


  1%|▏         | 1365/92385 [00:56<45:08, 33.60it/s]

clean 1376


  1%|▏         | 1377/92385 [00:56<46:47, 32.42it/s]

clean 1391


  2%|▏         | 1392/92385 [00:57<43:04, 35.20it/s]

 Batch number :  120  of Epoch number :  0
clean 1402


  2%|▏         | 1403/92385 [00:57<48:25, 31.31it/s]

clean 1412


  2%|▏         | 1413/92385 [00:58<49:05, 30.88it/s]

clean 1424


  2%|▏         | 1425/92385 [00:58<50:07, 30.24it/s]

clean 1438


  2%|▏         | 1439/92385 [00:58<46:37, 32.51it/s]

clean 1451


  2%|▏         | 1452/92385 [00:59<45:28, 33.33it/s]

clean 1462


  2%|▏         | 1463/92385 [00:59<46:43, 32.43it/s]

clean 1473


  2%|▏         | 1474/92385 [00:59<50:01, 30.29it/s]

clean 1482


  2%|▏         | 1483/92385 [01:00<57:05, 26.54it/s]

clean 1491


  2%|▏         | 1492/92385 [01:00<58:51, 25.74it/s]

clean 1501


  2%|▏         | 1502/92385 [01:01<56:52, 26.64it/s]

clean 1512


  2%|▏         | 1513/92385 [01:01<53:30, 28.30it/s]

clean 1529


  2%|▏         | 1530/92385 [01:01<46:18, 32.69it/s]

clean 1539


  2%|▏         | 1540/92385 [01:02<56:33, 26.77it/s]

clean 1554


  2%|▏         | 1555/92385 [01:03<58:56, 25.68it/s]

clean 1562


  2%|▏         | 1570/92385 [01:03<1:01:35, 24.57it/s]

clean 1570


  2%|▏         | 1579/92385 [01:04<1:12:37, 20.84it/s]

clean 1579


  2%|▏         | 1583/92385 [01:05<1:43:30, 14.62it/s]

clean 1591


  2%|▏         | 1592/92385 [01:05<1:28:43, 17.05it/s]

clean 1603


  2%|▏         | 1604/92385 [01:05<1:10:44, 21.39it/s]

clean 1615


  2%|▏         | 1616/92385 [01:06<1:01:43, 24.51it/s]

 Batch number :  140  of Epoch number :  0
clean 1626


  2%|▏         | 1627/92385 [01:06<1:01:33, 24.57it/s]

clean 1637


  2%|▏         | 1638/92385 [01:07<57:38, 26.24it/s]  

clean 1646


  2%|▏         | 1647/92385 [01:07<1:01:40, 24.52it/s]

clean 1657


  2%|▏         | 1658/92385 [01:07<58:58, 25.64it/s]  

clean 1670


  2%|▏         | 1671/92385 [01:08<51:51, 29.16it/s]

clean 1686


  2%|▏         | 1687/92385 [01:08<49:25, 30.58it/s]

clean 1697


  2%|▏         | 1698/92385 [01:08<47:26, 31.86it/s]

clean 1711


  2%|▏         | 1712/92385 [01:09<46:07, 32.77it/s]

clean 1721


  2%|▏         | 1722/92385 [01:09<46:49, 32.27it/s]

clean 1732


  2%|▏         | 1733/92385 [01:10<50:40, 29.82it/s]

clean 1740


  2%|▏         | 1741/92385 [01:10<54:58, 27.48it/s]

clean 1750


  2%|▏         | 1751/92385 [01:10<55:15, 27.33it/s]

clean 1753


  2%|▏         | 1754/92385 [01:11<1:12:33, 20.82it/s]

clean 1763


  2%|▏         | 1764/92385 [01:11<1:10:13, 21.51it/s]

clean 1777


  2%|▏         | 1790/92385 [01:14<3:21:27,  7.49it/s]

clean 1790


  2%|▏         | 1791/92385 [01:16<8:44:42,  2.88it/s]

clean 1801


  2%|▏         | 1802/92385 [01:16<3:31:51,  7.13it/s]

clean 1813


  2%|▏         | 1814/92385 [01:17<2:21:47, 10.65it/s]

clean 1832


In [None]:
 with open('cbow.npy', 'rb') as f:
    W1 = np.load(f)
    W2 = np.load(f)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

The vocabulary size is-

In [None]:
len(word_vec)

Analogy Operation

In [None]:
def word_vector(word):
  vec = np.zeros((1, vocab_size))
  vec[0, vocab.index(word)] = 1
  return vec


def valid_analogy_CBOW(W1,W2):

  filename = "Analogy_dataset.txt"

  with open(filename, "r") as file:

    corr = 0
    tot = 0

    for line in file:
        
      [x,y,a,b] = line.split()

      x_vec = word_vector(x)
      y_vec = word_vector(y)
      a_vec = word_vector(a)
      b_vec = word_vector(b)

      input_vec = x_vec + b_vec - a_vec 

      hidden_layer_input = np.dot(input_vec, W1)
      # Compute the activations in the hidden layer
      hidden_layer = sigmoid(hidden_layer_input)
      # Compute the input to the output layer
      output_layer_input = np.dot(hidden_layer, W2)
      # Compute the activations in the output layer
      output_layer = sigmoid(output_layer_input)

      max_value = max(output_layer)
      y_pred = output_layer.index(max_value)

      if y_pred == y:
        corr+=1
      tot+=1

      acc_analogy = corr/tot

      return acc_analogy


In [None]:
##def validate_analogy_skipgram(W1,W2):


In [None]:
# Skip-gram
def train_skipgram(sentences_list, learning_rate=0.1, epochs=1000):
    # Initialize the word vectors for each word in the vocabulary
    vocab = set()
    for sentence in sentences_list:
        for word in sentence.split():
            vocab.add(word)
    vocab = list(vocab)
    print(vocab)
    vocab_size = len(vocab)
    l2_neurons = 300

    # Context size for neighbouring words in sentence
    context = 2

    # Initialize the weight matrices for the hidden and output layers
    W1 = np.random.randn(vocab_size, l2_neurons)
    W2 = np.random.randn(l2_neurons, vocab_size)

    for epoch in range(epochs):
        # Loop through each sentence in the training data
        for sentence in tqdm(sentences_list):

            # Initialize the hidden layer activations
            hidden_layer = np.zeros((1, l2_neurons))
            words = sentence.split()
            # Loop through each word in the sentence
            for i, word in enumerate(words):
                # Look up the word vector for the current word
                list_of_onehot_vecs = []
                j = 1
                while(i-j>=0 and j<context):
                    target_word_vector = np.zeros((1, vocab_size))
                    target_word_vector[0, vocab.index(words[i - j])] = 1
                    list_of_onehot_vecs.append(target_word_vector)
                    j +=1
                j = 1
                while(i +j < len(words) and j<context):
                    target_word_vector = np.zeros((1, vocab_size))
                    target_word_vector[0, vocab.index(words[i + j])] = 1
                    list_of_onehot_vecs.append(target_word_vector) 
                    j +=1

                word_input_vector = np.zeros((1, vocab_size))
                word_input_vector[0, vocab.index(words[i])] = 1
                
                
                for k in range(len(list_of_onehot_vecs)):
                  # Compute the input to the hidden layer
                  hidden_layer_input = np.dot(word_input_vector, W1)
                  hidden_layer = sigmoid(hidden_layer_input)
                  output_layer_input = np.dot(hidden_layer, W2)
                  output_layer = sigmoid(output_layer_input)

                  target_word_vector = list_of_onehot_vecs[k]

                  # Compute the error in the output layer
                  error = target_word_vector - output_layer
                  # Compute the gradients in the output layer
                  output_layer_delta = -error * sigmoid_derivative(output_layer)
                  # Compute the error in the hidden layer
                  hidden_layer_error = np.dot(output_layer_delta, W2.T)
                  # Compute the gradients in the hidden layer
                  hidden_layer_delta = hidden_layer_error * sigmoid_derivative(hidden_layer)
                  # Update the weights
                  W2 -= learning_rate * np.dot(hidden_layer.T, output_layer_delta)
                  W1 -= learning_rate * np.dot(word_input_vector.T, hidden_layer_delta)


    # Return the trained word vectors and weight matrices
    return W1, W2


In [None]:
W1, W2 = train_skipgram(processed_sentences, learning_rate=0.1, epochs=5)
processed_sentences[0]
