<a href="https://colab.research.google.com/github/aditya161205/NLP/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Coding a word2vec encoder**

work flow:

*   Import the libraries and create a sample corpus
*   Create Tokenizer function
*   creating vocab
*   creating word2idx, idx2word
*   doing one_hot_encoding
*   generating skip gram pairs
*   initializing the model
*   coding forward pass
*   backpropogation
*   training loop





In [92]:
import numpy as np
import re

In [93]:
corpus = "This is my word to vector code."

In [94]:
def tokenizer(corpus):
  corpus=corpus.lower()
  corpus=re.sub(r'[^\w\s]',"",corpus)
  tokens=corpus.split()
  return tokens

In [95]:
tokens=tokenizer(corpus)
vocab=set(tokens)
print(vocab)

n=len(vocab)



{'to', 'code', 'my', 'vector', 'word', 'is', 'this'}


In [96]:
word2idx={}

for i,word in enumerate(vocab):
  word2idx[word]=i

print(word2idx)

{'to': 0, 'code': 1, 'my': 2, 'vector': 3, 'word': 4, 'is': 5, 'this': 6}


In [97]:
idx2word={}
for idx,word in enumerate(vocab):
  idx2word[idx]=word

print(idx2word)


{0: 'to', 1: 'code', 2: 'my', 3: 'vector', 4: 'word', 5: 'is', 6: 'this'}


In [98]:
def one_hot_encoding(word):
  vec=np.zeros(n)
  vec[word2idx[word]]=1
  return vec

print(one_hot_encoding("code"))

[0. 1. 0. 0. 0. 0. 0.]


Creating skip gram pairs

In [99]:
def generate_skip_gram_pairs(tokens,window_size=2):
  pairs=[]
  for i,center in enumerate(tokens):
    for j in range(max(0,i-window_size),min(n,i+window_size)):
      if(j!=i):
        pairs.append((center,tokens[j]))
  return pairs
pairs=generate_skip_gram_pairs(tokens)
print(pairs)

[('this', 'is'), ('is', 'this'), ('is', 'my'), ('my', 'this'), ('my', 'is'), ('my', 'word'), ('word', 'is'), ('word', 'my'), ('word', 'to'), ('to', 'my'), ('to', 'word'), ('to', 'vector'), ('vector', 'word'), ('vector', 'to'), ('vector', 'code'), ('code', 'to'), ('code', 'vector')]


In [100]:
embedding_dim=7

W1=np.random.rand(n,embedding_dim)
W2=np.random.rand(embedding_dim,n)

In [101]:
def softmax(x):
  e_x=np.exp(x-max(x))
  return e_x/np.sum(e_x,axis=0)

def forward(x):
  h=np.dot(W1.T,x)
  u=np.dot(W2.T,h)

  return softmax(u),h


In [102]:
def cross_entropy(y_pred, y_true):
    return -np.sum(y_true * np.log(y_pred + 1e-8))

In [103]:
def backprop(x,h,y_pred,y_true,lr=0.01):
  global W1,W2
  loss=cross_entropy(y_pred,y_true)
  error=y_pred-y_true

  dW2=np.outer(h,error)
  dW1=np.outer(x,np.dot(W2,error))


  W1-=lr*dW1
  W2-=lr*dW2

In [104]:
epochs = 2000

for epoch in range(epochs):
    loss = 0
    for center, context in pairs:
        x = one_hot_encoding(center)
        y_true = one_hot_encoding(context)

        y_pred, h = forward(x)
        loss += cross_entropy(y_pred, y_true)
        backprop(x, h, y_pred, y_true)

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")


Epoch 0, Loss: 34.3848
Epoch 100, Loss: 22.8281
Epoch 200, Loss: 18.0605
Epoch 300, Loss: 17.2028
Epoch 400, Loss: 16.9658
Epoch 500, Loss: 16.8733
Epoch 600, Loss: 16.8298
Epoch 700, Loss: 16.8070
Epoch 800, Loss: 16.7942
Epoch 900, Loss: 16.7866
Epoch 1000, Loss: 16.7819
Epoch 1100, Loss: 16.7789
Epoch 1200, Loss: 16.7769
Epoch 1300, Loss: 16.7755
Epoch 1400, Loss: 16.7744
Epoch 1500, Loss: 16.7736
Epoch 1600, Loss: 16.7729
Epoch 1700, Loss: 16.7722
Epoch 1800, Loss: 16.7715
Epoch 1900, Loss: 16.7709


In [105]:
def get_embedding(word):
    return W1[word2idx[word]]


In [106]:
get_embedding("code")


array([ 0.92162586,  1.72369766,  0.30251172,  0.98986981, -0.71815806,
        0.88116023, -1.31081949])