<a href="https://colab.research.google.com/github/bhushanmandava/PyTorch_fundamentals/blob/main/vanillaWordEmbeding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import re, torch
import torch.nn as nn

In [36]:
torch.manual_seed(42)

<torch._C.Generator at 0x7e903d55ab30>

In [37]:
docs = [
    "Movies are fun for everyone.",
    "Watching movies is great fun.",
    "Research is interesting and important.",
    "Learning math is very important.",
    "Science discovery is interesting.",
    "Cinema Cinema Cinema Science Science Science Music Music Music Music",
    "Rock is great to listen to.",
    "Listen to music for fun.",
    "Music is fun for everyone.",
    "Listen to folk music!",
]
labels =[1,1,1,3,3,3,2,2,2,2]
num_classes =len(set(labels))# taking all the unique val
print(num_classes)

3


In [38]:
#now starting with our vocab
def tokenize(text):
  return re.findall(r"\w+",text.lower())
def create_vocab(texts):
  tokens = {token for text in texts for token in tokenize(text)}
  # now we have tokens
  return {word:idx for idx,word in enumerate(sorted(tokens))}
v = create_vocab(docs)

In [39]:
v

{'and': 0,
 'are': 1,
 'cinema': 2,
 'discovery': 3,
 'everyone': 4,
 'folk': 5,
 'for': 6,
 'fun': 7,
 'great': 8,
 'important': 9,
 'interesting': 10,
 'is': 11,
 'learning': 12,
 'listen': 13,
 'math': 14,
 'movies': 15,
 'music': 16,
 'research': 17,
 'rock': 18,
 'science': 19,
 'to': 20,
 'very': 21,
 'watching': 22}

In [40]:
#now converting or doc to feature vector
def doctobow(data,vocab):
  bow = torch.zeros(len(vocab))#creation vector with zeros
  tokens = tokenize(data)#spliting data into tokens
  for token in tokens:
    if token in vocab:
      bow[vocab[token]]=1#assing ones based on the possition of word on the vocab data which v
  return bow


In [41]:
bow_vectors = [doctobow(doc, v) for doc in docs]
print(bow_vectors[:3])  # Check the structure


[tensor([0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0.]), tensor([0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 1.]), tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0.])]


In [42]:
## full transformations
vectors = torch.stack(
    [doctobow(doc,v) for doc in docs]
)
labels = torch.tensor(labels,dtype=torch.long)-1

In [43]:
print(vectors.shape)
print(labels.shape)

torch.Size([10, 23])
torch.Size([10])


In [44]:
input_dim = vectors.shape[1]
hidden_dim = 50
output_dim = num_classes# getting our loggits befor scaling them in soft max
class simpleClassifier(nn.Module):
  def __init__(self,input_dim,hidden_dim,output_dim):
    super().__init__()
    self.fc1= nn.Linear(input_dim,hidden_dim)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(hidden_dim,output_dim)
  def forward(self,x):
    x= self.fc1(x)
    x=self.relu(x)
    x=self.fc2(x)
    return x
model=simpleClassifier(input_dim,hidden_dim,output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)


In [45]:
for epoch in range(3000):
  model.train()
  optimizer.zero_grad()
  logits =model(vectors)
  loss =criterion(logits,labels)
  loss.backward()
  optimizer.step()
  if epoch % 100 == 0:
    print(f"Epoch {epoch}, Loss: {loss.item()}")


Epoch 0, Loss: 1.0902403593063354
Epoch 100, Loss: 0.00018075328262057155
Epoch 200, Loss: 8.404947584494948e-05
Epoch 300, Loss: 5.081721974420361e-05
Epoch 400, Loss: 3.435536928009242e-05
Epoch 500, Loss: 2.4854753064573742e-05
Epoch 600, Loss: 1.8834849470295012e-05
Epoch 700, Loss: 1.4817579540249426e-05
Epoch 800, Loss: 1.1944684956688434e-05
Epoch 900, Loss: 9.8227865237277e-06
Epoch 1000, Loss: 7.355181878665462e-06
Epoch 1100, Loss: 5.936601610301295e-06
Epoch 1200, Loss: 4.994853497919394e-06
Epoch 1300, Loss: 4.291522600397002e-06
Epoch 1400, Loss: 3.7431630062201293e-06
Epoch 1500, Loss: 3.2663278943800833e-06
Epoch 1600, Loss: 2.8848596684838412e-06
Epoch 1700, Loss: 2.562995632615639e-06
Epoch 1800, Loss: 2.2649733182333875e-06
Epoch 1900, Loss: 2.086160066028242e-06
Epoch 2000, Loss: 1.8477418279871927e-06
Epoch 2100, Loss: 1.6808492091513472e-06
Epoch 2200, Loss: 1.5139567040023394e-06
Epoch 2300, Loss: 1.358984718535794e-06
Epoch 2400, Loss: 1.2636176052183146e-06
Epoc

In [50]:
#eval stage
test_docs =["listening to music is fun", "am a science geek"]
class_name =["Cinema","music","Science"]
new_doc_vectors = torch.stack(
    [doctobow(doc,v) for doc in test_docs]
)
with torch.no_grad():
  outputs = model(new_doc_vectors)
  _,predicted = torch.max(outputs,1)
for i , new_doc in enumerate(test_docs):
  print(f"Predicted class for '{new_doc}': {class_name[predicted[i]]}")

Predicted class for 'listening to music is fun': music
Predicted class for 'am a science geek': Science
