In [2]:
!pip install gensim torch

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [27]:
from gensim.utils import simple_preprocess
import torch
import torch.nn as nn
import torch.optim as optim

corpus = [
    "Deep learning is a core subject of artificial intelligence",
    "Machine learning is a subbranch of deep learning",
    "Convolutional Neural Network (CNN) is a basic deep neural network in deep learning",
    "Alex and Visual Geometry Group (VGG) neural networks are pre trained deep neural networks",
    "Deep residual network is used in image recognition"
]

tokenized_corpus = [simple_preprocess(line) for line in corpus]
print("Tokenized Corpus:\n", tokenized_corpus)

words = [word for sentence in tokenized_corpus for word in sentence]
vocab = set(words)
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(vocab)
print("\nVocab:\n", vocab)
print("\nVocab Size:", vocab_size)

class WordEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(WordEmbeddingModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        output = self.linear(embeds)
        return output

vector_size = 300
model = WordEmbeddingModel(vocab_size, vector_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

training_data = []
for sentence in tokenized_corpus:
    for i, target_word in enumerate(sentence):
        target_idx = word2idx[target_word]
        context_idx = word2idx[target_word]
        training_data.append((context_idx, target_idx))

epochs = 100
for epoch in range(epochs):
    total_loss = 0
    for context_idx, target_idx in training_data:
        context_tensor = torch.LongTensor([context_idx])
        target_tensor = torch.LongTensor([target_idx])
        outputs = model(context_tensor)
        loss = criterion(outputs, target_tensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch + 1) % 10 == 0:
        print(f'\n Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(training_data):.4f}')

word_embeddings = model.embeddings.weight.data
print("\nWord Embeddings:")
for word in ["deep", "learning", "intelligence", "network"]:
    if word in word2idx:
        print(f"{word}: {word_embeddings[word2idx[word]].numpy()[:8]} ...")

Tokenized Corpus:
 [['deep', 'learning', 'is', 'core', 'subject', 'of', 'artificial', 'intelligence'], ['machine', 'learning', 'is', 'subbranch', 'of', 'deep', 'learning'], ['convolutional', 'neural', 'network', 'cnn', 'is', 'basic', 'deep', 'neural', 'network', 'in', 'deep', 'learning'], ['alex', 'and', 'visual', 'geometry', 'group', 'vgg', 'neural', 'networks', 'are', 'pre', 'trained', 'deep', 'neural', 'networks'], ['deep', 'residual', 'network', 'is', 'used', 'in', 'image', 'recognition']]

Vocab:
 {'in', 'image', 'cnn', 'machine', 'is', 'basic', 'recognition', 'residual', 'networks', 'alex', 'geometry', 'vgg', 'are', 'intelligence', 'and', 'trained', 'used', 'learning', 'convolutional', 'visual', 'neural', 'deep', 'network', 'core', 'pre', 'of', 'subject', 'group', 'artificial', 'subbranch'}

Vocab Size: 30

 Epoch [10/100], Loss: 0.0263

 Epoch [20/100], Loss: 0.0077

 Epoch [30/100], Loss: 0.0037

 Epoch [40/100], Loss: 0.0022

 Epoch [50/100], Loss: 0.0014

 Epoch [60/100], Los