<a href="https://colab.research.google.com/github/bakjjhh/cbow_chord_gen/blob/main/cbow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# CBOW model definition
class CBOW(nn.Module):
    """Continuous bag-of-words model: predict a target token from its context.

    The embeddings of the context tokens are averaged, passed through one
    hidden ReLU layer and projected to log-probabilities over the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output classes.
        embedding_dim: Size of each embedding vector.
        context_size: Kept on the instance for reference only; the forward
            pass averages over however many context tokens it receives.
        hidden_dim: Width of the hidden layer. Defaults to 128, the value
            that used to be hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        super().__init__()
        self.context_size = context_size
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: Long tensor of token indices shaped (batch, context), or
                (context,) for a single unbatched sample.

        Returns:
            Tensor shaped (batch, vocab_size), or (vocab_size,) for an
            unbatched input.
        """
        embeds = self.embeddings(inputs)  # (..., context, embedding_dim)
        # Reduce over the context axis counted from the end so that batched
        # and unbatched inputs are both handled.
        embeds_mean = embeds.mean(dim=-2)  # (..., embedding_dim)
        hidden = torch.relu(self.linear1(embeds_mean))
        logits = self.linear2(hidden)
        return torch.log_softmax(logits, dim=-1)

# Prepare the training data
CONTEXT_SIZE = 2  # number of context words taken on each side of the target
raw_text = "I am learning PyTorch for natural language processing".split()

# Each sample is (context words, target word): the CONTEXT_SIZE words before
# and after position i form the context, the word at i is the target.
# The window is derived from CONTEXT_SIZE so changing the constant actually
# changes the generated samples.
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = raw_text[i - CONTEXT_SIZE:i] + raw_text[i + 1:i + CONTEXT_SIZE + 1]
    target = raw_text[i]
    data.append((context, target))

# Build the word <-> index dictionaries
# The unique words are sorted because the iteration order of a set of strings
# changes between interpreter runs (string hash randomization); sorting makes
# the word -> index assignment reproducible. `vocab` stays usable with len()
# and enumerate(), and enumerate(vocab) yields the same indices as word_to_idx.
vocab = sorted(set(raw_text))
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))

# Instantiate the model
embedding_dim = 100
vocab_size = len(vocab)
model = CBOW(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    context_size=CONTEXT_SIZE,
)

# Loss function and optimizer: the model emits log-probabilities, which is
# the input NLLLoss is applied to here; plain SGD updates all parameters.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)

# Training loop
# The word -> index conversion is identical in every epoch, so the input and
# target tensors are built once up front instead of once per sample per epoch.
samples = [
    (
        # Context indices with a leading batch dimension of 1
        torch.tensor([word_to_idx[w] for w in context], dtype=torch.long).unsqueeze(0),
        torch.tensor([word_to_idx[target]], dtype=torch.long),
    )
    for context, target in data
]

for epoch in range(100):
    total_loss = 0
    for context_idxs, target_idx in samples:
        # Forward pass and loss; gradients from the previous step are cleared first
        optimizer.zero_grad()
        log_probs = model(context_idxs)
        loss = loss_function(log_probs, target_idx)

        # Backpropagation and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print('Epoch:', epoch, 'Loss:', total_loss)

# Print the learned embedding vectors
# detach() yields a tensor that shares the weights but is cut from the
# autograd graph; it is the recommended replacement for `.data`.
embedding_weights = model.embeddings.weight.detach()
# Rows are looked up through word_to_idx, the mapping used during training,
# rather than relying on a second enumeration of vocab matching the first.
for word, idx in word_to_idx.items():
    print('Word:', word, 'Embedding:', embedding_weights[idx])


Epoch: 0 Loss: 8.326865077018738
Epoch: 1 Loss: 8.290189266204834
Epoch: 2 Loss: 8.253683805465698
Epoch: 3 Loss: 8.217411637306213
Epoch: 4 Loss: 8.181631088256836
Epoch: 5 Loss: 8.145939707756042
Epoch: 6 Loss: 8.110459089279175
Epoch: 7 Loss: 8.07521903514862
Epoch: 8 Loss: 8.040315747261047
Epoch: 9 Loss: 8.005446195602417
Epoch: 10 Loss: 7.970909237861633
Epoch: 11 Loss: 7.936586380004883
Epoch: 12 Loss: 7.902398228645325
Epoch: 13 Loss: 7.868391394615173
Epoch: 14 Loss: 7.834531664848328
Epoch: 15 Loss: 7.80071222782135
Epoch: 16 Loss: 7.7666285037994385
Epoch: 17 Loss: 7.732800245285034
Epoch: 18 Loss: 7.699065923690796
Epoch: 19 Loss: 7.665529251098633
Epoch: 20 Loss: 7.632246971130371
Epoch: 21 Loss: 7.5990952253341675
Epoch: 22 Loss: 7.56603741645813
Epoch: 23 Loss: 7.5330469608306885
Epoch: 24 Loss: 7.500448942184448
Epoch: 25 Loss: 7.467903017997742
Epoch: 26 Loss: 7.435391068458557
Epoch: 27 Loss: 7.403200268745422
Epoch: 28 Loss: 7.3711360692977905
Epoch: 29 Loss: 7.33920