In [1]:
# Read the whole parallel-text file, then cut it into pieces at every "."
# (each piece is later treated as one "line" of the corpus).
with open("fr-en-small.txt", mode="r", encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()

corpus = raw_text.split(".")

In [2]:
class Vocab:
    """Two-way mapping between words and consecutive integer ids.

    Ids are handed out in insertion order, starting at 0.
    """

    def __init__(self):
        # idx2word[i] is the word whose id is i; word2idx is the inverse lookup.
        self.idx2word = []
        self.word2idx = {}

    def add(self, word):
        """Register `word` under the next free id; known words are left untouched."""
        if word in self.word2idx:
            return
        next_id = len(self.word2idx)
        self.word2idx[word] = next_id
        self.idx2word.append(word)

    def __len__(self):
        """Return the number of distinct words stored."""
        return len(self.word2idx)

In [3]:
vocab = Vocab()

# Every whitespace-separated token of every corpus entry becomes a vocabulary item.
for token in (tok for sentence in corpus for tok in sentence.split()):
    vocab.add(token)

print(f"Vocab size: {len(vocab)}")

Vocab size: 73


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class SkipGramDataSet(Dataset):
    """Dataset of (center_id, context_id) pairs for skip-gram training.

    For every word of every corpus entry, one pair is produced for each
    neighbour that lies at most `context_size` positions to its left or right
    within the same entry.
    """

    def __init__(self, corpus, word2idx, context_size=2):
        """Build all training pairs eagerly.

        Args:
            corpus: iterable of strings; each string is split on whitespace.
            word2idx: mapping from word to integer id; a word missing from it
                raises KeyError.
            context_size: number of neighbours considered on each side.
        """
        self.data = []

        for sentence in corpus:
            token_ids = [word2idx[token] for token in sentence.split()]
            n_tokens = len(token_ids)

            for pos, center_id in enumerate(token_ids):
                # Clamp the window to the bounds of the current sentence.
                window_start = max(0, pos - context_size)
                window_stop = min(n_tokens, pos + context_size + 1)

                for ctx_pos in range(window_start, window_stop):
                    if ctx_pos == pos:
                        continue  # the center word is not its own context
                    self.data.append((center_id, token_ids[ctx_pos]))

    def __len__(self):
        """Return the number of (center, context) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the `idx`-th pair as a tuple of two ints."""
        return self.data[idx]

In [5]:
batch_size = 4

# Build every (center, context) pair once, then serve them in shuffled mini-batches.
dataset = SkipGramDataSet(
    corpus=corpus,
    word2idx=vocab.word2idx,
    context_size=2,
)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=False,  # keep the final, possibly smaller, batch
)

print(f"Dataset size: {len(dataset)}")
print(f"DataLoader size: {len(dataloader)}")

Dataset size: 368
DataLoader size: 92


In [6]:
import torch.nn as nn

class SkipGramModel(nn.Module):
    """Skip-gram model: embeds a center word and scores every vocabulary word as its context.

    Args:
        vocab_size: number of distinct words (size of the input and output spaces).
        embedding_size: dimensionality of the learned word vectors.
    """

    def __init__(self, vocab_size: int, embedding_size: int):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc = nn.Linear(embedding_size, vocab_size)
        # This must really be a *log*-softmax. Plain softmax probabilities fed to
        # nn.CrossEntropyLoss get normalised a second time, which squashes the
        # gradients and keeps the loss stuck near log(vocab_size).
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input_word: torch.Tensor) -> torch.Tensor:
        """Return log-probabilities over the vocabulary for each input word id.

        Args:
            input_word: integer tensor of word ids.

        Returns:
            Tensor with one trailing dimension of size `vocab_size` appended to
            the input shape, holding log-probabilities. It can be passed to
            nn.NLLLoss, and also to nn.CrossEntropyLoss: log-softmax is
            idempotent, so the loss's internal log-softmax leaves it unchanged.
        """
        word_embedding = self.embedding(input_word)
        output = self.fc(word_embedding)
        return self.log_softmax(output)

In [7]:
embedding_size = 32

# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = SkipGramModel(len(vocab), embedding_size)
model = model.to(device)

optim = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [8]:
def train_model(model, dataloader, optim, criterion):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: module mapping a batch of inputs to predictions.
        dataloader: iterable yielding `(x, y)` tensor pairs.
        optim: optimizer that updates `model`'s parameters.
        criterion: callable `criterion(y_hat, y)` returning a scalar loss tensor.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: if `dataloader` yields no batches.
    """
    # Move batches to wherever the model lives instead of relying on a
    # notebook-level `device` global; a parameter-free model leaves them as-is.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.
    num_batches = 0

    for x, y in dataloader:
        if target_device is not None:
            x = x.to(target_device)
            y = y.to(target_device)

        optim.zero_grad()
        y_hat = model(x)
        loss = criterion(y_hat, y)

        loss.backward()
        optim.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("dataloader produced no batches")

    # Divide by the batch count, not the last batch index: the index is one
    # short, and is zero when there is a single batch.
    return total_loss / num_batches

In [None]:
import matplotlib.pyplot as plt
epoch = 50  # number of passes over the dataloader
train_losses = []  # value returned by train_model for each epoch, in order

for i in range(epoch):
    train_loss = train_model(model, dataloader, optim, criterion)
    train_losses.append(train_loss)

# Epochs are numbered from 1 on the x-axis.
plt.plot(range(1, epoch + 1), train_losses, label='Training Loss')

plt.xlabel('Epoch')
plt.ylabel('Loss')
# Only the training loss is recorded by the loop above, so the title says just that.
plt.title('Training Loss over Epochs')
plt.legend()

# plt.show has to be called; a bare `plt.show` only evaluates to the function object.
plt.show()

Duplicate key in file WindowsPath('D:/anaconda/lib/site-packages/matplotlib/mpl-data/matplotlibrc'), line 758 ('font.family :sans-serif')
Duplicate key in file WindowsPath('D:/anaconda/lib/site-packages/matplotlib/mpl-data/matplotlibrc'), line 759 ('font.sans-serif :SimHei')
Duplicate key in file WindowsPath('D:/anaconda/lib/site-packages/matplotlib/mpl-data/matplotlibrc'), line 760 ('axes.unicode_minus :False')
