In [278]:
en_path = '/Users/liuchu/linear_algebra_strang/translate/中英翻译数据集/train/news-commentary-v13.zh-en.en'
zh_path = '/Users/liuchu/linear_algebra_strang/translate/中英翻译数据集/train/news-commentary-v13.zh-en.zh'

In [232]:
def get_en_sentences():
    arr = []
    with open(en_path,'r') as f:
        count = 0
        for line in f.readlines():
            arr.append(line.strip())
    return arr

In [233]:
def get_zh_sentences():
    arr = []
    with open(zh_path,'r') as f:
        count = 0
        for line in f.readlines():
            arr.append(line.strip().replace(' ',''))
    return arr

In [234]:
en_sentences = get_en_sentences()
zh_sentences = get_zh_sentences()

In [235]:
src_sentences = en_sentences
target_sentences = zh_sentences

In [236]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

# 设备配置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 编码器
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

# 解码器
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = nn.functional.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [237]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=10):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS_token是起始符号的标记

    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()

        loss += criterion(decoder_output, target_tensor[di])
        if decoder_input.item() == EOS_token:  # EOS_token是结束符号的标记
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [249]:
def tensorFromSentence(vocab, sentence):
    indices = [vocab[char] for char in sentence if char in vocab]  # Ensure char is in vocab
    indices.append(vocab['<eos>'])  # Append EOS token
    return torch.tensor(indices, dtype=torch.long, device=device).view(-1, 1)

def translate_sentence(sentence, encoder, decoder, input_vocab, output_vocab):
    input_tensor = tensorFromSentence(input_vocab, sentence)
    translated_sentence = translate(encoder, decoder, input_tensor, input_vocab, output_vocab)
    return translated_sentence

def translate(encoder, decoder, input_tensor, input_vocab, output_vocab, max_length=100):
    with torch.no_grad():
        input_length = input_tensor.size(0)
        encoder_hidden = encoder.initHidden()

        # Forward pass through encoder
        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        # Decoder's first input is SOS token
        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = encoder_hidden
        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_word = output_vocab[topi.item()]
                decoded_words.append(decoded_word)

            decoder_input = topi.squeeze().detach()

        return ' '.join(decoded_words)

In [239]:
# 假设语料数据如下，这里我们使用了非常简单的例子
chinese_sentences = zh_sentences[:20]
english_sentences = en_sentences[:20]

# 为了简化问题，我们不使用词嵌入，而是直接根据字母或汉字进行索引
def create_dict(sentences):
    vocab = set(''.join(sentences))
    char_to_index = {char: i+1 for i, char in enumerate(vocab)}  # +1因为我们留出0作为padding
    char_to_index['<pad>'] = 0
    index_to_char = {i: char for char, i in char_to_index.items()}
    return char_to_index, index_to_char

chinese_vocab, chinese_index_vocab = create_dict(chinese_sentences)
chinese_vocab['<sos>'] = max(chinese_vocab.values()) + 1
chinese_index_vocab[chinese_vocab['<sos>']] = '<sos>'

english_vocab, english_index_vocab = create_dict(english_sentences)

# 将句子转换为索引
def sentence_to_indices(sentence, vocab):
    return [vocab[char] for char in sentence]

chinese_data = [sentence_to_indices(sentence, chinese_vocab) for sentence in chinese_sentences]
english_data = [sentence_to_indices(sentence, english_vocab) for sentence in english_sentences]

# 增加开始和结束标记

SOS_token = english_vocab['<sos>'] = max(english_vocab.values()) + 1
EOS_token = english_vocab['<eos>'] = max(english_vocab.values()) + 1
english_index_vocab[SOS_token] = '<sos>'
english_index_vocab[EOS_token] = '<eos>'
english_data = [[SOS_token] + sentence + [EOS_token] for sentence in english_data]

In [241]:
len(english_index_vocab),len(english_vocab)

(61, 61)

In [242]:
len(chinese_vocab)

353

In [243]:
# 将数据转换为torch张量
def to_tensor(data):
    return torch.tensor(data, dtype=torch.long, device=device).view(-1, 1)

chinese_tensors = [to_tensor(data) for data in chinese_data]
english_tensors = [to_tensor(data) for data in english_data]

# 训练参数设置
num_epochs = 100

encoder = Encoder(len(chinese_vocab), 256).to(device)
decoder = Decoder(256, len(english_vocab)).to(device)



In [245]:

learning_rate = 0.001
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()


In [263]:
# 训练循环
for epoch in range(num_epochs+200):
    total_loss = 0
    for ch_tensor, en_tensor in zip(chinese_tensors, english_tensors):
        loss = train(ch_tensor, en_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        total_loss += loss
    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch+1}, Loss: {total_loss / len(chinese_sentences):.4f}')

Epoch 10, Loss: 1.2947
Epoch 20, Loss: 1.2528
Epoch 30, Loss: 1.2216
Epoch 40, Loss: 1.1722
Epoch 50, Loss: 1.1268
Epoch 60, Loss: 1.1587
Epoch 70, Loss: 1.1772
Epoch 80, Loss: 1.0730
Epoch 90, Loss: 1.0060
Epoch 100, Loss: 1.0074
Epoch 110, Loss: 1.0013
Epoch 120, Loss: 0.9668
Epoch 130, Loss: 0.9527
Epoch 140, Loss: 0.8626
Epoch 150, Loss: 0.8662
Epoch 160, Loss: 0.8606
Epoch 170, Loss: 0.8620
Epoch 180, Loss: 0.8199
Epoch 190, Loss: 0.7778
Epoch 200, Loss: 2.5917
Epoch 210, Loss: 2.4215


KeyboardInterrupt: 

In [264]:
zh_sentences[:10]

['1929年还是1989年?',
 '巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。',
 '一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为这两段时期意味着典型的周期性衰退。',
 '如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政府的表现仍然似乎把视目前的情况为是典型的而看见的衰退。',
 '目前的趋势是，要么是过度的克制（欧洲），要么是努力的扩展（美国）。',
 '欧洲在避免债务和捍卫欧元的名义下正变得谨慎，而美国已经在许多方面行动起来，以利用这一理想的时机来实行急需的结构性改革。',
 '然而，作为地域战略学家，无论是从政治意义还是从经济意义上，让我自然想到的年份是1989年。',
 '当然，雷曼兄弟公司的倒闭和柏林墙的倒塌没有任何关系。',
 '事实上，从表面上看，两者似乎是完全是相反的：一个是象征着压抑和人为分裂的柏林墙的倒塌，而另一个是看似坚不可摧的并令人安心的金融资本主义机构的倒塌。',
 '然而，和1989年一样，2008-2009年很可能也能被视为一个划时代的改变，其带来的发人深省的后果将在几十年后仍能让我们感受得到。']

In [265]:
en_sentences[:10]

['1929 or 1989?',
 'PARIS – As the economic crisis deepens and widens, the world has been searching for historical analogies to help us understand what has been happening.',
 'At the start of the crisis, many people likened it to 1982 or 1973, which was reassuring, because both dates refer to classical cyclical downturns.',
 'Today, the mood is much grimmer, with references to 1929 and 1931 beginning to abound, even if some governments continue to behave as if the crisis was more classical than exceptional.',
 'The tendency is either excessive restraint (Europe) or a diffusion of the effort (the United States).',
 'Europe is being cautious in the name of avoiding debt and defending the euro, whereas the US has moved on many fronts in order not to waste an ideal opportunity to implement badly needed structural reforms.',
 'For geo-strategists, however, the year that naturally comes to mind, in both politics and economics, is 1989.',
 'Of course, the fall of the house of Lehman Brothers 

In [277]:
def translate_sentence(sentence):
    indices = sentence_to_indices(sentence, chinese_vocab)
    tensor = to_tensor(indices)
    translation = translate(encoder, decoder, tensor, chinese_index_vocab, english_index_vocab)
    return translation

# 翻译示例
print(translate_sentence('1999年还是1999年?'))

<sos> 1 9 2 9   o r   1 9 8 9 ? <EOS>


# batch size 训练

In [295]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

# 设备配置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 编码器
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)  # Set batch_first to True

    def forward(self, input, hidden):
        embedded = self.embedding(input)  # input: (batch_size, seq_len)
        output, hidden = self.gru(embedded, hidden)  # No need to permute
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

# 解码器
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)  # Set batch_first to True
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input)  # input: (batch_size,seq)
#         output = output.unsqueeze(1)  # Change to (batch_size, 1, hidden_size)
        output = nn.functional.relu(output)
        output, hidden = self.gru(output, hidden)  # No need to permute
        output = self.softmax(self.out(output.squeeze(1)))  # Squeeze to remove the sequence length dimension
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [311]:
import torch

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=10):
    batch_size = input_tensor.size(0)
    input_length = input_tensor.size(1)
    target_length = target_tensor.size(1)

    encoder_hidden = encoder.initHidden(batch_size)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0

    # Encode the input sequence batch
    encoder_outputs, encoder_hidden = encoder(input_tensor, encoder_hidden)

    # Prepare the initial decoder input (start with SOS tokens for each sequence in the batch)
    decoder_input = torch.tensor([[SOS_token] * batch_size], device=device).transpose(0, 1)  # Shape: (batch_size, 1)
    decoder_hidden = encoder_hidden

    # Assuming teacher forcing is not used here. If desired, it can be included with a certain probability.
    for di in range(target_length):
#         print(decoder_input.shape)
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.detach()  # Prepare next input
#         print(decoder_output.shape,target_tensor[:, di].shape,decoder_input.shape)

        loss += criterion(decoder_output, target_tensor[:, di])

        # Optionally, stop when all sequences in the batch reach EOS token
        if (decoder_input == EOS_token).all():
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / (target_length * batch_size)  # Normalize loss by total number of elements

In [312]:
import torch
import torch.nn as nn
import torch.optim as optim

# 设备配置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
input_size = 10  # Example vocabulary size for encoder
output_size = 10  # Example vocabulary size for decoder
hidden_size = 256
batch_size = 5
max_length = 7  # Maximum length of input and output sequences
epochs = 10

# Special tokens
SOS_token = 0  # Start-of-sequence token
EOS_token = 1  # End-of-sequence token

# Instantiate models
encoder = Encoder(input_size, hidden_size).to(device)
decoder = Decoder(hidden_size, output_size).to(device)

# Optimizers
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.01)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.01)

# Loss function
criterion = nn.NLLLoss()

In [313]:
def generate_fake_batch(batch_size, max_length, vocabulary_size):
    """ Generate a random batch of sequences for training. """
    # Random sequences of random lengths
    sequences = torch.randint(2, vocabulary_size, (batch_size, max_length)).to(device)
    return sequences

# Example of generating a batch
input_tensor = generate_fake_batch(batch_size, max_length, input_size)
target_tensor = generate_fake_batch(batch_size, max_length, output_size)

In [314]:
input_tensor

tensor([[2, 2, 4, 9, 8, 2, 7],
        [6, 6, 7, 6, 8, 2, 5],
        [9, 5, 6, 7, 3, 4, 4],
        [4, 5, 5, 5, 7, 7, 5],
        [3, 5, 6, 2, 5, 2, 2]])

In [315]:
def train_epoch(encoder, decoder, input_tensor, target_tensor):
    loss_total = 0  # Track loss

    for epoch in range(epochs):
        # Generate new data for each epoch
        input_tensor = generate_fake_batch(batch_size, max_length, input_size)
        target_tensor = generate_fake_batch(batch_size, max_length, output_size)

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length)
        loss_total += loss

        print(f'Epoch {epoch+1}, Loss: {loss:.4f}')

    print(f'Average Loss: {loss_total / epochs:.4f}')

# Call the training function
train_epoch(encoder, decoder, input_tensor, target_tensor)

Epoch 1, Loss: 0.4680
Epoch 2, Loss: 0.5614
Epoch 3, Loss: 0.4901
Epoch 4, Loss: 0.5791
Epoch 5, Loss: 0.5343
Epoch 6, Loss: 0.4939
Epoch 7, Loss: 0.4654
Epoch 8, Loss: 0.4840
Epoch 9, Loss: 0.4593
Epoch 10, Loss: 0.4287
Average Loss: 0.4964
