In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
import random

In [9]:
def tokenize(text, vocab, max_len):
    tokens = [vocab.get(char, vocab['<unk>']) for char in text]
    tokens = tokens[:max_len-1] + [vocab['<eos>']] # Add end-of-sequence token
    return tokens

class TranslationDataset(Dataset):
    def __init__(self, input_texts, target_texts, input_vocab, target_vocab, max_len):
        self.input_texts = [tokenize(text, input_vocab, max_len) for text in input_texts]
        self.target_texts = [tokenize(text, target_vocab, max_len) for text in target_texts]
        self.max_len = max_len

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        target_text = self.target_texts[idx]
        return torch.tensor(input_text, dtype=torch.long), torch.tensor(target_text, dtype=torch.long)

In [222]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size,batch_first=True)

    def forward(self, input):
        embedded = self.embedding(input)
        output, hidden = self.lstm(embedded)
        return output, hidden

    def initHidden(self):
        return (torch.zeros(1, 1, self.hidden_size), torch.zeros(1, 1, self.hidden_size))

In [224]:
# Parameters
input_size = 1000
hidden_size = 256

# Encoder
encoder = Encoder(input_size, hidden_size)
initial_hidden = encoder.initHidden()
print('hidden shape',initial_hidden[0].shape)

# Input data
input_indices = [[1,2]]
input_tensor = torch.tensor(input_indices, dtype=torch.long)

print('input shape',input_tensor.shape)
# Encoding

input_word = input_tensor
output, hidden = encoder(input_word)
initial_hidden = hidden
print(f"Output of encoder shape {output.shape}")
print('hidden',hidden[0].shape)


hidden shape torch.Size([1, 1, 256])
input shape torch.Size([1, 2])
Output of encoder shape torch.Size([1, 2, 256])
hidden torch.Size([1, 1, 256])


In [225]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size,batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        output = self.embedding(input)
        output = torch.relu(output)
        # print('output shape',output.shape)
        output, hidden = self.lstm(output, hidden)
        output = self.softmax(self.out(output))
        return output, hidden

In [226]:
# Parameters
hidden_size = 256
output_size = 1000
start_input = torch.tensor([[0,1,2]], dtype=torch.long)

decoder = Decoder(hidden_size, output_size)

# Sequence generation
num_steps = 10
current_input = start_input
current_hidden = initial_hidden

print('input shape',start_input.shape,'initial_hidden shape',initial_hidden[0].shape)

output, current_hidden = decoder(current_input, current_hidden)
print('output shape',output.shape)

input shape torch.Size([1, 3]) initial_hidden shape torch.Size([1, 1, 256])
output shape torch.Size([1, 3, 1000])


In [229]:

# Parameters
input_size = 2012
hidden_size = 256
encoder = Encoder(input_size, hidden_size)
initial_hidden = encoder.initHidden()
print('hidden shape',initial_hidden[0].shape)
input_indices = [[1,2]]
input_tensor = torch.tensor(input_indices, dtype=torch.long)
input_word = input_tensor
output, hidden = encoder(input_word)
hidden_size = 256
output_size = 1000

decoder = Decoder(hidden_size, output_size)


output1, hidden = encoder(input_word)

target_input = torch.tensor([[0]], dtype=torch.long)
start_input = []
output2, current_hidden = decoder(target_input, hidden)

print(output2.shape)


hidden shape torch.Size([1, 1, 256])
torch.Size([1, 1, 1000])


In [230]:
x = torch.tensor([[1,2,4],
                   [2,7,8],
                 [3,4,7]])
x.topk(1)

torch.return_types.topk(
values=tensor([[4],
        [8],
        [7]]),
indices=tensor([[2],
        [2],
        [2]]))

In [231]:
def tokenize(text):
    # 定义分割符号（空格和标点符号）
    delimiters = [' ', ',', '.', ':', ';', '!', '?', '(', ')', '[', ']', '{', '}', "'", '"']

    # 初始化单词列表和当前单词
    words = []
    current_word = ''

    # 遍历文本中的每个字符
    for char in text:
        if char in delimiters:
            # 如果当前单词不为空，将其添加到单词列表中
            if current_word:
                words.append(current_word)
                current_word = ''
            
            # 如果当前字符是分割符号，将其作为单独的单词添加到单词列表中
            if char != ' ':
                words.append(char)
        else:
            # 如果当前字符不是分割符号，将其追加到当前单词中
            current_word += char

    # 如果最后一个单词不为空，将其添加到单词列表中
    if current_word:
        words.append(current_word)

    return words

In [141]:
en_path = '/Users/liuchu/linear_algebra_strang/translate/中英翻译数据集/train/news-commentary-v13.zh-en.en'
zh_path = '/Users/liuchu/linear_algebra_strang/translate/中英翻译数据集/train/news-commentary-v13.zh-en.zh'

In [153]:
def get_en_sentences():
    arr = []
    with open(en_path,'r') as f:
        count = 0
        for line in f.readlines():
            arr.append(tokenize(line.strip()))
    return arr

In [152]:
def get_zh_sentences():
    arr = []
    with open(zh_path,'r') as f:
        count = 0
        for line in f.readlines():
            arr.append(line.strip().split())
    return arr

In [154]:
en_sentences = get_en_sentences()
zh_sentences = get_zh_sentences()

In [158]:
src_sentences = en_sentences
target_sentences = zh_sentences

In [200]:
src_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
src_set = list(set(word  for sentence in src_sentences for word in sentence))
for i,word in enumerate(src_set):
    src_vocab[word] = 1 + len(src_vocab)

In [193]:
len(src_set)

97323

In [201]:
target_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
target_set = list(set(word  for sentence in target_sentences for word in sentence))
for i,word in enumerate(target_set):
    target_vocab[word] = 1 + len(target_vocab)

In [195]:
len(target_set)

93262

In [202]:
src_index = []
for sen in src_sentences:
    src_index.append([src_vocab[word] for word in sen])

In [203]:
target_index = []
for sen in target_sentences:
    target_index.append([target_vocab[word] for word in sen])

In [259]:
import torch
import torch.nn as nn
import torch.optim as optim


# 模型参数
input_size = len(src_vocab)
output_size = len(target_vocab)
hidden_size = 256

# 初始化模型
encoder = Encoder(input_size, hidden_size)
decoder = Decoder(hidden_size, output_size)

# 损失函数和优化器
criterion = nn.NLLLoss()
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)

# train 函数定义
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length):

    #### input_tensor Batch Seqlen
    # print('input_tensor, target_tensor ....',input_tensor.shape, target_tensor.shape)
    # encoder_hidden = encoder.initHidden()
    # print('encoder_hidden,,,,',encoder_hidden[0].shape)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    # print('target_length',target_length)
    loss = 0
    # for ei in range(input_length):
    encoder_output, encoder_hidden = encoder(input_tensor.unsqueeze(0))
    # print('encoder_output',encoder_output.shape)
    
    decoder_input = torch.tensor([[target_vocab['<sos>']]])
    decoder_hidden = encoder_hidden

    arr = []
    for di in range(target_length+1):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze(0).detach()
        arr.append(decoder_output.squeeze(0))
    t = torch.concatenate([target_tensor,torch.tensor([target_vocab['<eos>']])])
    p = torch.concatenate(arr)
    loss = criterion(p,t)
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length



In [264]:
for e in range(100):
    for i in range(1):
        src_tensor = torch.tensor(src_index[i])
        tgt_tensor = torch.tensor(target_index[i])
        # print(src_tensor,tgt_tensor)
        loss = train(src_tensor, tgt_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=10)
        print(f'Training Loss: {loss:.4f}')

Training Loss: 0.0012
Training Loss: 0.0012
Training Loss: 0.0012
Training Loss: 0.0012
Training Loss: 0.0012
Training Loss: 0.0012
Training Loss: 0.0012
Training Loss: 0.0012
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0011
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training Loss: 0.0010
Training L

In [265]:
# train 函数定义
def predict(encoder, decoder, input_tensor,target_length=100):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    input_length = input_tensor.size(0)
    loss = 0
    encoder_output, encoder_hidden = encoder(input_tensor.unsqueeze(0))    
    decoder_input = torch.tensor([[target_vocab['<sos>']]])
    decoder_hidden = encoder_hidden
    arr = []
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze(0).detach()
        # print(topi)
        if topi.squeeze().item() == target_vocab['<eos>']:
            print('end of sentence....')
            break
        else:
            arr.append(target_set[topi.squeeze().item()])
    print('ans....',arr)

In [267]:
src_index[0]

[62279, 59133, 55216, 45488]

In [266]:
input_tensor = torch.tensor(src_index[0])
predict(encoder, decoder, input_tensor,target_length=100)

end of sentence....
ans.... ['cristaliz', '使团', '凸显', 'Khomri', '使团', '习惯性']


# 版本2 使用batch_len进行实验

In [None]:
def get_total_data(sentences):
    max_en_len = max(len(sentence) for sentence in en_sentences)
    