In [1]:
#preparing the data
from datasets import load_dataset
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

In [2]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [3]:
split_datasets = raw_datasets['train'].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [4]:
split_datasets["validation"] = split_datasets.pop("test")
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [5]:
split_datasets["train"][1]

{'id': '152754',
 'translation': {'en': 'Default to expanded threads',
  'fr': 'Par défaut, développer les fils de discussion'}}

In [6]:
#processing the data
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")



In [7]:
tokenizer.vocab_size

59514

In [8]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence, text_target=fr_sentence)
inputs

{'input_ids': [47591, 12, 9842, 19634, 9, 0], 'attention_mask': [1, 1, 1, 1, 1, 1], 'labels': [577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]}

In [9]:
max_length = 128

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [10]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

In [11]:
tokenized_datasets['train'][0]['input_ids']

[34378,
 226,
 5783,
 32,
 200,
 12,
 3647,
 4,
 1223,
 1628,
 117,
 4923,
 23608,
 3,
 1789,
 2942,
 20059,
 301,
 548,
 301,
 331,
 30,
 117,
 4923,
 12,
 4,
 1528,
 668,
 3,
 5734,
 212,
 9319,
 30,
 4,
 4923,
 57,
 5487,
 30,
 4,
 6,
 32712,
 25,
 7243,
 1160,
 12,
 621,
 42,
 4,
 1156,
 3009,
 3,
 0]

In [12]:
tokenized_datasets['train'][1]['input_ids']

[47591, 12, 9842, 19634, 9, 0]

In [13]:
from torch.utils.data import DataLoader

In [14]:
import torch

class MyDataCollatorForSeq2Seq():
    """
    convert the input and label into torch.Tensor
    right shift the label using it to the input of decoder
    
    Parameters
    ----------
    pad_token_id: int
        value of id : -100
        
    max_length: int
        max_length of the sentence
        
    Returns: Dict
    -------
        keys: 'input_ids', 'input_attention_mask','decoder_input_ids','labels'
    """
    def __init__(self, pad_token_id=-100, max_length=None):
        self.pad_token_id = pad_token_id
        self.max_length = max_length
        
    def __call__(self, batch):
        # 将输入 ID 和标签 ID 转换为 PyTorch 张量
        input_ids = torch.tensor(batch["input_ids"])
        target_ids = torch.tensor(batch["labels"])
        input_attention_mask = torch.tensor(batch["attention_mask"])
        
        # 将标签 ID 向右位移一个位置，用于解码器的输入
        decoder_input_ids = target_ids[:, :-1].contiguous()
        
        # 准备标签 ID，将填充部分设置为 -100
        labels = target_ids[:, 1:].clone()
        labels[target_ids[:, 1:] == self.pad_token_id] = self.pad_token_id
        
         # 截断输入和标签，如果超出了最大长度
        if self.max_length is not None:
            input_ids = input_ids[:, :self.max_length]
            input_attention_mask = input_attention_mask[:, :self.max_length]
            decoder_input_ids = decoder_input_ids[:, :self.max_length]
            labels = labels[:, :self.max_length]

        return {
            'input_ids': input_ids,
            'input_attention_mask': input_attention_mask,
            'decoder_input_ids': decoder_input_ids,
            'labels': labels,
        }

    

In [20]:
#define my model
import torch
import torch.nn as nn
import torch.optim as optim
import random

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)

        return hidden, cell


class Decoder(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        # is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs)

        # predictions shape: (1, N, length_target_vocabulary) to send it to
        # loss function we want it to be (N, length_target_vocabulary) so we're
        # just gonna remove the first dim
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = tokenizer.vocab_size

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to. This was a long comment.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs
    
    def prepare_decoder_input_ids_from_labels(self, labels):
        """
        Prepare decoder input IDs from labels by right-shifting the labels.
        Args:
            labels (Tensor): Tensor containing label IDs.

        Returns:
            Tensor: Decoder input IDs with right-shifted labels.
        """
        # Right-shift labels to prepare decoder input IDs
        decoder_input_ids = labels.new_zeros(labels.shape)
        decoder_input_ids[:, 1:] = labels[:, :-1]

        return decoder_input_ids

    
# Training hyperparameters
num_epochs = 100
learning_rate = 0.001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = tokenizer.vocab_size
input_size_decoder = tokenizer.vocab_size
output_size = tokenizer.vocab_size
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)

In [22]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer,model=model)

In [23]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [24]:
batch['labels']

tensor([[  577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
           550,  7032,  5821,  7907, 12649,     0]])

In [25]:
batch['decoder_input_ids']

tensor([[    0,   577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,
          -100,  -100,  -100,  -100,  -100,  -100],
        [    0,  1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,
           817,   550,  7032,  5821,  7907, 12649]])

In [26]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]
[1211, 3, 49, 9409, 1211, 3, 29140, 817, 3124, 817, 550, 7032, 5821, 7907, 12649, 0]


In [27]:
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)


In [29]:
batch = next(iter(train_dataloader))
batch_inp = batch['input_ids']
batch_trg = batch['labels']

In [31]:
batch_inp.shape,batch_trg.shape

(torch.Size([8, 28]), torch.Size([8, 38]))

In [32]:
from transformers import AdamW


optimizer = AdamW(model.parameters(), lr=2e-5)



In [33]:
output = model(batch_inp.permute(1,0),batch_trg)

IndexError: index out of range in self