In [1]:
import os
import sys
import transformers
import torch
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('mps')

## GPT-2

Source: [https://huggingface.co/openai-community/gpt2](https://huggingface.co/openai-community/gpt2)



In [3]:
# Hugging Face Hub id of the pretrained checkpoint used for both tokenizer and model.
model_checkpoint = 'openai-community/gpt2'

# Fine-tuning dataset

Source: [https://huggingface.co/datasets/wmt/wmt16/viewer/de-en](https://huggingface.co/datasets/wmt/wmt16/viewer/de-en)

In [4]:
# Load the German-English configuration of WMT16 (train / validation / test splits).
raw_datasets = load_dataset(path="wmt16", name="de-en")


In [5]:
from datasets import DatasetDict

# Optional cap on the number of rows kept in EVERY split.
# None keeps the full dataset (the train split alone has ~4.5M rows); set a
# small integer such as 100 for quick smoke-test runs of the whole notebook.
SUBSET_SIZE = None

if SUBSET_SIZE is not None:
    raw_datasets = DatasetDict({
        # min() guards against splits that are smaller than the requested cap.
        split_name: split.select(range(min(SUBSET_SIZE, split.num_rows)))
        for split_name, split in raw_datasets.items()
    })

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4548885
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2169
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2999
    })
})

In [6]:
# Show one German/English pair from the training split as a sanity check.
sample = raw_datasets['train'][1]
sample_pair = sample['translation']
print("Sample of training data")
for lang_code in ("de", "en"):
    print(lang_code + "：", sample_pair[lang_code])

Sample of training data
de： Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.
en： I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.


# Preprocessing the data


In [7]:
# Tokenizer matching the pretrained checkpoint selected above.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

In [8]:
# Tokenize two toy sentences to inspect the structure of the tokenizer output.
demo_sentences = [
    "Hello, this is a sentence!",
    "This is another sentence.",
]
tokenizer(demo_sentences)

{'input_ids': [[15496, 11, 428, 318, 257, 6827, 0], [1212, 318, 1194, 6827, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [9]:
# Language codes used as keys inside each "translation" record.
source_lang, target_lang = "de", "en"

# Fixed sequence lengths (in tokens) for the source and target sides.
max_input_length = max_target_length = 128

# The tokenizer is given a padding token by reusing its end-of-sequence token,
# which is required for padding='max_length' in the preprocessing function.
tokenizer.pad_token = tokenizer.eos_token
# Label value excluded from the loss (PyTorch cross-entropy's default ignore_index).
LABEL_IGNORE_INDEX = -100


def preprocess_function(examples):
    """Tokenize a batch of translation pairs into fixed-length inputs and labels.

    Args:
        examples: Batch mapping whose "translation" entry is a list of dicts,
            one per example, keyed by language code (``source_lang`` and
            ``target_lang`` are read from the notebook globals).

    Returns:
        The tokenizer output for the source sentences (padded/truncated to
        ``max_input_length``) with an extra "labels" entry: the target token
        ids padded/truncated to ``max_target_length``, where every padding
        position is replaced by ``LABEL_IGNORE_INDEX`` so it does not
        contribute to the loss. Map that value back to a real token id
        before decoding labels.

    NOTE(review): the labels come from a separately tokenized target sentence,
    not from ``input_ids``. For a decoder-only language model the loss compares
    labels position-by-position with the predictions for the input sequence --
    confirm this is the intended training objective.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    model_inputs = tokenizer(
        inputs, max_length=max_input_length, padding="max_length", truncation=True
    )
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    target_encoding = tokenizer(
        text_target=targets, max_length=max_target_length, padding="max_length", truncation=True
    )

    # Use the attention mask (not the token id) to find padding: in this
    # notebook the pad token shares its id with the EOS token.
    model_inputs["labels"] = [
        [token_id if is_real else LABEL_IGNORE_INDEX for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(target_encoding["input_ids"], target_encoding["attention_mask"])
    ]
    return model_inputs

In [10]:
# Run the preprocessing on two training examples to inspect its output.
demo_batch = raw_datasets["train"][10:12]
preprocess_function(demo_batch)



{'input_ids': [[54, 11033, 260, 1658, 3550, 368, 44483, 11, 266, 1697, 48931, 11, 39313, 84, 1736, 11033, 82, 738, 259, 11, 4587, 1736, 11033, 82, 738, 259, 18042, 20872, 28143, 287, 304, 7749, 3059, 260, 571, 268, 288, 292, 15585, 559, 1142, 748, 2547, 2543, 658, 1976, 388, 308, 413, 282, 912, 41763, 309, 375, 18042, 2332, 35906, 50210, 77, 4131, 44949, 3318, 290, 14226, 347, 25151, 70, 1142, 18042, 20872, 28143, 6184, 120, 527, 20124, 45542, 3318, 264, 494, 257, 1648, 585, 1142, 266, 9116, 4372, 268, 11, 477, 274, 287, 1312, 71, 2787, 13685, 11033, 14785, 2876, 15631, 68, 1976, 84, 6278, 11, 23781, 299, 620, 304, 7274, 23018, 677, 831, 406, 9101, 9854, 10564, 263, 384, 11840, 5513, 86, 959, 9324, 49465, 1976, 84, 6522, 831], [33186, 11, 2332, 81, 13922, 11, 220, 488, 2853, 365, 11, 12379, 39683, 304, 500, 4587, 433, 10045, 18362, 288, 2575, 8717, 281, 469, 1671, 19725, 318, 83, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256

In [11]:
# Tokenize every split, handing examples to preprocess_function in batches.
tokenized_datasets = raw_datasets.map(
    function=preprocess_function,
    batched=True,
)

In [12]:
from torch.utils.data import DataLoader
from datasets import load_metric
from transformers import DataCollatorWithPadding

# Number of examples per batch for both training and evaluation.
BATCH_SIZE = 8

# Drop the raw text columns so that only tensor-convertible features remain.
# Only columns that are still present are removed, which keeps this cell safe
# to re-run (remove_columns raises when asked to drop a missing column).
present_columns = set(tokenized_datasets["train"].column_names)
stale_columns = [
    name for name in raw_datasets["train"].column_names if name in present_columns
]
if stale_columns:
    tokenized_datasets = tokenized_datasets.remove_columns(stale_columns)

# Collates lists of examples into batched tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

In [13]:
# Inspect the features that remain in the tokenized training split.
tokenized_datasets["train"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4548885
})

In [14]:
# Load the pretrained GPT-2 weights together with the language-modelling head.
model = GPT2LMHeadModel.from_pretrained(
    model_checkpoint,
)

In [15]:
# Move the model parameters to the selected device; the module repr is the cell output.
model = model.to(device)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [16]:
from torch.optim import AdamW
from transformers import get_scheduler

# `transformers.AdamW` is deprecated (and absent from recent releases), so the
# PyTorch implementation is used instead. eps and weight_decay are pinned to the
# defaults of the old transformers class so the optimisation is unchanged.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3
# Total optimizer steps = batches per epoch * number of epochs.
num_training_steps = num_epochs * len(train_dataloader)

# The scheduler adjusts the learning rate of `optimizer`: linear decay over
# the whole run, with no warm-up phase.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)




In [17]:
from tqdm import tqdm

# from_pretrained() returns the model in evaluation mode (dropout disabled);
# switch to training mode explicitly before optimising.
model.train()

for epoch in range(num_epochs):
    for batch in tqdm(train_dataloader):
        # Every tensor of the batch must live on the same device as the model.
        batch = {key: tensor.to(device) for key, tensor in batch.items()}
        # The model computes the loss itself because the batch contains "labels".
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Reset gradients so they do not accumulate into the next step.
        optimizer.zero_grad()


  0%|          | 169/568611 [05:32<310:28:41,  1.97s/it]


KeyboardInterrupt: 

In [None]:
from datasets import load_metric

# NOTE(review): `datasets.load_metric` is deprecated in favour of the separate
# `evaluate` package; kept here to avoid adding a new dependency.
metric = load_metric("sacrebleu")

# Put the model in evaluation mode (disables dropout).
model.eval()

# Decoded predictions and references collected over the whole evaluation set.
predictions_list = []
references_list = []

for batch in eval_dataloader:
    # Move the batch to the same device as the model.
    batch = {k: v.to(device) for k, v in batch.items()}

    # Labels are only needed as references, so keep them out of the forward pass.
    labels = batch.pop("labels")

    # Inference only: no gradients required.
    with torch.no_grad():
        outputs = model(**batch)

    # Most likely token at every position of the input sequence.
    # NOTE(review): this is position-wise prediction on the given input, not
    # free-running generation -- confirm that this is the quantity to score.
    predictions = torch.argmax(outputs.logits, dim=-1)

    # Negative label ids (e.g. -100 for positions ignored by the loss) cannot be
    # decoded; map them to the pad token, which skip_special_tokens drops.
    labels = labels.masked_fill(labels < 0, tokenizer.pad_token_id)

    # Convert token ids back to text.
    predictions_str = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions.tolist()]
    references_str = [tokenizer.decode(ref, skip_special_tokens=True) for ref in labels.tolist()]

    predictions_list.extend(predictions_str)
    references_list.extend(references_str)

    # sacrebleu scores text: one string per prediction and a list of reference
    # strings per prediction (a single reference each here).
    metric.add_batch(
        predictions=predictions_str,
        references=[[reference] for reference in references_str],
    )

# Compute the corpus-level BLEU score.
bleu_score = metric.compute()
print("BLEU score:", bleu_score)
