In [5]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

In [6]:
# Load the instruction dataset that was previously saved to disk with `datasets`.
# NOTE(review): the absolute path is specific to the authoring machine — adjust before re-running.
ds = Dataset.load_from_disk("/root/autodl-tmp/weitiao/data/alpaca_data_zh")
# Display the dataset summary (column names and row count).
ds

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 26858
})

In [3]:

# Load the tokenizer from a local Llama-2-7b checkpoint directory.
# NOTE(review): absolute, machine-specific path — adjust before re-running.
tokenizer = AutoTokenizer.from_pretrained("/root/autodl-tmp/modelscope/Llama-2-7b-ms")
# Display the tokenizer's configuration (padding side, special tokens, pad token).
tokenizer

LlamaTokenizerFast(name_or_path='/root/autodl-tmp/modelscope/Llama-2-7b-ms', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False)

In [7]:
# Probe: tokenize a single Chinese character without BOS/EOS.
# The recorded output contains four ids for this one character.
tokenizer("沅", add_special_tokens=False) 

{'input_ids': [29871, 233, 181, 136], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [11]:
# Probe: EOS string appended after a trailing space.
# In the recorded output the EOS string maps to the single id 2.
tokenizer("abc " + tokenizer.eos_token)

{'input_ids': [1, 25638, 2], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [10]:
# Probe: tokenize the EOS string on its own.
# Recorded output is [1, 2]; the 2 is the EOS id (the leading 1 is presumably the auto-added BOS — confirm).
tokenizer(tokenizer.eos_token)

{'input_ids': [1, 2], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}

In [8]:
# Probe: EOS string appended directly to the text, with no separating space.
# In the recorded output the EOS string is NOT recognised as id 2; it is split into
# several ordinary tokens instead, so concatenating the EOS text is unreliable.
tokenizer("abc" + tokenizer.eos_token)

{'input_ids': [1, 25638, 829, 29879, 29958], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [9]:
# Re-display the tokenizer configuration (padding side, special tokens, pad token).
tokenizer

LlamaTokenizerFast(name_or_path='/root/autodl-tmp/modelscope/Llama-2-7b-ms', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False)

In [None]:
# Switch the tokenizer to pad on the right-hand side of each sequence.
# NOTE(review): this cell has no execution count, and the tokenizer repr recorded in the
# following cell still reports padding_side='left' — make sure this cell is actually run
# (in order) before any batches are padded with this tokenizer.
tokenizer.padding_side = "right" 

In [8]:
# Display the tokenizer to check its current padding_side and pad_token settings.
tokenizer

LlamaTokenizerFast(name_or_path='/root/autodl-tmp/modelscope/Llama-2-7b-ms', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False)

In [10]:
# Pad with the EOS token. Using the tokenizer's own attribute instead of the literal 2
# keeps this correct if a checkpoint with a different vocabulary is loaded
# (for this tokenizer the EOS id is 2, so the value is unchanged).
tokenizer.pad_token_id = tokenizer.eos_token_id

In [11]:
def process_func(example, max_length=400):
    """Convert one instruction record into causal-LM training features.

    The prompt is built from ``example["instruction"]`` (prefixed with
    ``"Human: "``) and ``example["input"]`` joined by a newline, stripped, and
    followed by the ``"Assistant: "`` marker. The target is
    ``example["output"]`` followed by the EOS token id.

    Args:
        example: Mapping with string values under the keys ``"instruction"``,
            ``"input"`` and ``"output"``.
        max_length: Maximum number of tokens kept per sample; longer samples
            are cut from the right. Defaults to 400.

    Returns:
        dict with three equally long lists:
        ``"input_ids"`` (prompt ids + response ids + EOS id),
        ``"attention_mask"`` (tokenizer masks + a 1 for the EOS position) and
        ``"labels"`` (-100 for every prompt position, then response ids + EOS id).

    Relies on the module-level ``tokenizer``.
    """
    prompt_text = "\n".join(["Human: " + example["instruction"], example["input"]]).strip() + "\n\nAssistant: "
    # Special tokens are disabled so no BOS/EOS is inserted between prompt and response.
    instruction = tokenizer(prompt_text, add_special_tokens=False)
    response = tokenizer(example["output"], add_special_tokens=False)

    # EOS (</s>) is appended as an id rather than as text: the probe cells earlier in this
    # notebook show that EOS text glued directly onto a word is split into ordinary tokens.
    eos = [tokenizer.eos_token_id]
    input_ids = instruction["input_ids"] + response["input_ids"] + eos
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # -100 on the prompt positions, so only the response and EOS carry target ids.
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + eos

    # Slicing is a no-op for samples already within the limit, so no length check is needed.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

In [12]:
# Apply process_func to every record and drop the original text columns,
# leaving only the features process_func returns.
tokenized_ds = ds.map(process_func, remove_columns=ds.column_names)
# Display the resulting dataset summary.
tokenized_ds

Map:   0%|          | 0/26858 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 26858
})

In [13]:
from torch.utils.data import DataLoader
# Sanity check: collate two tokenized samples into one padded batch and inspect it.
dl = DataLoader(tokenized_ds, batch_size=2, collate_fn=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True))
# next(iter(...)) fetches the first batch directly; no need to enumerate and discard the index.
ipt = next(iter(dl))
ipt

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[12968, 29901, 29871, 30982, 31695, 31863, 31577, 30210, 30457, 30502,
         31302, 30858, 30267,    13,    13,  7900, 22137, 29901, 29871, 29871,
         30651, 30557, 30392, 30982, 31695, 31863, 31577, 30210, 30457, 30502,
         31302, 30858, 30383,    13,    13, 29896, 29889, 29871, 30982, 31695,
         31687, 30988, 31704, 30846, 30267, 31951, 30408,   232,   132,   157,
           236,   131,   133, 30948, 30210, 31687, 30988, 31894, 30846, 30214,
         30847,   233,   152,   166,   233,   176,   168, 30330,   235,   186,
           148,   233,   176,   168, 31391,   233,   187,   187,   233,   182,
           182, 30214, 30815,   231,   194,   134, 31174, 30869,   235,   164,
           131, 31624, 31863, 31577, 30214,   232,   165,   161,   232,   191,
           189,   235,   133,   143,   235,   133,   140, 31074, 31180, 30214,
         31666, 30417, 31931, 30909,   232,   138,   146, 31022, 30988, 30908,
         30267,    13,    13, 29906, 2

In [13]:
import torch

# Precision probe: build a tensor holding 1e-8 in the default dtype.
# The recorded output shows the value is preserved (1.0000e-08).
torch.tensor(1e-8)


tensor(1.0000e-08)

In [15]:

# Same value cast to float16: the recorded output is 0., i.e. 1e-8 is lost in half precision.
torch.tensor(1e-8).half()

tensor(0., dtype=torch.float16)