# Getting a corpus

In [2]:
from datasets import load_dataset

# This can take a few minutes to load, so grab a coffee or tea while you wait!
# "code_search_net" is built by a loading script hosted on the Hub. `datasets`
# warns that running such a script will require an explicit opt-in, so pass
# `trust_remote_code=True` here. NOTE(review): this executes code downloaded
# from the Hub -- only do so for repositories you trust.
raw_datasets = load_dataset("code_search_net", "python", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
raw_datasets["train"]

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [4]:
len(raw_datasets["train"])

412178

In [5]:
raw_datasets["train"][0].keys()

dict_keys(['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'])

In [6]:
# Show the full source code of one sample function from the training split.
print(raw_datasets["train"][123456]["whole_func_string"])

def get_new_token(self, netloc):
        """Get a new token from BIG-IP and store it internally.

        Throws relevant exception if it fails to get a new token.

        This method will be called automatically if a request is attempted
        but there is no authentication token, or the authentication token
        is expired.  It is usually not necessary for users to call it, but
        it can be called if it is known that the authentication token has
        been invalidated by other means.
        """
        login_body = {
            'username': self.username,
            'password': self.password,
        }

        if self.auth_provider:
            if self.auth_provider == 'local':
                login_body['loginProviderName'] = 'local'
            elif self.auth_provider == 'tmos':
                login_body['loginProviderName'] = 'tmos'
            elif self.auth_provider not in ['none', 'default']:
                providers = self.get_auth_providers(netloc)
         

In [7]:
# No-op default for `info_cb`: takes one argument and returns None.
DEFAULT_MESSAGE_CALLBACK = lambda x: None
def handle_simple_responses(self, timeout_ms=None, info_cb=DEFAULT_MESSAGE_CALLBACK):
    """Accept the device's normal responses and return the final message.

    Args:
      timeout_ms: Timeout in milliseconds to wait for each response.
      info_cb: Optional callback for text sent from the bootloader.

    Returns:
      The message carried by the OKAY packet.
    """
    # Delegate to the shared response loop, expecting an 'OKAY' terminator.
    okay_message = self._accept_responses('OKAY', info_cb, timeout_ms=timeout_ms)
    return okay_message

In [8]:
# Lazily walk the training split in slices of 1,000 rows; each item produced
# is the list of `whole_func_string` values for one slice. Nothing is read
# until the generator is iterated, and it can only be iterated once.
training_corpus = (
    raw_datasets["train"][start : start + 1000]["whole_func_string"]
    for start in range(0, len(raw_datasets["train"]), 1000)
)

In [9]:
gen = (i for i in range(10))
print(list(gen))
print(list(gen))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[]


In [10]:
def get_training_corpus():
    """Return a new generator over the training split in batches of 1,000.

    Each item is the list of `whole_func_string` values for one batch.
    Calling this function again gives a fresh, unconsumed generator.
    """
    train_split = raw_datasets["train"]
    return (
        train_split[start : start + 1000]["whole_func_string"]
        for start in range(0, len(train_split), 1000)
    )


training_corpus = get_training_corpus()

In [11]:
def get_training_corpus():
    """Yield the training split's `whole_func_string` column in batches of 1,000.

    Generator-function form: every call starts a new pass over the data.
    """
    train_split = raw_datasets["train"]
    batch_size = 1000
    for offset in range(0, len(train_split), batch_size):
        yield train_split[offset : offset + batch_size]["whole_func_string"]

In [12]:
type(training_corpus)

generator

# Building your tokenizer from scratch

To understand how to build your tokenizer from scratch, we have to dive a little deeper into the 🤗 Tokenizers library and the tokenization pipeline. This pipeline takes several steps:  

- **Normalization:** Executes all the initial transformations over the initial input string. For example, when you need to lowercase some text, strip it, or apply one of the common Unicode normalization processes, you add a Normalizer.  

- **Pre-tokenization:** In charge of splitting the initial input string. That's the component that decides where and how to pre-segment the origin string. The simplest example would be to simply split on spaces.  

- **Model:** Handles all the sub-token discovery and generation; this is the part that is trainable and really dependent on your input data.

- **Post-Processing:** Provides advanced construction features to be compatible with some of the Transformers-based SoTA models. For instance, for BERT it would wrap the tokenized sentence around [CLS] and [SEP] tokens.


Other directions: 
- **Decoding**: in charge of mapping back a tokenized input to the original string. 

# Hu·∫•n luy·ªán m·ªôt tokenizer m·ªõi

In [13]:
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [14]:
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

tokens = old_tokenizer.tokenize(example)
tokens

['def',
 'ƒ†add',
 '_',
 'n',
 'umbers',
 '(',
 'a',
 ',',
 'ƒ†b',
 '):',
 'ƒä',
 'ƒ†',
 'ƒ†',
 'ƒ†',
 'ƒ†"""',
 'Add',
 'ƒ†the',
 'ƒ†two',
 'ƒ†numbers',
 'ƒ†`',
 'a',
 '`',
 'ƒ†and',
 'ƒ†`',
 'b',
 '`',
 '."',
 '""',
 'ƒä',
 'ƒ†',
 'ƒ†',
 'ƒ†',
 'ƒ†return',
 'ƒ†a',
 'ƒ†+',
 'ƒ†b']

In [15]:
old_tokenizer.is_fast

True

In [16]:
# Ask for a fresh generator instead of reusing `training_corpus`: a generator
# can only be consumed once, so re-running this cell with the old object would
# train on an already exhausted iterator.
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=52000)






In [17]:
tokens = tokenizer.tokenize(example)
print(tokens)


['def', 'ƒ†add', '_', 'numbers', '(', 'a', ',', 'ƒ†b', '):', 'ƒäƒ†ƒ†ƒ†', 'ƒ†"""', 'Add', 'ƒ†the', 'ƒ†two', 'ƒ†numbers', 'ƒ†`', 'a', '`', 'ƒ†and', 'ƒ†`', 'b', '`."""', 'ƒäƒ†ƒ†ƒ†', 'ƒ†return', 'ƒ†a', 'ƒ†+', 'ƒ†b']


In [18]:
tokenizer.is_fast

True

In [19]:
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

27
36


In [20]:
example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weights + self.bias
    """
print(tokenizer.tokenize(example))

['class', 'ƒ†Linear', 'Layer', '():', 'ƒäƒ†ƒ†ƒ†', 'ƒ†def', 'ƒ†__', 'init', '__(', 'self', ',', 'ƒ†input', '_', 'size', ',', 'ƒ†output', '_', 'size', '):', 'ƒäƒ†ƒ†ƒ†ƒ†ƒ†ƒ†ƒ†', 'ƒ†self', '.', 'weight', 'ƒ†=', 'ƒ†torch', '.', 'randn', '(', 'input', '_', 'size', ',', 'ƒ†output', '_', 'size', ')', 'ƒäƒ†ƒ†ƒ†ƒ†ƒ†ƒ†ƒ†', 'ƒ†self', '.', 'bias', 'ƒ†=', 'ƒ†torch', '.', 'zeros', '(', 'output', '_', 'size', ')', 'ƒäƒäƒ†ƒ†ƒ†', 'ƒ†def', 'ƒ†__', 'call', '__(', 'self', ',', 'ƒ†x', '):', 'ƒäƒ†ƒ†ƒ†ƒ†ƒ†ƒ†ƒ†', 'ƒ†return', 'ƒ†x', 'ƒ†@', 'ƒ†self', '.', 'weights', 'ƒ†+', 'ƒ†self', '.', 'bias', 'ƒäƒ†ƒ†ƒ†ƒ†']


# Tokenizer nhanh v√† ch·∫≠m

In [21]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)
print(type(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [22]:
tokenizer.is_fast

True

In [23]:
print(encoding.tokens())

['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in', 'Brooklyn', '.', '[SEP]']


In [24]:
encoding.word_ids()

[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]

In [25]:
start, end = encoding.word_to_chars(3)
example[start:end]

'Sylvain'

## B√™n trong pipeline token-classification

pipeline l√† m·ªôt nh√≥m c√°c model ƒë√£ ƒë∆∞·ª£c code s·∫µn 

In [26]:
from transformers import pipeline

token_classifier = pipeline("token-classification")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

2024-05-17 17:07:12.617401: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification mo

[{'entity': 'I-PER',
  'score': 0.99938285,
  'index': 4,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.99815494,
  'index': 5,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.99590707,
  'index': 6,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.99923277,
  'index': 7,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': 0.9738931,
  'index': 12,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': 0.976115,
  'index': 13,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': 0.9887976,
  'index': 14,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.9932106,
  'index': 16,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

th·ª±c hi·ªán nh√≥m ch√∫ng l·∫°i v·ªõi nhau 

In [27]:
from transformers import pipeline

token_classifier = pipeline("token-classification", aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [28]:
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors="tf")
outputs = model(**inputs)

2024-05-17 17:07:18.188945: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-17 17:07:18.209848: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-17 17:07:18.210021: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [29]:
print(inputs["input_ids"].shape)
print(outputs.logits.shape)



(1, 19)
(1, 19, 9)


Ch√∫ng ta c√≥ m·ªôt l√¥ v·ªõi 1 chu·ªói g·ªìm 19 token v√† m√¥ h√¨nh c√≥ 9 nh√£n kh√°c nhau, v√¨ v·∫≠y ƒë·∫ßu ra c·ªßa m√¥ h√¨nh c√≥ h√¨nh d·∫°ng 1 x 19 x 9. Gi·ªëng nh∆∞ ƒë·ªëi v·ªõi pipeline ph√¢n lo·∫°i vƒÉn b·∫£n, ch√∫ng ta s·ª≠ d·ª•ng h√†m softmax ƒë·ªÉ chuy·ªÉn ƒë·ªïi c√°c logits ƒë√≥ theo x√°c su·∫•t, v√† ch√∫ng ta l·∫•y argmax ƒë·ªÉ nh·∫≠n d·ª± ƒëo√°n (l∆∞u √Ω r·∫±ng ta c√≥ th·ªÉ l·∫•y argmax tr√™n logits v√¨ softmax kh√¥ng thay ƒë·ªïi th·ª© t·ª±):

In [30]:
import tensorflow as tf

probabilities = tf.math.softmax(outputs.logits, axis=-1)[0]
probabilities = probabilities.numpy().tolist()
predictions = tf.math.argmax(outputs.logits, axis=-1)[0]
predictions = predictions.numpy().tolist()
print(predictions)

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]


In [31]:
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

·ªü ƒë√¢y ƒë·ªãnh d·∫°ng B-PER ch·ªâ ƒë∆∞·ª£c s·ª≠ d·ª•ng ƒë·ªÉ ph√¢n t√°ch 2 t·ª´ kh√°c nhau, d√≤ng th·ª© 2 m√†u h·ªìng ƒë∆∞·ª£c s·ª≠ d·ª•ng trong ho√†n c·∫£nh n√†y (c√≤n ƒë·ªÉ d·ªÖ hi·ªÉu v√† ph√¢n bi·ªát r√µ h∆°n ta s·∫Ω nh√¨n v√†o d√≤ng th·ª© 3)

![image.png](attachment:image.png)

In [32]:
tokens = inputs.tokens()
results = []

# Collect label, probability and token text for every position whose
# predicted label is something other than "O".
for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label == "O":
        continue
    score = probabilities[idx][pred]
    results.append({"entity": label, "score": score, "word": tokens[idx]})

print(results)

[{'entity': 'I-PER', 'score': 0.9993829727172852, 'word': 'S'}, {'entity': 'I-PER', 'score': 0.998155415058136, 'word': '##yl'}, {'entity': 'I-PER', 'score': 0.995907187461853, 'word': '##va'}, {'entity': 'I-PER', 'score': 0.9992332458496094, 'word': '##in'}, {'entity': 'I-ORG', 'score': 0.9739148616790771, 'word': 'Hu'}, {'entity': 'I-ORG', 'score': 0.976115882396698, 'word': '##gging'}, {'entity': 'I-ORG', 'score': 0.9888299107551575, 'word': 'Face'}, {'entity': 'I-LOC', 'score': 0.9932070374488831, 'word': 'Brooklyn'}]


In [33]:
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
inputs_with_offsets["offset_mapping"]

[(0, 0),
 (0, 2),
 (3, 7),
 (8, 10),
 (11, 12),
 (12, 14),
 (14, 16),
 (16, 18),
 (19, 22),
 (23, 24),
 (25, 29),
 (30, 32),
 (33, 35),
 (35, 40),
 (41, 45),
 (46, 48),
 (49, 57),
 (57, 58),
 (0, 0)]

# S·ª≠ d·ª•ng pipeline question-answering

In [34]:
from transformers import pipeline

question_answerer = pipeline("question-answering")
context = """
ü§ó Transformers is backed by the three most popular deep learning libraries ‚Äî Jax, PyTorch, and TensorFlow ‚Äî with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back ü§ó Transformers?"
question_answerer(question=question, context=context)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.9802603125572205,
 'start': 78,
 'end': 106,
 'answer': 'Jax, PyTorch, and TensorFlow'}

In [35]:
long_context = """
ü§ó Transformers: State of the Art NLP

ü§ó Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

ü§ó Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

ü§ó Transformers is backed by the three most popular deep learning libraries ‚Äî Jax, PyTorch and TensorFlow ‚Äî with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question_answerer(question=question, context=long_context)

{'score': 0.9714871048927307,
 'start': 1892,
 'end': 1919,
 'answer': 'Jax, PyTorch and TensorFlow'}

In [36]:
long_context[1892:1919]

'Jax, PyTorch and TensorFlow'

ta s·∫Ω ƒëi s√¢u th√™m v√†o m√¥ h√¨nh, S·ª≠ d·ª•ng m√¥ h√¨nh cho t√°c v·ª• h·ªèi ƒë√°p 

In [37]:
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

inputs = tokenizer(question, context, return_tensors="tf") 
outputs = model(**inputs)




All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


In [38]:
question


'Which deep learning libraries back ü§ó Transformers?'

In [39]:
context

"\nü§ó Transformers is backed by the three most popular deep learning libraries ‚Äî Jax, PyTorch, and TensorFlow ‚Äî with a seamless integration\nbetween them. It's straightforward to train your models with one before loading them for inference with the other.\n"

In [40]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits
print(start_logits.shape, end_logits.shape)

(1, 67) (1, 67)


In [41]:
import tensorflow as tf

sequence_ids = inputs.sequence_ids()
# Mask everything except the tokens of the context
mask = [i != 1 for i in sequence_ids]
# Unmask the [CLS] token
mask[0] = False
# [None] adds a leading axis so the mask lines up with the (1, 67) logits
mask = tf.constant(mask)[None]

# Masked positions get a large negative logit; the others keep their value
start_logits = tf.where(mask, -10000, start_logits)
end_logits = tf.where(mask, -10000, end_logits)

c√°c b∆∞·ªõc ƒë·ªÉ x·ª≠ l√Ω m·ªôt tokenizer  
<img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/tokenization_pipeline.svg" width="50%">

ch√∫ng ta s·∫Ω ti·∫øn h√†nh qua 4 b∆∞·ªõc sau:
- normalization   
- pre-tokenization  
- model  
- post-processing  


In [42]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(type(tokenizer.backend_tokenizer)) 

<class 'tokenizers.Tokenizer'>


Thu·ªôc t√≠nh normalizer c·ªßa ƒë·ªëi t∆∞·ª£ng tokenizer c√≥ ph∆∞∆°ng th·ª©c normalize_str() m√† ta c√≥ th·ªÉ d√πng ƒë·ªÉ th·∫•y c√°ch b∆∞·ªõc chu·∫©n ho√° ƒë∆∞·ª£c th·ª±c hi·ªán:

In [43]:
print(tokenizer.backend_tokenizer.normalizer.normalize_str("H√©ll√≤ h√¥w are √º?"))

hello how are u?


## Pre-tokenization

In [44]:
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('how', (7, 10)),
 ('are', (11, 14)),
 ('you', (16, 19)),
 ('?', (19, 20))]

In [45]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

[('Hello', (0, 5)),
 (',', (5, 6)),
 ('ƒ†how', (6, 10)),
 ('ƒ†are', (10, 14)),
 ('ƒ†', (14, 15)),
 ('ƒ†you', (15, 19)),
 ('?', (19, 20))]

In [46]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?")

[('‚ñÅHello,', (0, 6)),
 ('‚ñÅhow', (7, 10)),
 ('‚ñÅare', (11, 14)),
 ('‚ñÅyou?', (16, 20))]

# Tri·ªÉn khai BPE

In [47]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

Ti·∫øp theo, ta c·∫ßn ti·ªÅn tokenize kho ng·ªØ li·ªáu n√†y th√†nh c√°c t·ª´. V√¨ ta ƒëang sao ch√©p m·ªôt b·∫£n BPE tokenizer (nh∆∞ GPT-2), ta v·∫´n c√≥ th·ªÉ s·ª≠ d·ª•ng gpt2 tokenize cho b∆∞·ªõc pre-tokenization:

Copied


In [48]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

Sau ƒë√≥ ta t√≠nh t·∫ßn su·∫•t c·ªßa t·ª´ng t·ª´ trong kho ng·ªØ li·ªáu nh∆∞ khi l√†m v·ªõi pre-tokenization:

In [49]:
from collections import defaultdict

word_freqs = defaultdict(int)
pre_tokenize = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str

# Count how often each pre-tokenized word occurs across the corpus; the
# character offsets returned alongside each word are not needed here.
for text in corpus:
    for word, _offset in pre_tokenize(text):
        word_freqs[word] += 1

print(word_freqs)

defaultdict(<class 'int'>, {'This': 3, 'ƒ†is': 2, 'ƒ†the': 1, 'ƒ†Hugging': 1, 'ƒ†Face': 1, 'ƒ†Course': 1, '.': 4, 'ƒ†chapter': 1, 'ƒ†about': 1, 'ƒ†tokenization': 1, 'ƒ†section': 1, 'ƒ†shows': 1, 'ƒ†several': 1, 'ƒ†tokenizer': 1, 'ƒ†algorithms': 1, 'Hopefully': 1, ',': 1, 'ƒ†you': 1, 'ƒ†will': 1, 'ƒ†be': 1, 'ƒ†able': 1, 'ƒ†to': 1, 'ƒ†understand': 1, 'ƒ†how': 1, 'ƒ†they': 1, 'ƒ†are': 1, 'ƒ†trained': 1, 'ƒ†and': 1, 'ƒ†generate': 1, 'ƒ†tokens': 1})


Ti·∫øp theo ch√∫ng ta s·∫Ω t√≠nh b·ªô t·ª´ v·ª±ng c∆° s·ªü t·ª´ c√°c k√≠ t·ª± s·ª≠ d·ª•ng trong kho ng·ªØ li·ªáu:

In [50]:
# Base vocabulary: every distinct character that appears in any counted word,
# in sorted order.
alphabet = sorted({letter for word in word_freqs for letter in word})

print(alphabet)

[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'ƒ†']


In [51]:
vocab = ["<|endoftext|>"] + alphabet.copy()

In [52]:
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'ƒ†']


In [53]:
def tokenize(text):
    """Tokenize `text` by applying the learned merges to each pre-tokenized word.

    The text is pre-tokenized with the module-level `tokenizer`, each word is
    split into single characters, and every merge rule is then applied in the
    iteration order of `merges`.

    Args:
        text: The string to tokenize.

    Returns:
        A flat list of tokens for the whole text.
    """
    # NOTE(review): relies on a module-level `merges` mapping of
    # (left, right) -> merged token that is not defined in the visible cells;
    # confirm it is built before this function is called.
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    splits = [list(word) for word, _offset in pre_tokenize_result]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    # Replace the matched pair in place and re-check position i,
                    # since the merged token may itself start another match.
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    # Flatten in a single pass; sum(splits, []) copies the accumulated list
    # once per word.
    return [token for split in splits for token in split]


ƒêI·ªÇM M·∫†NH:
1. t·∫°o ƒë∆∞·ª£c nh·ªØng th√†nh ph·∫ßn t·ª´ b·ªã thi·∫øu (sub-word)

ƒêI·ªÇM Y·∫æU:
1. kh√¥ng th·ªÉ t·∫°o ƒë∆∞·ª£c t·ª´ m·ªõi

# WordPiece tokenization


ƒêI·ªÇM M·∫†NH:
1. so v·ªõi BPE, tokenizer n√†y h·ªçc c√°c ph·∫ßn c·ªßa t·ª´ nh∆∞ l√† token nhanh h∆°n m·ªôt ch√∫t.



# Unigram tokenization

T·∫°i m·ªói b∆∞·ªõc c·ªßa qu√° tr√¨nh hu·∫•n luy·ªán, thu·∫≠t to√°n Unigram t√≠nh to√°n s·ª± m·∫•t m√°t tr√™n kho ng·ªØ li·ªáu ƒë∆∞·ª£c cung c·∫•p t·ª´ v·ª±ng hi·ªán t·∫°i. Sau ƒë√≥, ƒë·ªëi v·ªõi m·ªói k√Ω hi·ªáu trong t·ª´ v·ª±ng, thu·∫≠t to√°n s·∫Ω t√≠nh to√°n m·ª©c ƒë·ªô t·ªïn th·∫•t t·ªïng th·ªÉ s·∫Ω tƒÉng l√™n bao nhi√™u n·∫øu k√Ω hi·ªáu b·ªã x√≥a v√† t√¨m ki·∫øm c√°c k√Ω hi·ªáu l√†m tƒÉng n√≥ √≠t nh·∫•t.


n·∫øu ch√∫ng ta s·ª≠ d·ª•ng m√¥ h√¨nh ng√¥n ng·ªØ Unigram ƒë·ªÉ t·∫°o vƒÉn b·∫£n, ch√∫ng ta s·∫Ω lu√¥n d·ª± ƒëo√°n token ph·ªï bi·∫øn nh·∫•t.



-----------
GPT-3 dùng byte-level BPE (giống GPT-2); WordPiece là thuật toán tokenization của BERT.

# X√¢y d·ª±ng m·ªôt WordPiece tokenizer t·ª´ ƒë·∫ßu


# Thu th·∫≠p m·ªôt kho ng·ªØ li·ªáu

#### ta s·∫Ω chia nh·ªè th√†nh c√°c batch ƒë·ªÉ kh√¥ng b·ªã over ram memory 

In [54]:
from datasets import load_dataset

dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")


def get_training_corpus():
    """Yield the dataset's "text" column in batches of 1,000 rows.

    Batching keeps only one slice of the corpus in memory at a time.
    """
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]

# The dump of the corpus to "wikitext-2.txt" is done once, in the next cell;
# the identical copy of that loop that used to sit here was removed.

In [55]:
# Dump the corpus to a text file, one dataset row per line. Writing batch by
# batch avoids indexing the dataset once per row.
with open("wikitext-2.txt", "w", encoding="utf-8") as f:
    for batch in get_training_corpus():
        f.writelines(text + "\n" for text in batch)

In [56]:
len(dataset[5]['text'])

574

In [130]:
for  i in range(10):
    print(len(dataset[i]['text']))

0
30
0
706
524
574
0
19
0
1221


##### X√¢y d·ª±ng m·ªôt WordPiece tokenizer t·ª´ ƒë·∫ßu

In [57]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

Ta cần chỉ rõ `unk_token` để mô hình biết trả về gì khi gặp token chưa biết. Ta cũng có thể thiết lập `max_input_chars_per_word`, tương ứng với độ dài tối đa cho một từ; từ dài hơn giới hạn này sẽ bị chia nhỏ.

b∆∞·ªõc ƒë·∫ßu ch√∫ng ta s·∫Ω ƒëi chu·∫©n ho√°, let's go 

Vì BERT được sử dụng rộng rãi, ta có thể sử dụng BertNormalizer với các tuỳ chọn kinh điển để thiết lập cho BERT: lowercase và strip_accents.

In [58]:
from sklearn.feature_extraction.text import strip_accents_ascii

text = "H√©llo, h√≥w √°re y√≥u?"
text_without_accents = strip_accents_ascii(text)

print(text_without_accents)

Hello, how are you?


In [59]:
# BERT-style variant (NFD -> lowercase -> strip accents), kept for reference:
# tokenizer.normalizer = normalizers.Sequence(
#     [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
# )

# Active variant: NFD decomposition and lowercasing only. StripAccents is left
# out, so accented characters keep their (decomposed) combining marks.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase()]
)

Ta c≈©ng c√≥ th·ªÉ s·ª≠ d·ª•ng chu·∫©n ho√° Unicode NFD Unicode normalizer, v√¨ n·∫øu kh√¥ng chu·∫©n ho√° StripAccents s·∫Ω kh√¥ng nh·∫≠n di·ªán ƒë∆∞·ª£c nh·ªØng k√≠ t·ª± c√≥ d·∫•u v√† kh√¥ng th·ªÉ t√°ch n√≥ ƒë√∫ng nh∆∞ ta mu·ªën.



In [60]:
print(tokenizer.normalizer.normalize_str("H√©ll√≤ h√¥w are √º?"))
print(tokenizer.normalizer.normalize_str("ok ch√∫ng ta n√™n ƒë·ªÉ nh∆∞ th·∫ø n√†y s·∫Ω hay h∆°n ƒë·∫•y c√°c b·∫°n ·∫°"))

heÃÅlloÃÄ hoÃÇw are uÃà?
ok chuÃÅng ta neÃÇn ƒëeÃÇÃâ nhuÃõ theÃÇÃÅ naÃÄy seÃÉ hay hoÃõn ƒëaÃÇÃÅy caÃÅc baÃ£n aÃ£


***ƒë√†o s√¢u h∆°n ***  
Nếu bạn kiểm tra hai phiên bản chuẩn hoá trước đó trên cùng một chuỗi chứa ký tự unicode u"\u0085", bạn chắc chắn sẽ nhận thấy rằng hai cách chuẩn hoá này không giống nhau. Để tránh phức tạp hoá phiên bản với normalizers.Sequence quá nhiều, chúng tôi sẽ không bao gồm sự thay thế theo Regex mà BertNormalizer yêu cầu khi tham số clean_text được thiết lập là True — đây cũng là giá trị mặc định. Nhưng đừng lo: ta vẫn có thể nhận được kết quả chuẩn hoá giống hệt mà không cần dùng BertNormalizer, bằng cách thêm thủ công hai normalizers.Replace vào chuỗi chuẩn hoá.




Ti·∫øp theo l√† b∆∞·ªõc pre-tokenization. M·ªôt l·∫ßn n·ªØa, ta c√≥ BertPreTokenizer ƒë∆∞·ª£c x√¢y d·ª±ng s·∫µn ƒë·ªÉ d√πng:

In [61]:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

Ho·∫∑c ta c√≥ th·ªÉ x√¢y t·ª´ ƒë·∫ßu:

In [62]:
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

L∆∞u √Ω r·∫±ng Whitespace s·∫Ω t√°ch theo d·∫•u c√°ch v√† c√°c k√≠ t·ª± kh√¥ng ph·∫£i ch·ªØ c√°i, s·ªë, ho·∫∑c d·∫•u g·∫°ch d∆∞·ªõi, n√™n v·ªÅ m·∫∑t k·ªπ thu·∫≠t n√≥ s·∫Ω t√°ch theo d·∫•u c√°ch v√† d·∫•u c√¢u:

In [63]:
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

t√°ch theo d·∫•u c√°ch 

In [64]:
pre_tokenizer = pre_tokenizers.WhitespaceSplit()
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[("Let's", (0, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre-tokenizer.', (14, 28))]

b·∫°n c√≥ th·ªÉ k·∫øt h·ª£p c√°c pre-tokenizer v·ªõi nhau

In [65]:
pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
print(pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."))

pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.Punctuation()]
)
print(pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."))

[('Let', (0, 3)), ("'", (3, 4)), ('s', (4, 5)), ('test', (6, 10)), ('my', (11, 13)), ('pre', (14, 17)), ('-', (17, 18)), ('tokenizer', (18, 27)), ('.', (27, 28))]
[('Let', (0, 3)), ("'", (3, 4)), ('s test my pre', (4, 17)), ('-', (17, 18)), ('tokenizer', (18, 27)), ('.', (27, 28))]


B∆∞·ªõc ti·∫øp theo trong pipeline tokenize l√† ƒë∆∞a ƒë·∫ßu v√†o qua m√¥ h√¨nh. Ta ƒë√£ ch·ªâ ƒë·ªãnh m√¥ h√¨nh c·ªßa m√¨nh khi kh·ªüi t·∫°o, nh∆∞ng ta v·∫´n c·∫ßn hu·∫•n luy·ªán n√≥, ƒëi·ªÅu n√†y c·∫ßn t·ªõi WordPieceTrainer.

v·∫•n ƒë·ªÅ ·ªü ƒë√¢y l√† khi kh·ªüi ƒë·ªông m·ªôt tr√¨nh hu·∫•n luy·ªán trong hugging face th√¨ b·∫°n c·∫ßn truy·ªÅn t·∫•t c·∫£ c√°c k√Ω t·ª± ƒë·∫∑c bi·ªát b·∫°n c·∫ßn khi s·ª≠ d·ª•ng,  n·∫øu kh√¥ng n√≥ s·∫Ω kh√¥ng th√™m v√†o b·ªô t·ª´ v·ª±ng, v√¨ ch√∫ng kh√¥ng c√≥ trong kho ng·ªØ li·ªáu hu·∫•n luy·ªán 

V√≠ d·ª•, trong BERT, c√°c token ƒë·∫∑c bi·ªát bao g·ªìm [CLS], [SEP], v√† [MASK]. Nh·ªØng token n√†y c√≥ √Ω nghƒ©a ƒë·∫∑c bi·ªát trong m√¥ h√¨nh v√† kh√¥ng n√™n b·ªã chia nh·ªè. ƒê·ªÉ tr√°nh vi·ªác ch√∫ng b·ªã chia nh·ªè, ta c·∫ßn th√™m ch√∫ng v√†o b·ªô t·ª´ v·ª±ng tr∆∞·ªõc khi hu·∫•n luy·ªán m√¥ h√¨nh.

In [66]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

Ngoài việc chỉ định vocab_size và special_tokens, ta còn có thể thiết lập min_frequency (số lần một token phải xuất hiện để được thêm vào bộ từ vựng) hoặc thay đổi continuing_subword_prefix (nếu ta muốn sử dụng thứ gì khác ngoài ##).

ƒê·ªÉ hu·∫•n luy·ªán m·ªôt m√¥ h√¨nh s·ª≠ d·ª•ng tr√¨nh l·∫∑p ta ƒë·ªãnh nghƒ©a tr∆∞·ªõc ƒë√≥, ta ch·ªâ c·∫ßn th·ª±c hi·ªán l·ªánh n√†y:

In [67]:
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






ch√∫ng ta c≈©ng c√≥ th·ªÉ s·ª≠ d·ª•ng c√°c t·ªáp vƒÉn b·∫£n ƒë·ªÉ hu·∫•n luy·ªán tokenizer c·ªßa m√¨nh nh∆∞ sau (ta t√°i kh·ªüi t·∫°o m√¥ h√¨nh v·ªõi m·ªôt WordPiece r·ªóng):



In [68]:
tokenizer.model = models.WordPiece(unk_token="[UNK]")
tokenizer.train(["wikitext-2.txt"], trainer=trainer)






In [69]:
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.']


encoding thu được là một đối tượng Encoding chứa tất cả các đầu ra cần thiết của tokenizer trong các thuộc tính của nó: ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask và overflowing.

Hậu xử lý (post-processing):

Ta cần thêm token [CLS] ở đầu và token [SEP] ở cuối (hoặc sau mỗi câu nếu ta có cặp câu). Chúng ta sẽ sử dụng `TemplateProcessing` để thực hiện điều này.

In [70]:
# Look up the vocabulary IDs of [CLS] and [SEP]; the template post-processor
# configured in the next cell needs them.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


Để viết bản mẫu cho TemplateProcessing, chúng ta phải chỉ định cách xử lý một câu đơn và một cặp câu. Đối với cả hai, chúng ta viết các token đặc biệt muốn sử dụng; câu đầu tiên (hoặc câu đơn) được biểu thị bằng $A, trong khi câu thứ hai (nếu mã hoá một cặp câu) được biểu thị bằng $B. Đối với mỗi thành phần này (token đặc biệt và câu), chúng ta cũng chỉ định token type ID tương ứng sau dấu hai chấm.

Do đó, bản mẫu BERT cổ điển được định nghĩa như sau:

In [71]:
# BERT-style template: [CLS] A [SEP] for a single sentence and
# [CLS] A [SEP] B [SEP] for a pair. The number after each ":" is the token
# type ID. The templates are plain strings (no f-prefix): "$A"/"$B" are
# TemplateProcessing placeholders, not Python format fields.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

Lưu ý rằng chúng ta cần truyền vào ID của các token đặc biệt để tokenizer có thể chuyển đổi chúng thành ID tương ứng.

Một khi đã thêm vào, chúng ta có thể quay lại ví dụ trước đó và sẽ nhận được:

In [72]:
# Re-encode the same sample now that the template post-processor is attached.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


Và trên một cặp câu, chúng ta có thể có được kết quả sau:

In [73]:
# Encode a sentence pair; type_ids shows which template segment each token
# was assigned to (0 for the first sentence, 1 for the second).
encoding = tokenizer.encode("Let's test this tokenizer.." , "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '..', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


Chúng ta đã gần như hoàn thành việc xây dựng tokenizer này từ đầu — bước cuối cùng là thêm vào một trình giải mã:

In [74]:
# "##" is the continuing-subword prefix used by this WordPiece vocabulary
# (e.g. '##eni', '##zer' in the outputs above).
tokenizer.decoder = decoders.WordPiece(prefix="##")

Hãy cùng kiểm thử với encoding:

In [75]:
# `encoding` still holds the sentence pair encoded a few cells above.
tokenizer.decode(encoding.ids)

"let ' s test this tokenizer.. on a pair of sentences."

Tuyệt vời! Ta có thể lưu tokenizer của mình vào trong một tệp JSON như dưới đây:

In [76]:
# Write the tokenizer to tokenizer.json (relative to the working directory).
tokenizer.save("tokenizer.json")

Sau đó, ta có thể load lại tệp này vào một đối tượng Tokenizer với phương thức from_file():

In [77]:
# Reload the file saved in the previous cell into a separate Tokenizer object.
new_tokenizer = Tokenizer.from_file("tokenizer.json")

Để sử dụng tokenizer này trong 🤗 Transformers, chúng ta phải bọc nó trong PreTrainedTokenizerFast.

Chúng ta có thể sử dụng lớp chung hoặc, nếu tokenizer của chúng ta tương ứng với một mô hình hiện có, hãy sử dụng lớp đó (ở đây là BertTokenizerFast). Nếu bạn áp dụng bài học này để xây dựng một tokenizer hoàn toàn mới, bạn sẽ phải sử dụng tùy chọn đầu tiên.



Để bọc tokenizer trong một PreTrainedTokenizerFast, chúng ta có thể truyền tokenizer mà chúng ta đã xây dựng dưới dạng tokenizer_object hoặc truyền tệp tokenizer chúng ta đã lưu dưới dạng tokenizer_file. Điều quan trọng cần nhớ là chúng ta phải đặt thủ công tất cả các token đặc biệt, vì lớp đó không thể suy ra từ đối tượng tokenizer token nào là token [MASK], [CLS], [SEP], [PAD], ...

In [78]:
from transformers import PreTrainedTokenizerFast

# The generic wrapper is given every special token explicitly.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    # tokenizer_file="tokenizer.json",  # Alternative: load from the saved tokenizer file
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

Nếu bạn đang sử dụng một lớp tokenizer cụ thể (như BertTokenizerFast), bạn chỉ cần chỉ định những token đặc biệt khác so với mặc định (ở đây là không có token nào, do các token này đã trùng với mặc định):

In [79]:
from transformers import BertTokenizerFast

# Model-specific wrapper: no special tokens are passed here.
# This rebinds `wrapped_tokenizer` from the previous cell.
wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

Bạn có thể sử dụng tokenizer này như bất kỳ tokenizer nào khác của 🤗 Transformers. Bạn có thể lưu nó với phương thức save_pretrained(), hoặc push nó lên Hub với phương thức push_to_hub().

Giờ chúng ta đã thấy cách xây dựng bộ WordPiece tokenizer, hãy làm tương tự đối với BPE tokenizer. Chúng ta sẽ tiến hành nhanh hơn một chút vì bạn đã biết tất cả các bước, và chỉ làm nổi bật những điểm khác biệt.

# Xây dựng một BPE tokenizer từ đầu


Giờ hãy cùng nhau xây dựng GPT-2 tokenizer. Giống như BERT tokenizer, chúng ta bắt đầu bằng việc khởi tạo Tokenizer với mô hình BPE:

In [80]:
# Start over with an untrained BPE model; this rebinds `tokenizer`, replacing
# the WordPiece tokenizer built above.
tokenizer = Tokenizer(models.BPE())

Cũng giống như BERT, chúng ta có thể khởi tạo mô hình này với một bộ từ vựng nếu ta đã có (ta sẽ cần truyền vào vocab và merges trong trường hợp này), nhưng vì ta sẽ huấn luyện từ đầu, chúng ta không cần làm vậy. Ta cũng không cần chỉ định unk_token vì GPT-2 sử dụng BPE cấp byte, phương pháp vốn không cần đến token này.

GPT-2 không sử dụng một trình chuẩn hoá, nên ta có thể bỏ qua bước này và đi trực tiếp vào bước pre-tokenization:

In [81]:
# add_prefix_space=False: do not add a space at the start of the input.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

Tuỳ chọn add_prefix_space=False mà ta truyền cho ByteLevel ở đây có nghĩa là không thêm dấu cách vào đầu câu (mặc định là có thêm). Ta có thể xem kết quả pre-tokenization trên một văn bản ví dụ tương tự như ở trên:

In [82]:
# Preview the pre-tokenizer's splits and character offsets on a sample string.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

[('Let', (0, 3)),
 ("'s", (3, 5)),
 ('Ġtest', (5, 10)),
 ('Ġpre', (10, 14)),
 ('-', (14, 15)),
 ('tokenization', (15, 27)),
 ('!', (27, 28))]

(nhắc lại: Ġ là ký hiệu dấu cách)

Tiếp theo là mô hình mà ta cần huấn luyện. Với GPT-2, token đặc biệt duy nhất cần là token kết thúc văn bản:

In [83]:
# The only special token passed to the BPE trainer is the end-of-text token.
# This rebinds `trainer`, replacing the WordPiece trainer from earlier.
trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






Như với WordPieceTrainer, ngoài vocab_size và special_tokens, ta có thể chỉ định min_frequency nếu muốn, hoặc nếu ta có hậu tố kết thúc từ (như </w>), ta có thể thiết lập nó với end_of_word_suffix.

***Tokenizer này cũng có thể được huấn luyện trên các tệp văn bản:***

In [84]:
# Replace the trained model with an empty BPE model, then train it from a
# text file instead of the iterator.
# NOTE(review): expects wikitext-2.txt in the working directory — confirm it
# is written out earlier in the notebook.
tokenizer.model = models.BPE()
tokenizer.train(["wikitext-2.txt"], trainer=trainer)







Hãy cùng xem kết quả tokenize trên một văn bản mẫu:

In [85]:
# Sanity-check the trained BPE tokenizer on the same sample sentence.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['L', 'et', "'", 's', 'Ġtest', 'Ġthis', 'Ġto', 'ken', 'izer', '.']


Ta áp dụng hậu xử lý cấp byte cho GPT-2 tokenizer như sau:

In [86]:
# trim_offsets=False: a token's offsets keep the leading space (the next
# cell's slice yields ' test', space included).
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

Tuỳ chọn trim_offsets=False cho trình hậu xử lý biết rằng ta cần giữ nguyên offset của các token bắt đầu với "Ġ": theo cách này, điểm bắt đầu của offset sẽ trỏ vào dấu cách phía trước của từ, chứ không phải ký tự đầu tiên của từ (vì dấu cách này về mặt kỹ thuật là một phần của token). Hãy cùng nhìn xem kết quả với chuỗi văn bản ta vừa mã hoá, với 'Ġtest' là token ở chỉ mục 4:

In [101]:
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
# Token 4 is the "test" token; slice the sentence with its offsets to see
# which characters it covers.
start, end = encoding.offsets[4]
sentence[start:end]

' test'

In [102]:
# Show the tokens, their character offsets, and the text each offset covers.
# Relies on `sentence` and `encoding` from the previous cell.
print(encoding.tokens)
print(encoding.offsets)
# Cap the listing at the first 10 spans (a slice replaces the manual counter;
# the bare `len(...)` expression was dropped because it displayed nothing).
for start, end in encoding.offsets[:10]:
    print(sentence[start:end])

['L', 'et', "'", 's', 'Ġtest', 'Ġthis', 'Ġto', 'ken', 'izer', '.']
[(0, 1), (1, 3), (3, 4), (4, 5), (5, 10), (10, 15), (15, 18), (18, 21), (21, 25), (25, 26)]
L
et
'
s
 test
 this
 to
ken
izer
.


Cuối cùng, ta thêm một trình giải mã cấp byte:


In [103]:
# Byte-level decoder, matching the byte-level pre-tokenizer set above.
tokenizer.decoder = decoders.ByteLevel()


Và kiểm tra xem nó hoạt động đúng chưa:

In [104]:
# `encoding` is the sentence encoded in the cells above; decoding its IDs
# should give the original text back.
tokenizer.decode(encoding.ids)

"Let's test this tokenizer."

==> Nó đã tái tạo (reproduce) lại đúng câu ban đầu, nên trình giải mã hoạt động đúng.

Giờ ta đã xong rồi, ta có thể lưu tokenizer như trên, và bao nó lại trong PreTrainedTokenizerFast hoặc GPT2TokenizerFast nếu ta muốn dùng nó trong 🤗 Transformers:

In [108]:
# Option 1: generic wrapper, with the special tokens declared explicitly.
from transformers import PreTrainedTokenizerFast 
wrapped_tokenizer  = PreTrainedTokenizerFast(
    tokenizer_object = tokenizer,
    bos_token = "<|endoftext|>",
    eos_token = "<|endoftext|>",
)

In [109]:
# Option 2: use the ready-made GPT2TokenizerFast class, which does not need
# the extra special-token arguments used in option 1.
from transformers import GPT2TokenizerFast
wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)

# Xây dựng một Unigram tokenizer từ đầu

Hãy cùng nhau xây dựng một XLNet tokenizer. Cũng giống như các tokenizer trước đó, ta có thể bắt đầu khởi tạo Tokenizer với một mô hình Unigram:

In [110]:
# Start over with an untrained Unigram model; this rebinds `tokenizer`,
# replacing the BPE tokenizer built above.
tokenizer = Tokenizer(models.Unigram())

Một lần nữa, chúng ta có thể khởi tạo mô hình này với một bộ từ vựng nếu có.

Về bước chuẩn hoá, XLNet sử dụng một vài phép thay thế (đến từ SentencePiece):



In [111]:
from tokenizers import Regex

# The normalizers run in the order listed.
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Replace("``", '"'),  # `` -> "
        normalizers.Replace("''", '"'),  # '' -> "
        normalizers.NFKD(),  # Unicode NFKD normalization
        normalizers.StripAccents(),  # remove accents
        normalizers.Replace(Regex(" {2,}"), " "),  # collapse runs of 2+ spaces into one
    ]
)

Điều này thay thế `` và '' bằng " (tức là chuẩn hoá dấu mở và đóng ngoặc kép về cùng một ký tự ngoặc kép), thay thế bất kỳ chuỗi nào gồm hai hoặc nhiều dấu cách liền nhau thành một dấu cách duy nhất, cũng như loại bỏ các dấu (accent) có trong văn bản cần tokenize.

Pre-tokenizer được sử dụng cho SentencePiece tokenizer là Metaspace:

In [112]:
# Metaspace pre-tokenizer with its default settings.
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()

Ta có thể nhìn vào đầu ra của bước pre-tokenization qua ví dụ văn bản dưới đây:

In [113]:
# Preview the pre-tokenizer's splits and character offsets on a sample string.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test the pre-tokenizer!")

[("▁Let's", (0, 5)),
 ('▁test', (5, 10)),
 ('▁the', (10, 14)),
 ('▁pre-tokenizer!', (14, 29))]

Tiếp theo là mô hình ta cần huấn luyện. XLNet có một số token đặc biệt:



In [117]:
# XLNet-style special tokens. `special_tokens` and `trainer` rebind the names
# used for the earlier tokenizers.
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
# unk_token is passed to the trainer explicitly in addition to being listed
# among the special tokens.
trainer = trainers.UnigramTrainer(
    vocab_size=25000, special_tokens=special_tokens, unk_token="<unk>"
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)





Một tham số vô cùng quan trọng mà ta không thể quên của UnigramTrainer là unk_token. Ta có thể truyền vào các tham số cụ thể khác tới thuật toán Unigram, ví dụ shrinking_factor cho các bước mà ta xoá token (mặc định là 0.75) hoặc max_piece_length để chỉ định độ dài tối đa của một token (mặc định là 16).

Tokenizer này cũng có thể được huấn luyện trên các tệp văn bản:

In [115]:
# NOTE(review): file-based training is left disabled here; the next cell
# trains from a line-by-line iterator over the same file instead — confirm
# this is intentional, or delete this cell.
# tokenizer.model = models.Unigram()
# tokenizer.train(["wikitext-2.txt"], trainer=trainer)





In [132]:
def line_by_line_text_generator(file_path):
    """Lazily yield the lines of a UTF-8 text file, one at a time.

    Lines are yielded exactly as read (trailing newline kept, blank lines
    included). The file stays open only while the generator is being
    consumed.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        yield from handle

# Stream the file's lines lazily (the generator can only be consumed once).
text_iterator = line_by_line_text_generator("wikitext-2.txt")

# Train with the `trainer` object created in the earlier UnigramTrainer cell.
# NOTE(review): the saved output under this cell is a TypeError about an
# unexpected 'unk_token' keyword, which this call does not pass — the output
# looks stale; re-run the cell to refresh it.
tokenizer.train_from_iterator(text_iterator, trainer=trainer)

TypeError: Tokenizer.train_from_iterator() got an unexpected keyword argument 'unk_token'

In [131]:
# Sanity-check the trained Unigram tokenizer on the same sample sentence.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)
# Uncomment to also inspect the token type IDs:
# print(encoding.type_ids)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.']


Một đặc điểm đặc biệt của XLNet là nó thêm <cls> ở cuối mỗi câu, với kiểu ID là 2 (để phân biệt với các token khác). Hệ quả là nó đệm (padding) vào phía bên trái. Ta có thể xử lý tất cả các token đặc biệt và các token type ID với cùng một bản mẫu, như BERT, nhưng đầu tiên ta phải lấy các ID của token <cls> và <sep>:

In [133]:
# Look up the vocabulary IDs of <cls> and <sep>; the template post-processor
# configured in the next cell needs them. This rebinds the BERT-era variables.
cls_token_id = tokenizer.token_to_id("<cls>")
sep_token_id = tokenizer.token_to_id("<sep>")
print(cls_token_id, sep_token_id)

0 1


Bản mẫu sẽ trông như sau:

In [134]:
# XLNet-style template: A <sep> <cls> for a single sentence and
# A <sep> B <sep> <cls> for a pair. The number after each ":" is the token
# type ID; the trailing <cls> gets type ID 2.
tokenizer.post_processor = processors.TemplateProcessing(
    single="$A:0 <sep>:0 <cls>:2",
    pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
    special_tokens=[("<sep>", sep_token_id), ("<cls>", cls_token_id)],
)

Và ta có thể kiểm tra xem nó hoạt động không bằng cách mã hoá một cặp câu:

In [135]:
# Encode a sentence pair to check the template: tokens show where <sep>/<cls>
# were inserted, type_ids shows the segment assigned to each token.
encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences!")
print(encoding.tokens)
print(encoding.type_ids)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.', '.', '.', '<sep>', '▁', 'on', '▁', 'a', '▁pair', '▁of', '▁sentence', 's', '!', '<sep>', '<cls>']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]


Cuối cùng, ta sẽ thêm trình giải mã Metaspace:

In [136]:
# Metaspace decoder, matching the Metaspace pre-tokenizer set above.
tokenizer.decoder = decoders.Metaspace()

Ta có thể lưu tokenizer như trên, và bao nó lại trong PreTrainedTokenizerFast hoặc XLNetTokenizerFast nếu ta muốn dùng nó trong 🤗 Transformers. Một điểm cần lưu ý là khi sử dụng PreTrainedTokenizerFast thì ngoài các token đặc biệt, ta cần nói cho thư viện 🤗 Transformers biết ta cần đệm vào phía bên trái:

In [137]:
from transformers import PreTrainedTokenizerFast

# Generic wrapper: every special token is declared explicitly, and
# padding_side="left" requests padding on the left.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>",
    padding_side="left",
)

Hoặc một cách khác:

In [139]:
from transformers import XLNetTokenizerFast

# Model-specific wrapper: no special tokens or padding side are passed here.
# This rebinds `wrapped_tokenizer` from the previous cell.
wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)