In [None]:
!pip install transformers &>> /dev/null

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import torch
import numpy as np
from torch.utils import data

import nltk
from nltk.probability import LaplaceProbDist, WittenBellProbDist
from nltk.probability import LidstoneProbDist, SimpleGoodTuringProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.tag import UnigramTagger

# Fetch the NLTK resources used by this notebook (the treebank corpus is read
# below; punkt_tab is presumably needed by a later cell - TODO confirm).
for resource in ("treebank", "punkt_tab"):
  nltk.download(resource)

# Tokenizer of the same BERT checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained(
    "google-bert/bert-base-uncased"
)

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# POS-tagged sentences from the NLTK treebank corpus; each sentence is a
# sequence of (word, tag) pairs.
tagged_sents = nltk.corpus.treebank.tagged_sents()

# Sequential split without shuffling: the first 90% of the sentences are used
# for training, the remaining ones for testing.
datasize = len(tagged_sents)
trainsize = int(datasize * 0.9)
train_data, test_data = tagged_sents[:trainsize], tagged_sents[trainsize:]

Most Frequent Class

In [None]:
# Baseline: a unigram tagger fitted on the training sentences and scored on
# the held-out test sentences.
Utagger = UnigramTagger(train_data)
unigram_accuracy = Utagger.accuracy(test_data)
print("Unigram Tagger accuracy:", unigram_accuracy)

Unigram Tagger accuracy: 0.8627989821882952


In [None]:
Hidden Markov Model

In [None]:
# Supervised HMM tagger; probabilities are estimated with LaplaceProbDist.
HMMtrainer = HiddenMarkovModelTrainer()
HMMtagger = HMMtrainer.train_supervised(
    train_data,
    estimator=LaplaceProbDist,
)

# Score the HMM on the same held-out sentences as the unigram baseline.
hmm_accuracy = HMMtagger.accuracy(test_data)
print("HMM accuracy:", hmm_accuracy)

  O[i, k] = self._output_logprob(si, self._symbols[k])
  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])


HMM accuracy: 0.9255979643765904


Chuẩn bị train data cho BERT

In [None]:
def preprocess(data, tokenizer, tagdic):
  """Convert tagged sentences into token-classification examples.

  Every word is encoded on its own. The first sub-token of a word carries the
  word's tag id; all further sub-tokens, as well as the surrounding [CLS] and
  [SEP] tokens, are labelled -100.

  Args:
    data: iterable of sentences, each an iterable of (word, tag) pairs.
    tokenizer: object providing ``convert_tokens_to_ids``, ``encode`` and
      ``unk_token_id``.
    tagdic: mapping from tag string to integer label id.

  Returns:
    A list with one dict per sentence, holding "id" (position of the sentence
    in ``data``), "input_ids" and "labels" (two lists of equal length).

  Raises:
    KeyError: if a tag in ``data`` is missing from ``tagdic``.
  """
  ignore_label = -100
  # The special-token ids do not depend on the sentence, so look them up once.
  cls_id = tokenizer.convert_tokens_to_ids('[CLS]')
  sep_id = tokenizer.convert_tokens_to_ids('[SEP]')

  results = []
  for sent_idx, sentence in enumerate(data):
    ids = [cls_id]
    lbs = [ignore_label]
    for word, tag in sentence:
      piece_ids = tokenizer.encode(word, truncation=True, add_special_tokens=False)
      if not piece_ids:
        # The tokenizer produced nothing for this word; keep the word (and its
        # label) aligned by representing it with the unknown token instead of
        # failing with an IndexError.
        piece_ids = [tokenizer.unk_token_id]
      ids.extend(piece_ids)
      # Only the first sub-token is labelled with the tag.
      lbs.append(tagdic[tag])
      lbs.extend([ignore_label] * (len(piece_ids) - 1))
    ids.append(sep_id)
    lbs.append(ignore_label)
    results.append({"id": sent_idx, "input_ids": ids, "labels": lbs})
  return results


# Tag inventory collected over the whole corpus (train and test sentences).
# sorted() makes the tag -> index mapping deterministic: iterating a bare set
# of strings can yield a different order on every interpreter start, which
# would change the label ids between kernel restarts.
tags = sorted({tag for sent in tagged_sents for _word, tag in sent})
# Index 0 is reserved for "<pad>"; preprocess() itself marks ignored positions
# with -100, not with this entry.
tags = ["<pad>"] + tags

# Lookup tables in both directions: tag -> index and index -> tag.
taglookup = {tag: idx for idx, tag in enumerate(tags)}
idxlookup = {idx: tag for idx, tag in enumerate(tags)}

# Encode both splits into input_ids / labels examples.
train = preprocess(train_data, tokenizer, taglookup)
test = preprocess(test_data, tokenizer, taglookup)



Sử dụng lớp AutoModelForTokenClassification để lưu trữ và cập nhật tham số cho mô hình gán nhãn từ loại.

In [None]:
# Pretrained BERT with a token-classification head sized to the tag set; the
# two label maps let the model report tag names instead of raw indices.
model = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    id2label=idxlookup,
    label2id=taglookup,
    num_labels=len(idxlookup),
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collator that assembles the variable-length examples into batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Fine-tuning hyper-parameters. No output_dir (directory for saved models) is
# given and no evaluation is configured; a checkpoint is saved once per epoch
# and external experiment reporting is switched off.
training_args = TrainingArguments(
    num_train_epochs=2,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to="none",
)

# Only a training set is passed; the test split is not used by the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train,
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=442, training_loss=0.5472343410302072, metrics={'train_runtime': 111.8151, 'train_samples_per_second': 62.997, 'train_steps_per_second': 3.953, 'total_flos': 261094689369768.0, 'train_loss': 0.5472343410302072, 'epoch': 2.0})

Sử dụng mô hình để dự đoán

In [None]:
# Example sentence to run through the fine-tuned tagger.
sent = "You should probably train this model on a down-stream task to be able to use it"

# Wrap the fine-tuned model and its tokenizer in a "ner" pipeline.
tagger = pipeline("ner", tokenizer=tokenizer, model=model)


Device set to use cuda:0


In [None]:
# Tag the example sentence. The result is a list with one dict per token,
# holding the predicted tag ('entity'), its score, the token index, the token
# text and its start/end character offsets in the sentence.
tagger(sent)

[{'entity': 'PRP',
  'score': np.float32(0.9557772),
  'index': 1,
  'word': 'you',
  'start': 0,
  'end': 3},
 {'entity': 'MD',
  'score': np.float32(0.8927413),
  'index': 2,
  'word': 'should',
  'start': 4,
  'end': 10},
 {'entity': 'RB',
  'score': np.float32(0.92568564),
  'index': 3,
  'word': 'probably',
  'start': 11,
  'end': 19},
 {'entity': 'VB',
  'score': np.float32(0.97767913),
  'index': 4,
  'word': 'train',
  'start': 20,
  'end': 25},
 {'entity': 'DT',
  'score': np.float32(0.98473746),
  'index': 5,
  'word': 'this',
  'start': 26,
  'end': 30},
 {'entity': 'NN',
  'score': np.float32(0.97969943),
  'index': 6,
  'word': 'model',
  'start': 31,
  'end': 36},
 {'entity': 'IN',
  'score': np.float32(0.98651624),
  'index': 7,
  'word': 'on',
  'start': 37,
  'end': 39},
 {'entity': 'DT',
  'score': np.float32(0.98749137),
  'index': 8,
  'word': 'a',
  'start': 40,
  'end': 41},
 {'entity': 'JJ',
  'score': np.float32(0.88165456),
  'index': 9,
  'word': 'down',
  'st