# Ex5. Named-entity Recognition

## 1. Set up

In [None]:
# Optional extras, currently disabled (kept for reference):
#!pip3 install pyicu
#!pip3 install pycld2
# Install the Hugging Face libraries used in this notebook.
# NOTE(review): versions are not pinned; the recorded run resolved datasets 2.7.1,
# transformers 4.24.0, seqeval 1.2.2 and evaluate 0.3.0 - consider pinning them.
!pip install datasets transformers seqeval evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 6.5 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 47.3 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.0 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 725 kB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 51.3 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 28.3 MB/s 
Collecting responses<0.19
  D

In [None]:
import pandas as pd
import datasets
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, AutoModelForTokenClassification, TrainingArguments, DataCollatorForTokenClassification
import numpy as np
import evaluate
import torch

### 1.2 The GPU information

In [None]:
# Check if device supports CUDA interface
CUDA = torch.cuda.is_available()
# Run on the first GPU (cuda:0) when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if CUDA else 'cpu')
# torch.cuda.set_device only accepts CUDA devices and raises for a CPU device,
# so the default GPU is selected only when CUDA is actually present.
if CUDA:
    torch.cuda.set_device(device)
print('Using device:', device)

Using device: cuda:0


In [None]:
# Check and print information about available GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Nov 30 22:31:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Get GPU name: `nvidia-smi -L` prints one line per GPU with its model name and UUID
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-fc2a553e-d51d-de2e-1037-95cabf7b5d02)


In [None]:
# Report the total RAM of the runtime, converted from bytes to gigabytes (1e9 bytes)
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes with at least 20 GB are reported as "high-RAM"
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


## 2. Generate the dataset

First, check which languages are available in the dataset.

In [None]:
# List every configuration of the polyglot_ner dataset (one per language code, plus 'combined')
configs = datasets.get_dataset_config_names("polyglot_ner")
print(configs)

Downloading builder script:   0%|          | 0.00/6.01k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

['ca', 'de', 'es', 'fi', 'hi', 'id', 'ko', 'ms', 'pl', 'ru', 'sr', 'tl', 'vi', 'ar', 'cs', 'el', 'et', 'fr', 'hr', 'it', 'lt', 'nl', 'pt', 'sk', 'sv', 'tr', 'zh', 'bg', 'da', 'en', 'fa', 'he', 'hu', 'ja', 'lv', 'no', 'ro', 'sl', 'th', 'uk', 'combined']


In this exercise, we choose German (`de`) for the NER task.

In [None]:
# Load the German ('de') configuration of polyglot_ner; it provides a single 'train' split
deutsch_dataset = datasets.load_dataset('polyglot_ner', 'de')
print("Length of the dataset:" + str(deutsch_dataset['train'].num_rows))
# Display the split layout (feature names and row count)
deutsch_dataset

Downloading and preparing dataset polyglot_ner/de to /root/.cache/huggingface/datasets/polyglot_ner/de/1.0.0/bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1...


Downloading data:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/547578 [00:00<?, ? examples/s]

Dataset polyglot_ner downloaded and prepared to /root/.cache/huggingface/datasets/polyglot_ner/de/1.0.0/bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Length of the dataset:547578


DatasetDict({
    train: Dataset({
        features: ['id', 'lang', 'words', 'ner'],
        num_rows: 547578
    })
})

This cell collects the distinct labels that occur in the dataset.

In [None]:
# Collect every distinct NER tag of the train split, in order of first appearance
train_split = deutsch_dataset['train']
label_set = []
for row_idx in range(len(train_split)):
    for tag in train_split[row_idx]['ner']:
        if tag in label_set:
            continue  # tag already recorded
        label_set.append(tag)
label_set

['O', 'ORG', 'LOC', 'PER']

Change the labels from string type to integer type. 

In [None]:
# Mapping from the string tags of the dataset to the integer class ids used for training
label_encoding_dict = {'O': 0,'ORG': 1, 'LOC': 2, 'PER': 3}
# Convert the collected tags to their ids. Entries that are already integers (i.e. this
# cell has been run before) are kept as they are, so re-running the cell no longer raises
# a KeyError; an unknown string tag still raises a KeyError as before.
label_set = [label_encoding_dict[lb] if isinstance(lb, str) else lb for lb in label_set]

## 3. Generate the model

In this exercise, we choose the [`bert-base-german-cased`](https://huggingface.co/bert-base-german-cased) model for the NER task because it is widely used (about 131k downloads).

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, AutoModelForTokenClassification, TrainingArguments

In [None]:
# Load the tokenizer that belongs to the bert-base-german-cased checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/485k [00:00<?, ?B/s]

In [None]:
# Inspect one raw example: a list of words and the parallel list of NER tags
deutsch_dataset['train'][1]

{'id': '1',
 'lang': 'de',
 'words': ['Doch',
  'die',
  'Ruhe',
  'trügt',
  ',',
  'Cold',
  'und',
  'der',
  'Gefangene',
  'werden',
  'erwischt',
  '.'],
 'ner': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}

In this cell, we convert the original words to token ids and the string labels to integers in order to build the training and test datasets.

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and align the word-level NER tags with the sub-word tokens.

    Uses the notebook-level ``tokenizer`` and ``label_encoding_dict``.

    Args:
        examples: A batch with a "words" entry (one list of words per sentence) and a
            parallel "ner" entry (one list of string tags per sentence).
        label_all_tokens: If True (the default, matching the previously hard-coded
            behaviour) every sub-token of a word receives the word's label id. If False,
            only the first sub-token of each word is labelled and the rest get -100.

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding one list
        of label ids per sentence, aligned with the produced tokens.

    Raises:
        KeyError: If a tag is not a key of ``label_encoding_dict``.
    """
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner"]):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # Special tokens have a word id that is None. We set the label to -100 so
                # they are automatically ignored in the loss function.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word, or any sub-token when all tokens are labelled.
                # (The former `== '0'` special case compared against the digit zero while
                # the tags use the letter 'O', so it never matched and has been removed.)
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                # Continuation sub-token that should not contribute to the loss.
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize and label the whole train split; batched=True hands the function batches of examples.
# NOTE(review): all rows are processed here although only 6000 shuffled rows are selected
# afterwards - sub-sampling before mapping would presumably be much faster; confirm before changing.
tokenized_datasets = deutsch_dataset['train'].map(tokenize_and_align_labels, batched=True)

  0%|          | 0/548 [00:00<?, ?ba/s]

Generate the 3 datasets according to the requirement.

In [None]:
# Shuffle once with a fixed seed and cut three disjoint slices out of the same permutation.
# (Previously the identical seeded shuffle was recomputed for every subset.)
shuffled_datasets = tokenized_datasets.shuffle(seed=42)
train_dataset_1000 = shuffled_datasets.select(range(1000))        # rows 0-999: small training set
train_dataset_3000 = shuffled_datasets.select(range(1000, 4000))  # rows 1000-3999: larger training set
eval_dataset_2000 = shuffled_datasets.select(range(4000, 6000))   # rows 4000-5999: evaluation set



## 4. Model training and evaluation

Here is the first model: it is trained on 1000 rows and tested on 2000 rows. The test set is passed to the trainer as the evaluation dataset, so there is no need for a separate prediction step.

In [None]:
# First run: BERT token-classification model with one output class per label id
model1 = AutoModelForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(label_set),
)

# Evaluate after every epoch; the output directory "test-ner" is also used by the other runs
args1 = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    num_train_epochs=2,
    learning_rate=1e-4,
    weight_decay=1e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
)

# Collator used to batch the tokenized examples for the trainer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def compute_metrics(p):
    """Compute token-level macro F1, micro F1 and accuracy for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds the class scores with the
            classes on axis 2; ``labels`` holds the reference label ids, where -100 marks
            positions that must be ignored. (Presumably these are the arrays passed by the
            ``Trainer``; verify against the caller.)

    Returns:
        dict with the keys "f1_macro", "f1_micro" and "accuracy".
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Remove ignored index (special tokens) with a boolean mask instead of a Python loop;
    # this also avoids shadowing the parameter `p` inside the loop.
    keep = labels != -100
    true_predictions = predictions[keep].tolist()
    true_labels = labels[keep].tolist()

    # The averaging mode belongs to compute(), not to load(), so the f1 metric is loaded
    # once and reused for both averages.
    f1_metric = evaluate.load("f1")
    accuracy_metric = evaluate.load("accuracy")

    f1_macro_results = f1_metric.compute(predictions=true_predictions, references=true_labels, average='macro')
    f1_micro_results = f1_metric.compute(predictions=true_predictions, references=true_labels, average='micro')
    accuracy_results = accuracy_metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "f1_macro": f1_macro_results["f1"],
        "f1_micro": f1_micro_results["f1"],
        "accuracy": accuracy_results["accuracy"],
    }
    
# Train on the 1000-row subset and evaluate on the 2000-row subset
trainer1 = Trainer(
    model=model1,
    args=args1,
    data_collator=data_collator,
    train_dataset=train_dataset_1000,
    eval_dataset=eval_dataset_2000,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer1.train()

# Final metrics on the evaluation subset (displayed as the cell output)
trainer1.evaluate()

Downloading:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy
1,No log,0.156678,0.566024,0.934791,0.934791
2,No log,0.165862,0.690491,0.940616,0.940616


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner, lang, words. If id, ner, lang, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 10


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner, lang, words. If id, ner, lang, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 10


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner, lang, words. If id, ner, lang, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 10


{'eval_loss': 0.1658616065979004,
 'eval_f1_macro': 0.6904907851161439,
 'eval_f1_micro': 0.9406158985780366,
 'eval_accuracy': 0.9406158985780366,
 'eval_runtime': 9.0796,
 'eval_samples_per_second': 220.273,
 'eval_steps_per_second': 22.027,
 'epoch': 2.0}

Here is the second model: it is trained on 3000 rows and tested on 2000 rows.

In [None]:
# Second run: same hyper-parameters as the first run, trained on the 3000-row subset
model2 = AutoModelForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(label_set),
)

# The output directory "test-ner" is shared with the other runs in this notebook
args2 = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    num_train_epochs=2,
    learning_rate=1e-4,
    weight_decay=1e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer2 = Trainer(
    model=model2,
    args=args2,
    data_collator=data_collator,
    train_dataset=train_dataset_3000,
    eval_dataset=eval_dataset_2000,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer2.train()

# Final metrics on the evaluation subset (displayed as the cell output)
trainer2.evaluate()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

l

Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy
1,No log,0.151048,0.623893,0.932714,0.932714
2,0.144300,0.162587,0.715436,0.942351,0.942351


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner, lang, words. If id, ner, lang, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 10
Saving model checkpoint to test-ner/checkpoint-500
Configuration saved in test-ner/checkpoint-500/config.json
Model weights saved in test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-ner/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner, lang, words. If id, ner, lang, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
 

{'eval_loss': 0.1625872552394867,
 'eval_f1_macro': 0.7154364398412536,
 'eval_f1_micro': 0.9423505225286962,
 'eval_accuracy': 0.9423505225286962,
 'eval_runtime': 9.9039,
 'eval_samples_per_second': 201.94,
 'eval_steps_per_second': 20.194,
 'epoch': 2.0}

Here is the third model: it is trained on 3000 rows and evaluated on 2000 rows, with the embedding layer frozen.

In [None]:
# Third run: same data and hyper-parameters as the second run, but with frozen embeddings
model3 = AutoModelForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(label_set),
)

# Exclude every parameter of the embedding module from gradient updates
Embeddings = model3.base_model.embeddings
for embedding_weight in Embeddings.parameters():
    embedding_weight.requires_grad = False

# The output directory "test-ner" is shared with the other runs in this notebook
args3 = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    num_train_epochs=2,
    learning_rate=1e-4,
    weight_decay=1e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer3 = Trainer(
    model=model3,
    args=args3,
    data_collator=data_collator,
    train_dataset=train_dataset_3000,
    eval_dataset=eval_dataset_2000,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer3.train()

# Final metrics on the evaluation subset (displayed as the cell output)
trainer3.evaluate()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-german-cased/snapshots/702774c02b32a4f360d5fea60ab034d64bf0141c/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

l

Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy
1,No log,0.153436,0.634949,0.932264,0.932264
2,0.144800,0.158322,0.712426,0.943464,0.943464


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner, lang, words. If id, ner, lang, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 10
Saving model checkpoint to test-ner/checkpoint-500
Configuration saved in test-ner/checkpoint-500/config.json
Model weights saved in test-ner/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-ner/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-ner/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner, lang, words. If id, ner, lang, words are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
 

{'eval_loss': 0.1583220213651657,
 'eval_f1_macro': 0.7124261433055654,
 'eval_f1_micro': 0.9434641082747987,
 'eval_accuracy': 0.9434641082747987,
 'eval_runtime': 9.9951,
 'eval_samples_per_second': 200.098,
 'eval_steps_per_second': 20.01,
 'epoch': 2.0}