In [14]:
#downloading bert-base-spanish-wwm-cased
!pip install transformers &> /dev/null
!pip install seqeval &> /dev/null
!pip install datasets &> /dev/null
!pip install transformers sklearn evaluate &> /dev/null
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/pytorch_weights.tar.gz &> /dev/null
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/vocab.txt &> /dev/null
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/config.json &> /dev/null
!tar -xzvf pytorch_weights.tar.gz
!mv config.json pytorch/. 
!mv vocab.txt pytorch/.

pytorch/
pytorch/pytorch_model.bin


In [15]:
import torch
from transformers import BertForTokenClassification, Trainer, TrainingArguments, AutoTokenizer
import pandas as pd
from datasets import Dataset, load_dataset, load_metric
from sklearn.metrics import f1_score
import numpy as np

#importing spanish dataset (splitted)
train_set_1000 = load_dataset('polyglot_ner',"es", split="train[:1000]")
train_set_3000 = load_dataset('polyglot_ner',"es", split="train[2000:5000]")
test_set = load_dataset('polyglot_ner',"es", split="train[5000:7000]")



In [None]:
train_set_1000

Dataset({
    features: ['id', 'lang', 'words', 'ner'],
    num_rows: 1000
})

In [None]:
datasets = {train_set_1000: "train_set_1000",train_set_3000: "train_set_3000", test_set: "test_set"}

for key in datasets:
  print("----------",datasets[key])
  # number of sentences in the spanish dataset
  print("# of sentences in Spanish train dataset", len(key))

  #checking nans
  df = pd.DataFrame(key).iloc[:,:].sample(frac=1)
  print("# of nan values in dataset: ", df.isna().sum().sum())

  # getting unique ner values
  u = set()
  for r in key['ner']:
    for i in r:
      u.add(i)
  print("unique 'ner' values:", u)
  print('')


---------- train_set_1000
# of sentences in Spanish train dataset 1000
# of nan values in dataset:  0
unique 'ner' values: {'PER', 'LOC', 'O', 'ORG'}

---------- train_set_3000
# of sentences in Spanish train dataset 3000
# of nan values in dataset:  0
unique 'ner' values: {'PER', 'ORG', 'LOC', 'O'}

---------- test_set
# of sentences in Spanish train dataset 2000
# of nan values in dataset:  0
unique 'ner' values: {'ORG', 'LOC', 'PER', 'O'}



In [17]:
# create the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
model = BertForTokenClassification.from_pretrained("pytorch/", num_labels = 4)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--dccuchile--bert-base-spanish-wwm-cased/snapshots/56a7647b957a4230fc3f80dafbe80f2ba9b0de73/config.json
Model config BertConfig {
  "_name_or_path": "dccuchile/bert-base-spanish-wwm-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31002
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--dccuchile--bert-base-spanish-w

In [None]:
# encoding using bert-base-case tokenizer / splits word into subtokens

class Token:
  def __init__(self,dataset):
      self.__dataset__ = dataset

  def lab_encoding(self,ner):
      if ner == 'PER':
        ner = 0
      elif ner == 'LOC':
        ner = 1
      elif ner == 'ORG':
        ner = 2
      else: 
        ner = 3
      return ner

  # encoding ner values to integers
  def encode_labels(self):
      self.__dataset__  = pd.DataFrame(self.__dataset__)
      encoded_ner = []
      for r in self.__dataset__['ner']:
        row_labels = []
        for i in r:
          label = self.lab_encoding(i)
          row_labels.append(label)
        encoded_ner.append(row_labels)
      self.__dataset__.insert(4, "encoded_ner", encoded_ner, True)
      # converting pandas df back to dataset object
      return Dataset.from_pandas(self.__dataset__)

  def tokenize_and_align_labels(self,data_row):
      tokenized_inputs = tokenizer(data_row["words"], truncation=True,
                                   is_split_into_words=True,
                                   padding='max_length', 
                                   max_length=128)

      labels = []
      for i, label in enumerate(data_row[f"encoded_ner"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)    
      tokenized_inputs["labels"] = labels
      return tokenized_inputs

  def tokenize_forward(self):
      self.__dataset__ = self.encode_labels()
      self.encoded_dataset = self.__dataset__.map(self.tokenize_and_align_labels,batched=True)
      self.encoded_dataset = self.encoded_dataset.remove_columns(["id","lang", "words", "ner", "encoded_ner"])
      self.encoded_dataset.set_format("torch", columns=["input_ids", "token_type_ids","attention_mask", "labels"])
      return self.encoded_dataset


In [16]:
encoded_1000_train = Token(train_set_1000).tokenize_forward()
encoded_3000_train = Token(train_set_3000).tokenize_forward()
encoded_test = Token(test_set).tokenize_forward()

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# trainer
def ner_trainer(train_set,test_set,epochs,train_batch_size, eval_batch_size, freeze_embedings):

  training_args = TrainingArguments(
        num_train_epochs = epochs,
        per_device_train_batch_size = train_batch_size,
        per_device_eval_batch_size = eval_batch_size,
        output_dir='results',
        logging_dir='logs',
        no_cuda=False,  # defaults to false anyway, just to be explicit
      )

  if freeze_embedings == False:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset= train_set,
        eval_dataset = test_set,
      )
    
    trainer.train()
    preds = trainer.predict(test_set)

    return preds

  else:

    # frozing embedding layer setting para.requires_grad to False  
    for name, param in model.named_parameters():
      if 'classifier' not in name: # classifier layer
        param.requires_grad = False
    
    trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset= train_set,
      eval_dataset = test_set,
    )
  
    trainer.train()
    preds = trainer.predict(test_set)

    return preds




In [None]:
# getting true labels and predictions

from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

# from sklearn.metrics import classification_report

label_names = ['B-PER','B-LOC','B-ORG','B-O']

def get_labels(predictions, labels):
  
    predictions=predictions.argmax(-1)
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, predictions


# Training and testing dataset of 1000 setences

In [None]:
preds = ner_trainer(encoded_1000_train,encoded_test,10, 4, 4, False)   

true_labels, predictions = get_labels(preds.predictions, preds.label_ids)


print(classification_report(true_labels, predictions))
print('f1 micro average: ', f1_score(true_labels, predictions, average='micro'))
print('f1 macro average: ', f1_score(true_labels, predictions, average='macro'))

***** Running training *****
  Num examples = 1000
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2500
  Number of trainable parameters = 109263364


Step,Training Loss
500,0.0927
1000,0.0224
1500,0.0044
2000,0.0009
2500,0.0002


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json
Model weights saved in results/checkpoint-2500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examp

              precision    recall  f1-score   support

         LOC       0.72      0.65      0.68      1091
           O       0.98      0.99      0.98     49687
         ORG       0.77      0.59      0.67       755
         PER       0.61      0.56      0.58       773

   micro avg       0.97      0.97      0.97     52306
   macro avg       0.77      0.69      0.73     52306
weighted avg       0.97      0.97      0.97     52306

f1 micro average:  0.9680533781975299
f1 macro average:  0.7288874617762789



# Training and testing dataset of 3000 setences



In [None]:
preds = ner_trainer(encoded_3000_train,encoded_test,10, 4, 4, False)   

true_labels, predictions = get_labels(preds.predictions, preds.label_ids)


print(classification_report(true_labels, predictions))
print('f1 micro average: ', f1_score(true_labels, predictions, average='micro'))
print('f1 macro average: ', f1_score(true_labels, predictions, average='macro'))

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 3000
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 7500
  Number of trainable parameters = 109263364


Step,Training Loss
500,0.093
1000,0.0652
1500,0.0558
2000,0.0273
2500,0.0231
3000,0.0172
3500,0.0092
4000,0.0078
4500,0.0034
5000,0.0027


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json
Model weights saved in results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json
Model weights save

              precision    recall  f1-score   support

         LOC       0.72      0.76      0.74      1091
           O       0.98      0.99      0.99     49687
         ORG       0.80      0.66      0.72       755
         PER       0.64      0.60      0.62       773

   micro avg       0.97      0.97      0.97     52306
   macro avg       0.78      0.75      0.77     52306
weighted avg       0.97      0.97      0.97     52306

f1 micro average:  0.9713417198791726
f1 macro average:  0.7667602916739646


# Training and testing dataset of 3000 setences with frozen embedings


In [12]:
preds = ner_trainer(encoded_3000_train,encoded_test,10, 4, 4, True)   

true_labels, predictions = get_labels(preds.predictions, preds.label_ids)


print(classification_report(true_labels, predictions))
print('f1 micro average: ', f1_score(true_labels, predictions, average='micro'))
print('f1 macro average: ', f1_score(true_labels, predictions, average='macro'))

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 3000
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 7500
  Number of trainable parameters = 3076


Step,Training Loss
500,0.0
1000,0.0


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin


Step,Training Loss
500,0.0
1000,0.0
1500,0.0
2000,0.0001
2500,0.0003
3000,0.0002
3500,0.0
4000,0.0
4500,0.0001
5000,0.0001


Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json
Model weights saved in results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json
Model weights saved in results/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-3500
Configuration saved in results/checkpoint-3500/config.json
Model weights saved in results/checkpoint-3500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-4000
Configuration saved in results/checkpoint-4000/config.json
Model weights s

              precision    recall  f1-score   support

         LOC       0.72      0.76      0.74      1091
           O       0.98      0.99      0.99     49687
         ORG       0.79      0.66      0.72       755
         PER       0.64      0.61      0.63       773

   micro avg       0.97      0.97      0.97     52306
   macro avg       0.78      0.75      0.77     52306
weighted avg       0.97      0.97      0.97     52306

f1 micro average:  0.9712843650823997
f1 macro average:  0.7673558297119016


In [13]:
#removing logs folder due to memory error
torch.cuda.empty_cache()

torch.cuda.memory_summary(device=None, abbreviated=False)

