In [1]:
#Finding number of classes

categories = set()
with open("train.txt") as file:
    for line in file.readlines():
        categories.add(line.split(" ")[-1].strip())
print(categories)
print(len(categories) - 1)

{'', 'O', 'I-GRP', 'B-CW', 'B-PER', 'B-PROD', 'I-CORP', 'B-LOC', 'I-PER', 'B-GRP', 'B-CORP', 'I-PROD', 'I-CW', 'I-LOC'}
13


In [2]:
#Categorical Data to Numerical 

str_to_int = {
    "O": 0,
    "B-CORP": 1,
    "I-CORP": 2,
    "B-CW": 3,
    "I-CW": 4,
    "B-GRP": 5,
    "I-GRP": 6,
    "B-LOC": 7,
    "I-LOC": 8,
    "B-PER": 9,
    "I-PER": 10,
    "B-PROD": 11,
    "I-PROD": 12,
}

label_list = [
    "O",
    "B-corporation",
    "I-corporation",
    "B-creative-work",
    "I-creative-work",
    "B-group",
    "I-group",
    "B-location",
    "I-location",
    "B-person",
    "I-person",
    "B-product",
    "I-product",
]

In [4]:
!pip install torch transformers 
import torch, pandas as pd
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer

Keyring is skipped due to an exception: 'keyring.backends'
[0m

In [5]:
#Constructing dataset for model to train on 

dataset = []
i = 0
with open("train.txt") as file:
    tokens = []
    tags = []
    for line in file.readlines():
        inp = line.split()
        if len(inp) == 0:
            dataset.append({
                "id": i,
                "tags": tags,
                "tokens": tokens
            })
            i += 1
            tags = []
            tokens = []
        else:
            tokens.append(inp[0])
            tags.append(inp[-1])

In [6]:
#Ensuring subwords and special tokens are correctly labelled 

def tokenize_and_align(entry):
    tokenized_entry = tokenizer(entry["tokens"], truncation = True, is_split_into_words = True)
    labels = entry["tags"]
    word_ids = tokenized_entry.word_ids()
    
    stretched_labels = []
    curr_label_index = 0
    
    prev_w = None
    for w in word_ids:
        if w is None:
            stretched_labels.append(-100)
        elif w != prev_w:
            stretched_labels.append(str_to_int[labels[curr_label_index]])
            curr_label_index += 1
        else:
            stretched_labels.append(-100)
        prev_w = w

    tokenized_entry["labels"] = stretched_labels
    return tokenized_entry

In [7]:
#Mappings between categorical form and numeric form of labels 
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}

In [8]:
model = AutoModelForTokenClassification.from_pretrained('xlm-roberta-large', num_labels = 13, id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-

In [9]:
tokenized_dataset = list(map(tokenize_and_align, dataset))

In [10]:
!pip install evaluate seqeval
import evaluate
seqeval = evaluate.load("seqeval")

Keyring is skipped due to an exception: 'keyring.backends'
[0m

In [11]:
#Helps us pad the input
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [12]:
#Fine-tuning
training_args = TrainingArguments(
    output_dir="fine_tuned_model-11",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train(resume_from_checkpoint = True)

Loading model from fine_tuned_model-11/checkpoint-2500.
***** Running training *****
  Num examples = 15299
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9570
  Number of trainable parameters = 558854157
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 2500
  Will skip the first 2 epochs then the first 586 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/586 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
3000,0.0978
3500,0.0841
4000,0.0755
4500,0.0576
5000,0.0482
5500,0.042
6000,0.034
6500,0.028
7000,0.0237
7500,0.0158


Saving model checkpoint to fine_tuned_model-11/checkpoint-3000
Configuration saved in fine_tuned_model-11/checkpoint-3000/config.json
Model weights saved in fine_tuned_model-11/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in fine_tuned_model-11/checkpoint-3000/tokenizer_config.json
Special tokens file saved in fine_tuned_model-11/checkpoint-3000/special_tokens_map.json
Saving model checkpoint to fine_tuned_model-11/checkpoint-3500
Configuration saved in fine_tuned_model-11/checkpoint-3500/config.json
Model weights saved in fine_tuned_model-11/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in fine_tuned_model-11/checkpoint-3500/tokenizer_config.json
Special tokens file saved in fine_tuned_model-11/checkpoint-3500/special_tokens_map.json
Saving model checkpoint to fine_tuned_model-11/checkpoint-4000
Configuration saved in fine_tuned_model-11/checkpoint-4000/config.json
Model weights saved in fine_tuned_model-11/checkpoint-4000/pytorch_model.bin
tokenizer c

TrainOutput(global_step=9570, training_loss=0.02870148509398274, metrics={'train_runtime': 5412.8288, 'train_samples_per_second': 28.264, 'train_steps_per_second': 1.768, 'total_flos': 1.4069196751437798e+16, 'train_loss': 0.02870148509398274, 'epoch': 10.0})