In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
# import pytorch_lightning as pl


from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5TokenizerFast,
    AutoTokenizer,
    T5ForTokenClassification,
    get_linear_schedule_with_warmup
)

def set_seed(seed):
  """Seed every random number generator this notebook relies on.

  Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
  generator; when CUDA is available, all CUDA devices are seeded too.

  Args:
    seed: Seed value handed unchanged to each library.
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)


# Fix the seed once, before any model or data work, so runs are repeatable.
set_seed(42)

# BIO tag set mapped to integer class ids. The insertion order matters: it is
# the id order, and `list(label2id.keys())` is used later as the id -> tag table.
# NOTE(review): presumably matches the `ner_tags` ids of the conll2003 dataset - verify.
label2id = {
    'O': 0,
    'B-PER': 1, 'I-PER': 2,
    'B-ORG': 3, 'I-ORG': 4,
    'B-LOC': 5, 'I-LOC': 6,
    'B-MISC': 7, 'I-MISC': 8,
}
# Inverse mapping: class id -> tag string.
id2label = {class_id: tag for tag, class_id in label2id.items()}

# Short entity codes and their human-readable names.
short2long = {"PER": "Person", "ORG": "Organization", "LOC": "Location", "MISC": "Miscellaneous"}
labels_short = ["PER", "ORG", "LOC", "MISC"]
labels = ["Person", "Organization", "Location", "Miscellaneous"]

# Base checkpoint to fine-tune and the directory its checkpoints are written to.
model_name = "t5-small"
model_checkpoint_path = f"checkpoints/{model_name}-token-clf-conll"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abuboba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
from datasets import load_dataset

# Alternative data source, kept for reference only: train / test / dev splits
# read from local JSON files under data\StackOverflow\json.
# dataset = load_dataset('json', data_files=os.path.join('data\StackOverflow\json', 'data_train.json'))
# dataset["test"] = load_dataset('json', data_files=os.path.join('data\StackOverflow\json', 'data_test.json'))["train"]
# dataset["validation"] = load_dataset('json', data_files=os.path.join('data\StackOverflow\json', 'data_dev.json'))["train"]

# Active data source: the "conll2003" dataset (train / validation / test splits).
dataset = load_dataset("conll2003")

In [4]:
# Tokenizer for the `model_name` checkpoint; its `word_ids()` mapping is used
# later to align word-level tags with sub-word tokens.
tokenizer = T5TokenizerFast.from_pretrained(model_name)

In [5]:
# Sanity check: tokenize the first training sentence (already split into words)
# and display the sub-word pieces. One word can become several tokens, and an
# end-of-sequence token is appended.
example = dataset["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['▁EU',
 '▁reject',
 's',
 '▁German',
 '▁call',
 '▁to',
 '▁boycott',
 '▁British',
 '▁lamb',
 '▁',
 '.',
 '</s>']

In [6]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level NER tags to tokens.

    Only the first sub-word token of each word receives that word's tag; any
    further pieces of the same word, and tokens that belong to no word
    (special tokens), are labelled -100.

    Args:
        examples: Batch with a "tokens" entry (one list of words per sentence)
            and a "ner_tags" entry (one list of tag ids per sentence, indexed
            by word position).

    Returns:
        The tokenizer output for the batch with an added "labels" entry
        holding one list of per-token label ids per sentence.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(batch_idx, word_tags):
        # Build the per-token label list for one sentence of the batch.
        aligned = []
        prev_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            starts_new_word = word is not None and word != prev_word
            aligned.append(word_tags[word] if starts_new_word else -100)
            prev_word = word
        return aligned

    encoded["labels"] = [
        _align(batch_idx, word_tags)
        for batch_idx, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [7]:
# Tokenize and label-align every split; batched=True hands the function a
# batch of examples per call, which is what it indexes into.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [8]:
# Display the splits to confirm that `input_ids`, `attention_mask` and `labels`
# were added alongside the original columns.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [9]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, handed to the Trainer later on.
# NOTE(review): expected to pad inputs and labels per batch - see the transformers docs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [10]:
import numpy as np
import evaluate

# seqeval metric: used by compute_metrics to score predicted tag sequences.
seqeval = evaluate.load("seqeval")
# Tag strings ordered by class id, so label_list[i] is the tag for id i.
label_list = list(label2id.keys())
# NOTE(review): this rebinds the global `labels` (earlier: the long entity
# names) to the tag strings of `example`, and relies on `example` from an
# earlier cell. It does not appear to be read again in the visible cells.
labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p, full=False):
    """Score token-classification predictions with seqeval.

    Args:
        p: Pair ``(predictions, labels)``. When ``full`` is falsy,
            ``predictions`` holds per-token class scores and is reduced with
            ``argmax`` over axis 2. When ``full`` is truthy, ``predictions``
            is taken to hold class ids already. ``labels`` holds the gold
            class ids, with -100 marking positions to ignore.
        full: Falsy (default) returns only the overall metrics; truthy skips
            the argmax and returns the complete seqeval result.

    Returns:
        With ``full`` truthy, the dict returned by ``seqeval.compute``.
        Otherwise a dict with the keys "precision", "recall", "f1" and
        "accuracy", taken from the overall seqeval scores.
    """
    predictions, gold = p

    # Plain truthiness here, matching the `if full:` test below. The previous
    # `full is False` check disagreed with it for falsy non-bool values such
    # as 0 or None: argmax was skipped, yet the summary branch was taken.
    if not full:
        predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predictions, gold):
        # Keep only positions that carry a real label (-100 means "ignore").
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    if full:
        return results
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [11]:
from transformers import TrainingArguments, Trainer

# Load the base checkpoint with a token-classification head sized to the tag
# set and place it on the GPU. The classifier weights are newly initialised
# (see the warning this cell prints), so the model must be fine-tuned.
model = T5ForTokenClassification.from_pretrained(
    model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id, device_map='cuda'
)
# NOTE(review): presumably stops the Trainer from treating the model as
# model-parallel after loading with device_map - confirm.
model.model_parallel = False

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at t5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Fine-tuning configuration: 10 epochs, batch size 8 for both training and
# evaluation, with evaluation and checkpointing once per epoch.
training_args = TrainingArguments(
    output_dir=model_checkpoint_path,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # cap on the number of checkpoints kept in output_dir
)

# Train on the train split, evaluate on the validation split, and report the
# precision / recall / F1 / accuracy produced by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the per-epoch metrics table is this cell's output.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2337,0.144859,0.754533,0.805453,0.779162,0.967057
2,0.1334,0.107219,0.824867,0.860821,0.842461,0.976267
3,0.1055,0.087294,0.858398,0.88758,0.872745,0.980472
4,0.0851,0.079472,0.87111,0.899697,0.885173,0.982594
5,0.0785,0.071966,0.880026,0.906092,0.892869,0.983996
6,0.0721,0.068524,0.890745,0.91518,0.902797,0.985028
7,0.0645,0.065772,0.892711,0.9172,0.90479,0.985398
8,0.061,0.065448,0.896134,0.920565,0.908185,0.985923
9,0.0561,0.065118,0.896873,0.92208,0.909302,0.986118
10,0.0574,0.064277,0.898116,0.922753,0.910268,0.986118


TrainOutput(global_step=17560, training_loss=0.12647301718966023, metrics={'train_runtime': 516.8308, 'train_samples_per_second': 271.675, 'train_steps_per_second': 33.976, 'total_flos': 766201144642806.0, 'train_loss': 0.12647301718966023, 'epoch': 10.0})

In [13]:
# Run the fine-tuned model over the test split, one sentence at a time, and
# collect the predicted class id of every token.
pred = []

# Alternative: evaluate a previously saved checkpoint instead of the
# in-memory model.
# model = T5ForTokenClassification.from_pretrained(
#     "t5-token-clf-low", num_labels=len(label2id), id2label=id2label, label2id=label2id, device_map='balanced'
# ).to("cuda")

# Switch to evaluation mode so dropout cannot make predictions random.
model.eval()

# Inference only: no_grad avoids building the autograd graph for each
# forward pass, which saves memory and time.
with torch.no_grad():
    for item in tokenized_dataset["test"]:
        # Same tokenizer settings as tokenize_and_align_labels, so the token
        # positions line up with item["labels"]. Inputs go to whichever device
        # the model lives on rather than a hard-coded "cuda".
        encoded = tokenizer(
            item["tokens"], truncation=True, padding=True, is_split_into_words=True, return_tensors="pt"
        ).to(model.device)
        token_class_ids = model(**encoded).logits.argmax(dim=-1)
        # One sentence per call, so this appends a single list of int ids.
        pred.extend(token_class_ids.cpu().tolist())


In [14]:
# full=True: `pred` already holds class ids (no argmax needed), and the
# complete seqeval result is returned instead of the four-key summary.
dct = compute_metrics((pred, tokenized_dataset["test"]["labels"]), True)

In [15]:
# Print each entry of the test-set report as a tab-separated line.
for metric_name, metric_value in dct.items():
    print(metric_name, "---", metric_value, sep="\t")

LOC	---	{'precision': 0.8914454277286136, 'recall': 0.9058752997601919, 'f1': 0.8986024382991377, 'number': 1668}
MISC	---	{'precision': 0.7214854111405835, 'recall': 0.7749287749287749, 'f1': 0.7472527472527473, 'number': 702}
ORG	---	{'precision': 0.8185507246376812, 'recall': 0.8500903070439494, 'f1': 0.8340224453632604, 'number': 1661}
PER	---	{'precision': 0.9149315883402737, 'recall': 0.9511440940012369, 'f1': 0.9326864766525167, 'number': 1617}
overall_precision	---	0.8548249359521777
overall_recall	---	0.8861543909348442
overall_f1	---	0.8702077718855952
overall_accuracy	---	0.9765909335630452


In [16]:
# Save the Trainer's log history to a text file, using its string form.
with open("t5-small-conll.txt", "w") as log_file:
    log_file.write(str(trainer.state.log_history))