In [858]:
!nvidia-smi

Sun Jan 26 21:06:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  |   00000000:47:00.0 Off |                    0 |
| N/A   32C    P0             81W /  400W |     543MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                     

In [859]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [860]:
from codealltag_data_processor_v2025 import CodealltagDataProcessor

In [861]:
cdp_2022 = CodealltagDataProcessor(data_version='20220513', config_path=['codealltag_data_processor.yml'])
cdp_2020 = CodealltagDataProcessor(data_version='20200518', config_path=['codealltag_data_processor.yml'])

In [862]:
import argparse
import random
import numpy as np
import torch
import pytorch_lightning as pl
from transformers import (
    MT5ForConditionalGeneration,
    MT5TokenizerFast,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from transformers.optimization import Adafactor, AdafactorSchedule

In [863]:
# Set the seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(cdp_2022.get_random_seed())

In [864]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparam):
        super(T5FineTuner, self).__init__()
        self.hparam = hparam

        self.model = MT5ForConditionalGeneration.from_pretrained(hparam.model_name_or_path)
        self.tokenizer = MT5TokenizerFast.from_pretrained(hparam.model_name_or_path)
        self.save_hyperparameters()
    
    def is_logger(self):
        return True

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )
        loss = outputs.loss
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
        lr_scheduler = AdafactorSchedule(optimizer)
        return [optimizer], [lr_scheduler]

    def train_dataloader(self):
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparam)
        return DataLoader(train_dataset, batch_size=self.hparam.train_batch_size, drop_last=True, shuffle=True, num_workers=2)

    def val_dataloader(self):
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="dev", args=self.hparam)
        return DataLoader(val_dataset, batch_size=self.hparam.eval_batch_size, num_workers=2)

In [865]:
args_dict = dict(
    model_name_or_path='google/mt5-base',
    tokenizer_name_or_path='google/mt5-base',
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=2,
    eval_batch_size=2,
    num_train_epochs=5,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False,
    opt_level='O1',
    max_grad_norm=1,
)

In [866]:
sample_size = 9_000
k = 5

In [867]:
dataset = cdp_2022.get_train_dev_test_datasetdict_for_sample_size(cdp_2020, sample_size, k)

In [868]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'Category', 'FilePath', 'FileSize', 'AnnotationFileExists', 'InputType1', 'InputType2', 'OutputType1', 'OutputType2', '__index_level_0__'],
        num_rows: 5760
    })
    dev: Dataset({
        features: ['ID', 'Category', 'FilePath', 'FileSize', 'AnnotationFileExists', 'InputType1', 'InputType2', 'OutputType1', 'OutputType2', '__index_level_0__'],
        num_rows: 1440
    })
    test: Dataset({
        features: ['ID', 'Category', 'FilePath', 'FileSize', 'AnnotationFileExists', 'InputType1', 'InputType2', 'OutputType1', 'OutputType2', '__index_level_0__'],
        num_rows: 1800
    })
})


In [869]:
dataset['train'][0]

{'ID': 4532,
 'Category': 'EVENTS',
 'FilePath': 'CodEAlltag_pXL_EVENTS/6-/62230.txt',
 'FileSize': 176,
 'AnnotationFileExists': True,
 'InputType1': 'Ich O\nauch O\n. O\nWie O\ngut O\ndas O\ndie O\nnicht O\nwissen O\n, O\ndas O\nich O\nder O\nHaupttäter O\nund O\nDrahtzieher O\nbin O\n. O\nHier O\nmeine O\nAdresse O\n: O\nNiklaus B-MALE\nDünnebacke B-FAMILY\nLöscherstraße B-STREET\n1 B-STREETNO\n25985 B-ZIP\nIbersheim B-CITY\n-- O\nTHE O\nT O\n☢ O\n☢ O\nN O\n',
 'InputType2': 'Ich auch . Wie gut das die nicht wissen , das ich der Haupttäter und Drahtzieher bin . Hier meine Adresse : Niklaus Dünnebacke Löscherstraße 1 25985 Ibersheim -- THE T ☢ ☢ N',
 'OutputType1': 'MALE: Niklaus; FAMILY: Dünnebacke; STREET: Löscherstraße; STREETNO: 1; ZIP: 25985; CITY: Ibersheim',
 'OutputType2': 'MALE: Niklaus **Ignaz**; FAMILY: Dünnebacke **Pötter**; STREET: Löscherstraße **Vogtstraße**; STREETNO: 1 **0**; ZIP: 25985 **83984**; CITY: Ibersheim **Bollendorf**',
 '__index_level_0__': 0}

In [870]:
_ = cdp_2022.read_email(dataset['train'][0]['FilePath'], show=True)

../../data/CodEAlltag_pXL_20220513/CodEAlltag_pXL_EVENTS/6-/62230.txt
---------------------------------------------------------------------

Ich auch. Wie gut das die nicht wissen, das ich der Haupttäter und
Drahtzieher bin. Hier meine Adresse:

Niklaus Dünnebacke
Löscherstraße 1
25985 Ibersheim
-- 
THE T☢☢N



In [871]:
dataset['train'][0]['InputType2']

'Ich auch . Wie gut das die nicht wissen , das ich der Haupttäter und Drahtzieher bin . Hier meine Adresse : Niklaus Dünnebacke Löscherstraße 1 25985 Ibersheim -- THE T ☢ ☢ N'

In [872]:
dataset['train'][0]['OutputType2']

'MALE: Niklaus **Ignaz**; FAMILY: Dünnebacke **Pötter**; STREET: Löscherstraße **Vogtstraße**; STREETNO: 1 **0**; ZIP: 25985 **83984**; CITY: Ibersheim **Bollendorf**'

In [873]:
class CodeAlltagDataset(Dataset):
    
    def __init__(self, tokenizer, dataset, type_path, max_len=512):

        self.data = dataset[type_path]
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.tokenizer.max_length = max_len
        self.tokenizer.model_max_length = max_len
        self.inputs = []
        self.targets = []

        self._build()
  
    def __len__(self):
        return len(self.inputs)
  
    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask    = self.inputs[index]["attention_mask"].squeeze()
        target_mask = self.targets[index]["attention_mask"].squeeze()

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}
  
    def _build(self):
        for idx in range(len(self.data)):
            input_, target = self.data[idx]["InputType2"], self.data[idx]["OutputType2"]
      
            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_],
                max_length=self.max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target],
                max_length=self.max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [874]:
tokenizer = MT5TokenizerFast.from_pretrained(args_dict['tokenizer_name_or_path'])

In [875]:
train_dataset = CodeAlltagDataset(tokenizer=tokenizer, dataset=dataset, type_path='train')

In [876]:
sample_data = train_dataset[0]
print(tokenizer.decode(sample_data["source_ids"], skip_special_tokens=False))
print(tokenizer.decode(sample_data["target_ids"], skip_special_tokens=False))

Ich auch. Wie gut das die nicht wissen, das ich der Haupttäter und Drahtzieher bin. Hier meine Adresse : Niklaus Dünnebacke Löscherstraße 1 25985 Ibersheim -- THE T ☢ ☢ N</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [877]:
args = argparse.Namespace(**args_dict)
model = T5FineTuner(args)

  return torch.load(checkpoint_file, map_location="cpu")


In [878]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filename="{epoch:02d}-{step:05d}-{val_loss:.4f}", 
    monitor="val_loss", 
    mode="min", 
    save_top_k=1
)
class OverrideEpochStepCallback(pl.callbacks.Callback):
    def __init__(self) -> None:
        super().__init__()

    def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

    def on_validation_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        self._log_step_as_current_epoch(trainer, pl_module)

    def _log_step_as_current_epoch(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        pl_module.log("step", trainer.current_epoch)

In [879]:
tensorboard_logger = pl.loggers.TensorBoardLogger('logs/mT5/NER-PG/'+str(sample_size//1000)+'K', name='k' + str(k))

In [880]:
train_params = dict(
    accumulate_grad_batches=args_dict['gradient_accumulation_steps'],
    devices=args_dict['n_gpu'],
    max_epochs=args_dict['num_train_epochs'],
    precision= '16-mixed' if args_dict['fp_16'] else 32,
    gradient_clip_val=args_dict['max_grad_norm'],
    callbacks=[OverrideEpochStepCallback(), checkpoint_callback],
    accelerator='gpu' if args_dict['n_gpu'] > 0 else 'cpu',
    logger=tensorboard_logger
)

In [881]:
def get_dataset(tokenizer, type_path, args):
    tokenizer.max_length = args.max_seq_length
    tokenizer.model_max_length = args.max_seq_length
    dataset = cdp_2022.get_train_dev_test_datasetdict_for_sample_size(cdp_2020, sample_size, k)
    return CodeAlltagDataset(tokenizer=tokenizer, dataset=dataset, type_path=type_path)

In [882]:
trainer = pl.Trainer(**train_params)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [883]:
trainer.fit(model)

Missing logger folder: logs/mT5/NER-PG/9K/k5
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                        | Params | Mode
-------------------------------------------------------------
0 | model | MT5ForConditionalGeneration | 582 M  | eval
-------------------------------------------------------------
582 M     Trainable params
0         Non-trainable params
582 M     Total params
2,329.605 Total estimated model params size (MB)


Sanity Checking: |                                        | 0/? [00:00<?, ?it/s]

Training: |                                               | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

Validation: |                                             | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [884]:
model_dir = f"logs/mT5/NER-PG/{sample_size//1000}K/k{k}/version_0/checkpoints/"
ckpt_name = next(iter(os.listdir(model_dir)), None)
model_path = os.path.join(model_dir, ckpt_name); model_path

'logs/mT5/NER-PG/9K/k5/version_0/checkpoints/epoch=03-step=00003-val_loss=1.3842.ckpt'

In [885]:
model = T5FineTuner.load_from_checkpoint(model_path)

/home/s81481/pseugc/lib/python3.9/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(checkpoint_file, map_location="cp

In [886]:
test_dataset = CodeAlltagDataset(tokenizer=tokenizer, dataset=dataset, type_path='test')
dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

eval_dataiter = iter(dataloader)

model.model.eval()
model = model.to("cuda")

labels = ['CITY', 'DATE', 'EMAIL', 'FAMILY', 'FEMALE', 'MALE', 'ORG', 
          'PHONE', 'STREET', 'STREETNO', 'UFID', 'URL', 'USER', 'ZIP']

true_labels = []
pred_labels = []

with tqdm(total=len(dataloader), position=0, leave=True) as progress_bar:
    for index in range(0, len(dataloader)):
        batch = next(eval_dataiter)
        input_ids = batch['source_ids'].to('cuda')
        attention_mask = batch['source_mask'].to("cuda")
        
        outs = model.model.generate(input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    max_length=512,
                                    temperature=0.8,
                                    do_sample=True,
                                    top_k=100)
        dec = [
            tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
            for ids in outs
        ]
        
        true_labeled_text = dataset["test"][index]["InputType1"]
        
        email_text = cdp_2022.read_email(dataset["test"][index]["FilePath"])[1]
        predicted_annotation_df = cdp_2022.get_annotation_df_with_input_text_and_predicted_text(email_text,
                                                                                                dec[0],
                                                                                                labels)
        pred_labeled_text = cdp_2022.tokenize_with_somajo_and_annotation_df(email_text, predicted_annotation_df)
        
        true_list = cdp_2022.get_token_label_tuples(true_labeled_text)
        pred_list = cdp_2022.get_token_label_tuples(pred_labeled_text)
        
        true_label = [item[1] for item in true_list]
        pred_label = cdp_2022.align_tags(true_list, pred_list)
        
        true_labels.append(true_label)
        pred_labels.append(pred_label)
        
        progress_bar.update(1)

100%|███████████████████████████████████████| 1800/1800 [50:43<00:00,  1.69s/it]


In [887]:
import evaluate
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

seqeval = evaluate.load("seqeval")

report = classification_report(
    y_true=true_labels,
    y_pred=pred_labels,
    mode="strict",
    scheme=IOB2,
    zero_division=0,        
    output_dict=False,
    digits=4
)
print(report)


              precision    recall  f1-score   support

        CITY     0.9658    0.7635    0.8528       554
        DATE     0.9806    0.8306    0.8994       425
       EMAIL     0.9863    0.9220    0.9531       705
      FAMILY     0.9824    0.9001    0.9394      1611
      FEMALE     0.8611    0.8532    0.8571       218
        MALE     0.9853    0.9410    0.9627      2425
         ORG     0.3564    0.5600    0.4356       175
       PHONE     0.9477    0.8164    0.8772       621
      STREET     0.9538    0.8664    0.9080       262
    STREETNO     0.9916    0.9480    0.9693       250
        UFID     0.7640    0.7047    0.7332       193
         URL     0.9800    0.8804    0.9275       945
        USER     0.5357    0.6522    0.5882        46
         ZIP     0.9959    0.9202    0.9565       263

   micro avg     0.9474    0.8804    0.9126      8693
   macro avg     0.8776    0.8256    0.8471      8693
weighted avg     0.9566    0.8804    0.9156      8693



In [888]:
report = classification_report(
    y_true=true_labels,
    y_pred=pred_labels,
    mode="strict",
    scheme=IOB2,
    zero_division=0,        
    output_dict=True,
    digits=4
)
print(report)

{'CITY': {'precision': 0.9657534246575342, 'recall': 0.7635379061371841, 'f1-score': 0.8528225806451614, 'support': 554}, 'DATE': {'precision': 0.9805555555555555, 'recall': 0.8305882352941176, 'f1-score': 0.8993630573248407, 'support': 425}, 'EMAIL': {'precision': 0.9863429438543247, 'recall': 0.9219858156028369, 'f1-score': 0.9530791788856305, 'support': 705}, 'FAMILY': {'precision': 0.9823848238482384, 'recall': 0.9000620732464308, 'f1-score': 0.9394233884029803, 'support': 1611}, 'FEMALE': {'precision': 0.8611111111111112, 'recall': 0.8532110091743119, 'f1-score': 0.8571428571428571, 'support': 218}, 'MALE': {'precision': 0.9853195164075993, 'recall': 0.9410309278350516, 'f1-score': 0.9626661041974267, 'support': 2425}, 'ORG': {'precision': 0.3563636363636364, 'recall': 0.56, 'f1-score': 0.43555555555555553, 'support': 175}, 'PHONE': {'precision': 0.9476635514018692, 'recall': 0.8164251207729468, 'f1-score': 0.8771626297577856, 'support': 621}, 'STREET': {'precision': 0.95378151260

In [889]:
model = model.to("cpu")
del model
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
import gc
gc.collect()

844