In [1]:
import os
import shutil
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification, TrainingArguments, Trainer, pipeline
from sklearn.metrics import classification_report
from datasets import Dataset, DatasetDict
import wandb
# from wandb.integration.sb3 import WandbCallback

# Add evaluate import
import evaluate
# Load the seqeval metric once for the whole notebook; compute_metrics() reads this
# module-level object to obtain the overall precision / recall / F1 / accuracy.
metric = evaluate.load("seqeval")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook configuration.
# The Hugging Face token is read from the HF_TOKEN environment variable so that no
# credential is ever stored in the notebook; it is None when the variable is unset.
# NOTE(review): a literal token used to be committed on this line - it should be
# revoked in the Hugging Face account settings.
hf_token = os.environ.get('HF_TOKEN')
data_checkpoint = '../data/IOB/'  # directory holding train_IOB.tsv / dev_IOB.tsv
model_checkpoint = 'bioformers/bioformer-16L'  # pretrained checkpoint to fine-tune
model_save_path = '../models/'  # Trainer output directory (checkpoints are written here)


In [3]:
def cleanup_checkpoints(output_dir, keep_last=True, best_model_dir=None, last_model_dir=None):
    """
    Deletes unnecessary model checkpoints created during training.
    Keeps the best model directory and optionally the last model directory.

    Only sub-directories of ``output_dir`` whose name starts with ``"checkpoint"``
    are candidates for deletion; every other entry is left untouched.

    Paths are compared after normalisation (absolute path, no trailing separator,
    ``.``/``..`` collapsed), so a protected directory is recognised even when it is
    spelled differently from ``os.path.join(output_dir, name)``.

    :param output_dir: Base directory where the checkpoints are saved.
    :param keep_last: Whether to keep the last checkpoint.
    :param best_model_dir: Directory of the best model checkpoint (None: nothing to protect).
    :param last_model_dir: Directory of the last model checkpoint (None: nothing to protect).
    :raises OSError: if ``output_dir`` cannot be listed or a checkpoint cannot be removed.
    """
    def _canonical(path):
        # None / empty means "no directory given"; never let it match a real path.
        if not path:
            return None
        return os.path.normcase(os.path.abspath(path))

    # Collect every directory that must survive the cleanup.
    protected = {_canonical(best_model_dir)}
    if keep_last:
        protected.add(_canonical(last_model_dir))
    protected.discard(None)

    for item in os.listdir(output_dir):
        item_path = os.path.join(output_dir, item)
        if not (os.path.isdir(item_path) and item.startswith("checkpoint")):
            continue
        if _canonical(item_path) not in protected:
            shutil.rmtree(item_path)



In [4]:
def convert_IOB_transformer(test_list, pattern):
    """
    Split a flat sequence into consecutive groups, using ``pattern`` as separator.

    Every element equal to ``pattern`` closes the current group (the separator itself
    is not kept). Two separators in a row therefore produce an empty group. Elements
    that follow the last separator are returned as a final group, so input that does
    not end with a separator does not lose its last group.

    :param test_list: Flat iterable of items (e.g. tokens or tag ids).
    :param pattern: Value that marks a group boundary.
    :return: List of lists, one per group, in input order.
    """
    new_list = []
    sub_list = []
    for item in test_list:
        if item != pattern:
            sub_list.append(item)
        else:
            new_list.append(sub_list)
            sub_list = []

    # Flush the trailing group when the input is not terminated by a separator.
    if sub_list:
        new_list.append(sub_list)

    return new_list


In [5]:
def get_token_ner_tags(df_, split_name, label2id_):
    """
    Turn a flat token/tag table into per-sentence token and tag-id lists.

    :param df_: DataFrame with a 'tokens' and a 'ner_tags' column, one token per row;
        rows whose token is the empty string act as sentence separators.
    :param split_name: Name of the split; currently not used by this function.
    :param label2id_: Mapping from tag string to integer id.
    :return: ``(token_list, ner_tag_list, df)`` - the sentences as lists of tokens,
        the matching lists of tag ids, and a DataFrame pairing them row by row.
    """
    boundary_marker = '#*#*#*#*#*#*#*#*'

    # Tags that are missing from label2id_ (such as the empty tag of a separator row)
    # map to NaN and are replaced by the marker, which then serves as the split point.
    # NOTE(review): a real token carrying an unknown tag would also become a boundary
    # in the tag stream only - confirm every split uses tags present in label2id_.
    mapped_tags = df_['ner_tags'].map(label2id_)
    flat_tags = mapped_tags.fillna(boundary_marker).tolist()
    flat_tokens = df_['tokens'].tolist()

    token_list = convert_IOB_transformer(test_list=flat_tokens, pattern='')
    ner_tag_list = convert_IOB_transformer(test_list=flat_tags, pattern=boundary_marker)

    sentence_rows = list(zip(token_list, ner_tag_list))
    df = pd.DataFrame(sentence_rows, columns=['tokens', 'ner_tags'])

    return token_list, ner_tag_list, df


In [6]:
def compute_metrics(p, id2label):
    """
    Score a batch of token-classification predictions and print a per-label report.

    Parameters:
    p (tuple): ``(predictions, labels)``; predictions are reduced with argmax over
        axis 2, labels hold label ids with -100 marking positions to ignore.
    id2label (dict): A dictionary mapping label IDs to label names.

    Returns:
    dict: A dictionary with precision, recall, f1, and accuracy metrics, taken from
        the ``overall_*`` entries of the module-level ``metric``.
    """
    raw_scores, gold_batch = p
    predicted_batch = np.argmax(raw_scores, axis=2)

    # Decode ids to label names, skipping every position whose gold label is -100.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_batch, gold_batch):
        kept_pairs = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([id2label[pred] for pred, _ in kept_pairs])
        true_labels.append([id2label[gold] for _, gold in kept_pairs])

    # classification_report works on flat token-level lists.
    flat_predictions = []
    flat_labels = []
    for predicted_names, gold_names in zip(true_predictions, true_labels):
        flat_predictions.extend(predicted_names)
        flat_labels.extend(gold_names)

    report = classification_report(flat_labels, flat_predictions, digits=4, zero_division=1)

    print("\nClassification Report:\n")
    print(report)

    # Sequence-level scores come from the shared metric object.
    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }


In [7]:
def metric_function(p):
    """Single-argument adapter: forwards ``p`` to compute_metrics with the module-level id2label."""
    return compute_metrics(p, id2label=id2label)

def tokenize_and_align_labels(examples, device):
    """
    Tokenize pre-split sentences and expand word-level tags to sub-token labels.

    Uses the module-level ``tokenizer``. Every sequence is truncated/padded to 512
    positions, so all label rows have the same length.

    :param examples: Batch with a "tokens" entry (list of word lists) and a
        "ner_tags" entry (list of per-word tag ids).
    :param device: Device the label tensor is moved to.
    :return: The tokenizer output with an added "labels" int64 tensor.
    """
    task = "ner"
    label_all_tokens = True

    encoded = tokenizer(
        examples["tokens"],
        max_length=512,
        truncation=True,
        padding="max_length",
        is_split_into_words=True,
    )

    aligned_rows = []
    for row_idx, word_labels in enumerate(examples[f"{task}_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row_idx):
            if word is None:
                # Special/padding positions: -100 so the loss ignores them.
                row_labels.append(-100)
            elif word == last_word and not label_all_tokens:
                # Continuation piece of a word, masked only when label_all_tokens is off.
                row_labels.append(-100)
            else:
                # First piece of a word, or any piece when label_all_tokens is on.
                row_labels.append(word_labels[word])
            last_word = word
        aligned_rows.append(row_labels)

    # Cast to int64 and move the labels to the requested device.
    encoded["labels"] = torch.tensor(aligned_rows).to(dtype=torch.int64).to(device)
    return encoded



In [8]:
# Read the train and dev IOB files (tab-separated, no header row).
# Blank lines are kept (skip_blank_lines=False) and empty fields stay '' instead of
# NaN (na_filter=False), because empty token rows are used later as sentence separators.
train, dev = (
    pd.read_csv(
        os.path.join(data_checkpoint, file_name),
        sep='\t',
        names=['tokens', 'ner_tags'],
        skip_blank_lines=False,
        na_filter=False,
    )
    for file_name in ('train_IOB.tsv', 'dev_IOB.tsv')
)

In [9]:
# Build the label vocabulary from the training tags, in order of first appearance.
label_list_ = train['ner_tags'].dropna().unique().tolist()
# Drop the empty tag that belongs to the sentence-separator rows.
label_list = list(filter(None, label_list_))
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

print(id2label)

{0: 'O', 1: 'B-BT', 2: 'I-BT'}


In [10]:
# Group the flat token/tag rows of each split into sentences.
train_tokens, train_tags, train_df = get_token_ner_tags(train, 'train', label2id)
dev_tokens, dev_tags, dev_df = get_token_ner_tags(dev, 'dev', label2id)
# Show the first training sentence as a sanity check.
train_tokens[0]

['Glyco',
 '@',
 'Expasy',
 '[',
 '26',
 ']',
 'uses',
 'a',
 'combination',
 'of',
 'text',
 'mining',
 'tools',
 'and',
 'manual',
 'opportunistic',
 'selection',
 'to',
 'identify',
 'sources',
 '.']

In [11]:
# Wrap the per-sentence frames as datasets and bundle them under the split names
# used in the rest of the notebook.
trds = Dataset.from_pandas(train_df)
vds = Dataset.from_pandas(dev_df)

ds = DatasetDict({'train': trds, 'validation': vds})


In [12]:
# Model initialization
# Both dropout rates are raised to 0.2 because the model overfits our small dataset.
hidden_droput_prob = attention_probs_dropout_prob = 0.2

config = BertConfig.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    attn_implementation="sdpa",
)
config.hidden_dropout_prob = hidden_droput_prob
config.attention_probs_dropout_prob = attention_probs_dropout_prob

# The checkpoint's encoder weights are loaded; the token-classification head is new.
model = BertForTokenClassification.from_pretrained(model_checkpoint, config=config)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bioformers/bioformer-16L and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Pick the GPU when one is available, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    model.to(device)

In [14]:
# Load the tokenizer.
# `do_lower_case` is the BertTokenizerFast argument that controls lower-casing; the
# previously used `lowercase` keyword is not a parameter of this class and had no effect.
tokenizer = BertTokenizerFast.from_pretrained(model_checkpoint, strip_accents=True, do_lower_case=False)

# Apply the tokenize_and_align_labels function to the datasets
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, device), batched=True)

# Peek at the first 10 values of every column of one example per split.
print({k: v[:10] for k, v in tokenized_datasets['train'][4].items()})
print({k: v[:10] for k, v in tokenized_datasets['validation'][4].items()})

Map: 100%|██████████| 2244/2244 [00:00<00:00, 3605.16 examples/s]
Map: 100%|██████████| 560/560 [00:00<00:00, 4415.19 examples/s]

{'tokens': ['CDBProm', "'", 's', 'predictor', 'identified', '24', '313', '419', 'promoter', 'sequences'], 'ner_tags': [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'input_ids': [101, 2247, 3721, 1523, 112, 188, 6983, 2326, 2524, 28830], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 1, 1, 1, 0, 0, 0, 0, 0, 0]}
{'tokens': ['Different', 'tools', 'have', 'been', 'introduced', 'for', 'this', 'task', ',', 'including'], 'ner_tags': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'input_ids': [101, 10984, 5457, 1641, 1723, 6035, 1471, 1603, 3603, 117], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0]}





In [None]:
batch_size = 4

# bf16 mixed precision needs hardware support. The notebook falls back to the CPU when
# no GPU is present, so enable bf16 only where it is actually available instead of
# failing when the arguments are built.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=model_save_path,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    lr_scheduler_type='linear',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=15,
    warmup_ratio=0.1,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    # Reload the checkpoint with the highest validation F1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    bf16=use_bf16,
    logging_dir='./logs',
)

In [16]:
# Wire model, arguments, data and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    compute_metrics=metric_function,  # adapter around compute_metrics(p, id2label)
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)


In [17]:
# Train the model. With the TrainingArguments defined earlier, evaluation and
# checkpoint saving are configured to run once per epoch.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mafanasyeva-tess[0m ([33mafanasyeva-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.116317,0.582197,0.666393,0.621456,0.960057
2,0.361200,0.106067,0.580529,0.793755,0.6706,0.962208
3,0.361200,0.10465,0.598592,0.838127,0.698391,0.963667
4,0.084300,0.105406,0.62015,0.814297,0.704085,0.96528
5,0.084300,0.110565,0.616759,0.822514,0.70493,0.964243
6,0.056600,0.105999,0.647449,0.802794,0.716801,0.968007
7,0.056600,0.106331,0.676144,0.80115,0.733358,0.969889
8,0.042000,0.110287,0.695931,0.80115,0.744843,0.971694
9,0.033800,0.118143,0.661397,0.824979,0.734186,0.968775
10,0.033800,0.117107,0.675289,0.815119,0.738645,0.969774



Classification Report:

              precision    recall  f1-score   support

        B-BT     0.5915    0.6771    0.6314      1217
        I-BT     1.0000    0.0000    0.0000       114
           O     0.9809    0.9784    0.9797     24706

    accuracy                         0.9601     26037
   macro avg     0.8575    0.5518    0.5370     26037
weighted avg     0.9628    0.9601    0.9591     26037


Classification Report:

              precision    recall  f1-score   support

        B-BT     0.5922    0.8077    0.6834      1217
        I-BT     0.5758    0.1667    0.2585       114
           O     0.9880    0.9735    0.9807     24706

    accuracy                         0.9622     26037
   macro avg     0.7186    0.6493    0.6408     26037
weighted avg     0.9677    0.9622    0.9636     26037






Classification Report:

              precision    recall  f1-score   support

        B-BT     0.6110    0.8480    0.7103      1217
        I-BT     0.4510    0.4035    0.4259       114
           O     0.9904    0.9720    0.9811     24706

    accuracy                         0.9637     26037
   macro avg     0.6841    0.7411    0.7058     26037
weighted avg     0.9703    0.9637    0.9660     26037


Classification Report:

              precision    recall  f1-score   support

        B-BT     0.6314    0.8233    0.7147      1217
        I-BT     0.5000    0.5614    0.5289       114
           O     0.9895    0.9741    0.9818     24706

    accuracy                         0.9653     26037
   macro avg     0.7070    0.7863    0.7418     26037
weighted avg     0.9706    0.9653    0.9673     26037


Classification Report:

              precision    recall  f1-score   support

        B-BT     0.6382    0.8365    0.7240      1217
        I-BT     0.3799    0.7632    0.5073       114


TrainOutput(global_step=4200, training_loss=0.07866793802806309, metrics={'train_runtime': 882.2852, 'train_samples_per_second': 38.151, 'train_steps_per_second': 4.76, 'total_flos': 2925853998391296.0, 'train_loss': 0.07866793802806309, 'epoch': 14.94830659536542})

In [18]:
def get_last_created_checkpoint(directory):
    """
    Return the sub-directory of ``directory`` with the greatest ``os.path.getctime``.

    Plain files are ignored. The returned value is ``directory`` joined with the
    folder name, or None when ``directory`` contains no sub-directory.
    """
    candidates = []
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        if os.path.isdir(path):
            candidates.append(path)
    return max(candidates, key=os.path.getctime) if candidates else None

# Look for checkpoints in the same directory the Trainer writes to.
last_created_checkpoint_path = get_last_created_checkpoint(model_save_path)

# Fail loudly when nothing was saved: passing model=None to pipeline() would make it
# load the task's default model, and the demo below would not use the fine-tuned one.
if last_created_checkpoint_path is None:
    raise FileNotFoundError(
        f"No checkpoint directory found under {model_save_path!r}; run the training cell first."
    )

# NOTE(review): this picks the most recently created folder, which is not necessarily
# the checkpoint with the best F1 - confirm whether the best checkpoint is intended.
classifier = pipeline("ner", model=last_created_checkpoint_path, tokenizer=tokenizer, aggregation_strategy="max")


Device set to use cuda:0


In [19]:
from ipymarkup import show_span_box_markup

# Three sample passages for the demo: each one mentions "Comet", "comets" or "COMET".
text = [
    'Comet is written in C++ and uses POSIX threads for Linux and Windows compatible multithreading.',
    'It is generally agreed that comets, in order to incorporate the ices needed to produce the observed outgassing, must have formed outside the water-ice line, with some of them having formed as far out as beyond the CO-ice line.',
    'To address these needs, we report the COmposable Mammalian Elements of Transcription (COMET)—an ensemble of engineered promoters and modular ZF-TFs with tunable properties. We incorporate into COMET a panel of 19 TFs that were originally developed in yeast',
]

for t in text:
    classified_text = classifier(t)
    # Convert every predicted entity into a (start, end, label) character span.
    annotations_char_spans = [
        (item['start'], item['end'], item['entity_group'])
        for item in classified_text
    ]
    show_span_box_markup(t, annotations_char_spans)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
