In [1]:
import os
import shutil
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from sklearn.metrics import classification_report
from datasets import Dataset, DatasetDict, Features, Sequence, Value, ClassLabel
# Add evaluate import
import evaluate
import wandb
from huggingface_hub import HfFolder
from wandb.integration.sb3 import WandbCallback
import argparse
# Initialize the metric
#metric = load_metric("seqeval")
# Initialize the metric using evaluate
metric = evaluate.load("seqeval")


  from .autonotebook import tqdm as notebook_tqdm
2024-11-07 16:40:05.588322: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-07 16:40:05.600732: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-07 16:40:05.615663: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-07 16:40:05.620121: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-07 16:40:05.6

In [2]:
# Retrieve HF token from environment and authenticate
hf_token = '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' #Zunaira
data_folder = '../data/IOB/'
model_checkpoint = 'bioformers/bioformer-16L'
model_save_path = '../models/'

        

In [3]:
def cleanup_checkpoints(output_dir, keep_last=True, best_model_dir=None, last_model_dir=None):
    """
    Deletes unnecessary model checkpoints created during training.
    Keeps the best model directory and optionally the last model directory.

    :param output_dir: Base directory where the checkpoints are saved.
    :param keep_last: Whether to keep the last checkpoint.
    :param best_model_dir: Directory of the best model checkpoint.
    :param last_model_dir: Directory of the last model checkpoint.
    """
    for item in os.listdir(output_dir):
        item_path = os.path.join(output_dir, item)
        if os.path.isdir(item_path) and item.startswith("checkpoint"):
            # Check if this path is not the one we want to keep
            if item_path != best_model_dir and (not keep_last or item_path != last_model_dir):
                shutil.rmtree(item_path)



In [4]:
def convert_IOB_transformer(test_list, pattern):
    new_list = []
    sub_list = []
    for i in test_list:

        if i != pattern:
            sub_list.append(i)
        else:
            new_list.append(sub_list)
            sub_list = []

    return new_list


In [5]:
def get_token_ner_tags(df_, split_name, label2id_):
    ner_tag_list_ = df_['ner_tags'].map(label2id_).fillna(
        '#*#*#*#*#*#*#*#*').tolist()  # convert the list to a pandas series temporarily before mapping
    token_list_ = df_['tokens'].tolist()

    token_list = convert_IOB_transformer(test_list=token_list_, pattern='')
    ner_tag_list = convert_IOB_transformer(test_list=ner_tag_list_, pattern='#*#*#*#*#*#*#*#*')

    df = pd.DataFrame(list(zip(token_list, ner_tag_list)),
                      columns=['tokens', 'ner_tags'])

    # df.to_csv(path_+'GP-DS-OG-CD-Santosh/'+split_name+'_formatted.tsv', index=None, sep ='\t', header=None)

    return token_list, ner_tag_list, df


In [6]:
def compute_metrics(p, id2label):
    """
    Computes evaluation metrics and prints a detailed classification report.

    Parameters:
    p (tuple): A tuple containing predictions and labels.
    id2label (dict): A dictionary mapping label IDs to label names.

    Returns:
    dict: A dictionary with precision, recall, f1, and accuracy metrics.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)  # Assuming predictions are logits or probabilities

    # Decode predictions and labels using id2label
    true_predictions = [
        [id2label[pred] for (pred, label) in zip(prediction, label_ids) if label != -100]
        for prediction, label_ids in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[label] for (pred, label) in zip(prediction, label_ids) if label != -100]
        for prediction, label_ids in zip(predictions, labels)
    ]

    # Flatten the lists for classification_report
    flat_predictions = [pred for sublist in true_predictions for pred in sublist]
    flat_labels = [label for sublist in true_labels for label in sublist]

    # Generate classification report
    report = classification_report(flat_labels, flat_predictions, digits=4)

    # Print the classification report to the screen
    print("\nClassification Report:\n")
    print(report)

    # Compute overall metrics using your existing metric (e.g., seqeval for NER)
    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [7]:
def metric_function(p):
    return compute_metrics(p, id2label)

def tokenize_and_align_labels(examples, device):
    task = "ner"
    label_all_tokens = True
    tokenized_inputs = tokenizer(examples["tokens"], max_length=512, truncation=True, padding="max_length", is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    # labels = torch.tensor(labels).to(dtype=torch.int64)
    # tokenized_inputs["labels"] = labels
    # return tokenized_inputs
    labels = torch.tensor(labels).to(dtype=torch.int64).to(device)  # Move labels to the specified device
    tokenized_inputs["labels"] = labels
    return tokenized_inputs



In [8]:
# Load and prepare dataset
train = pd.read_csv(os.path.join(data_folder, 'train_IOB.tsv'), sep='\t', names=['tokens', 'ner_tags'], skip_blank_lines=False, na_filter=False)

dev = pd.read_csv(os.path.join(data_folder, 'dev_IOB.tsv'), sep='\t', names=['tokens', 'ner_tags'], skip_blank_lines=False, na_filter=False)


In [9]:
  # Dataset processing
label_list_ = train['ner_tags'].dropna().unique().tolist()
label_list = [x for x in label_list_ if x]
id2label = {idx: tag for idx, tag in enumerate(label_list)}
label2id = {tag: idx for idx, tag in enumerate(label_list)}


print(id2label)

{0: 'O', 1: 'B-BT', 2: 'I-BT'}


In [10]:
dev_tokens, dev_tags, dev_df = get_token_ner_tags(df_=dev, split_name='dev', label2id_=label2id)
train_tokens, train_tags, train_df = get_token_ner_tags(df_=train, split_name='train', label2id_= label2id)
train_tokens[0]

['Note',
 '：',
 'Pinelliae',
 'Rhizoma',
 'Praeparatum',
 'Cum',
 'Alumine',
 '(',
 'PRPCA',
 '),',
 'Pinelliae',
 'Rhizoma',
 'Praeparatum',
 'Cum',
 'Zingibere',
 'et',
 'Alumine',
 '(',
 'PRPZA',
 '),',
 'and',
 'Pinelliae',
 'Rhizoma',
 'Praeparatum',
 '(',
 'PRP',
 ').']

In [11]:
trds = Dataset.from_pandas(train_df)#, features=features)
vds = Dataset.from_pandas(dev_df)#, features=features)
# tds = Dataset.from_pandas(test_df)#, features=features)

ds = DatasetDict()

ds['train'] = trds
ds['validation'] = vds


ds

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 523
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 131
    })
})

In [12]:
# Model initialization
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), id2label=id2label, label2id=label2id)

if torch.cuda.is_available():
    device = 'cuda'
    model.to(device)
else:
    device = 'cpu'
    assert torch.cuda.is_available() == True

        

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bioformers/bioformer-16L and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# tokenized_datasets = ds.map(tokenize_and_align_labels, batched=True)
    # Apply the tokenize_and_align_labels function to the datasets
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, device), batched=True)


Map: 100%|█████████████████████████████████████████████████████████████████| 523/523 [00:00<00:00, 1601.61 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████| 131/131 [00:00<00:00, 1848.13 examples/s]


In [14]:
tokenized_datasets['train'][4]

{'tokens': ['Troubleshooting',
  '2',
  '....#',
  'Load',
  'sam',
  '/',
  'bedtoolsmodule',
  'load',
  'bedtools',
  '/',
  '2',
  '.',
  '29',
  '.',
  '0module',
  'load',
  'samtools',
  '/',
  '1',
  '.',
  '16',
  '.',
  '1',
  '#',
  'Convert',
  'BAM',
  'files',
  'BEDPE',
  'files',
  'using',
  'bedtoolsfor',
  'f',
  'in',
  '$(',
  'find',
  '$',
  'fbam_dir',
  '-',
  'maxdepth',
  '1',
  '-',
  'iname',
  '"∗.',
  'bam',
  '"',
  '-',
  'type',
  'f',
  ')',
  'do',
  'id',
  '=$(',
  'basename',
  '-',
  'a',
  '-',
  's',
  'f1',
  '.',
  'bam',
  '$',
  'f',
  '|',
  'cut',
  '-',
  'd',
  '"',
  '_',
  '"',
  '-',
  'f',
  '1',
  ')',
  '#',
  'Files',
  'must',
  'be',
  'sorted',
  'bam',
  'before',
  'converting',
  'to',
  'BEDPE',
  'samtools',
  'sort',
  '-',
  'n',
  '$',
  'f',
  '|',
  '\\',
  '#',
  'Convert',
  'to',
  'BEDPE',
  'file',
  'bedtools',
  'bamtobed',
  '-',
  'i',
  '-',
  '-',
  'bedpe',
  '|\\',
  '#',
  'Calculate',
  'insert',
  'si

In [15]:
tokenized_datasets['validation'][4]

{'tokens': ['Reference',
  'literature',
  'was',
  'used',
  'to',
  'find',
  'marker',
  'genes',
  'to',
  'manually',
  'annotate',
  'different',
  'cell',
  'clusters',
  ',',
  'with',
  'the',
  'assistance',
  'of',
  'the',
  'Python',
  'package',
  'MetaTime',
  '(',
  'Supplementary',
  'Figures',
  'S1G',
  ',',
  'H',
  ';',
  'Supplementary',
  'Table',
  'S2',
  ').'],
 'ner_tags': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'input_ids': [101,
  22745,
  3484,
  1491,
  1822,
  1446,
  4723,
  4667,
  2262,
  1446,
  17072,
  8280,
  1508,
  1925,
  1547,
  6141,
  117,
  1461,
  1425,
  12982,
  1431,
  1425,
  16748,
  1534,
  1419,
  14912,
  17027,
  23311,
  7169,
  113,
  4847,
  7249,
  4692,
  1149,
  117,
  145,
  132,
  4847,
  2561,
  6301,
  114,
  119,
  102,
  0,

In [16]:
training_args = TrainingArguments(
            output_dir=model_save_path,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=1e-5,
            lr_scheduler_type='cosine',
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=10,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            logging_dir='./logs',
    )


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=metric_function  # Define your compute_metrics function
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
# Train the model
trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtsantosh7[0m ([33mebi_literature[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.126719,0.811538,0.508434,0.625185,0.963971
2,No log,0.091085,0.800493,0.783133,0.791717,0.977061
3,No log,0.083884,0.798526,0.783133,0.790754,0.978097
4,No log,0.076967,0.788372,0.816867,0.802367,0.978745
5,No log,0.077798,0.795724,0.807229,0.801435,0.979134
6,No log,0.078322,0.784897,0.826506,0.805164,0.979134
7,No log,0.079186,0.772829,0.836145,0.803241,0.978875
8,0.090700,0.078481,0.799065,0.824096,0.811388,0.980171
9,0.090700,0.079295,0.793503,0.824096,0.808511,0.979782
10,0.090700,0.079268,0.798595,0.821687,0.809976,0.980041



Classification Report:

              precision    recall  f1-score   support

        B-BT     0.8231    0.5157    0.6341       415
        I-BT     0.0000    0.0000    0.0000        33
           O     0.9689    0.9939    0.9813      7268

    accuracy                         0.9640      7716
   macro avg     0.5973    0.5032    0.5384      7716
weighted avg     0.9569    0.9640    0.9584      7716



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:

              precision    recall  f1-score   support

        B-BT     0.8202    0.8024    0.8112       415
        I-BT     0.0000    0.0000    0.0000        33
           O     0.9858    0.9915    0.9886      7268

    accuracy                         0.9771      7716
   macro avg     0.6020    0.5980    0.5999      7716
weighted avg     0.9727    0.9771    0.9748      7716



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:

              precision    recall  f1-score   support

        B-BT     0.8231    0.8072    0.8151       415
        I-BT     0.0000    0.0000    0.0000        33
           O     0.9867    0.9923    0.9895      7268

    accuracy                         0.9781      7716
   macro avg     0.6033    0.5998    0.6015      7716
weighted avg     0.9737    0.9781    0.9759      7716



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:

              precision    recall  f1-score   support

        B-BT     0.8116    0.8410    0.8260       415
        I-BT     0.0000    0.0000    0.0000        33
           O     0.9886    0.9911    0.9898      7268

    accuracy                         0.9787      7716
   macro avg     0.6001    0.6107    0.6053      7716
weighted avg     0.9749    0.9787    0.9768      7716



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:

              precision    recall  f1-score   support

        B-BT     0.8195    0.8313    0.8254       415
        I-BT     1.0000    0.0606    0.1143        33
           O     0.9883    0.9917    0.9900      7268

    accuracy                         0.9791      7716
   macro avg     0.9359    0.6279    0.6432      7716
weighted avg     0.9793    0.9791    0.9774      7716


Classification Report:

              precision    recall  f1-score   support

        B-BT     0.8096    0.8506    0.8296       415
        I-BT     1.0000    0.0303    0.0588        33
           O     0.9893    0.9908    0.9900      7268

    accuracy                         0.9791      7716
   macro avg     0.9330    0.6239    0.6262      7716
weighted avg     0.9797    0.9791    0.9774      7716


Classification Report:

              precision    recall  f1-score   support

        B-BT     0.7951    0.8602    0.8264       415
        I-BT     1.0000    0.0606    0.1143        33


TrainOutput(global_step=660, training_loss=0.07431029153592658, metrics={'train_runtime': 102.4892, 'train_samples_per_second': 51.03, 'train_steps_per_second': 6.44, 'total_flos': 456183413176320.0, 'train_loss': 0.07431029153592658, 'epoch': 10.0})

In [34]:
from transformers import pipeline

model_path = '../models/checkpoint-660'
# Create the pipeline with an aggregation strategy
classifier = pipeline("ner", model=model_path, tokenizer=tokenizer, aggregation_strategy="max")



In [47]:
%%time
text = ['The 445 identified proteins were searched against the SubtiWiki, Jalview and UniProt B. A total of 316 sequences of PSEN1 were recovered from different vertebrates, and a multiple sequence alignment (MSA) was constructed using Jalview.']
xx = classifier(text)



CPU times: user 317 ms, sys: 0 ns, total: 317 ms
Wall time: 32.1 ms


In [48]:
xx

[[{'entity_group': 'BT',
   'score': 0.98012495,
   'word': 'SubtiWiki',
   'start': 54,
   'end': 63},
  {'entity_group': 'BT',
   'score': 0.9420018,
   'word': 'Jalview',
   'start': 227,
   'end': 234}]]