# NT-500M Human Ref Model Fine Tuning With LoRA

## 1. Installing Dependencies

In [1]:
# Install
!pip install -q biopython transformers datasets huggingface_hub accelerate peft
!apt install git-lfs

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/21

In [3]:
from transformers.utils import send_example_telemetry

# Report this notebook's identifier through the transformers
# example-telemetry helper.
send_example_telemetry(
    "nucleotide_transformer_dna_sequence_modeling_with_lora_notebook",
    framework="pytorch",
)

## 2. Loading Model 

In [4]:
# Imports
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

2025-09-25 10:00:01.804489: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758794402.139852      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758794402.232033      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
from accelerate.test_utils.testing import get_backend

# get_backend() yields a 3-tuple; only its first element (the device) is used
# in this notebook. Indexing instead of unpacking into `_` avoids rebinding
# IPython's `_` (last cell output) variable.
device = get_backend()[0]

In [6]:
# Width of the classification head (number of target classes).
num_labels_promoter = 37

# Load the pretrained checkpoint with a freshly initialised classification
# head, then place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "InstaDeepAI/nucleotide-transformer-500m-human-ref",
    num_labels=num_labels_promoter,
).to(device)

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from peft import LoraConfig, TaskType

# LoRA setup for sequence classification: rank-1 adapters are attached to the
# attention "query" and "value" projections.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=1,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"],
    # modules_to_save=["intermediate"]  # modules that are not frozen and updated during the training
)

In [8]:
from peft import get_peft_model

# Wrap the classifier with the adapters described by `peft_config`,
# report how many parameters remain trainable, and place it on `device`.
lora_classifier = get_peft_model(model=model, peft_config=peft_config)
lora_classifier.print_trainable_parameters()
lora_classifier.to(device)

trainable params: 1,809,957 || all params: 482,295,595 || trainable%: 0.3753


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): EsmForSequenceClassification(
      (esm): EsmModel(
        (embeddings): EsmEmbeddings(
          (word_embeddings): Embedding(4105, 1280, padding_idx=1)
          (dropout): Dropout(p=0.0, inplace=False)
          (position_embeddings): Embedding(1002, 1280, padding_idx=1)
        )
        (encoder): EsmEncoder(
          (layer): ModuleList(
            (0-23): 24 x EsmLayer(
              (attention): EsmAttention(
                (self): EsmSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1280, out_features=1280, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1280, out_features=1, bias=False)
                    )
                    (lora_B): ModuleDict(
   

## 3. Dataset loading and preparation

In [9]:
from datasets import load_dataset, Dataset

# Load the train / test / val splits of the "wanglab/kegg" dataset
# ("default" configuration) from the Hugging Face Hub, fully materialised
# (no streaming). The splits are fetched in the order listed below.
train_dataset_promoter, test_dataset_promoter, val_dataset_promoter = (
    load_dataset(
        "wanglab/kegg",
        "default",
        split=split_name,
        streaming=False,
    )
    for split_name in ("train", "test", "val")
)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.38M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/618k [00:00<?, ?B/s]

data/val-00000-of-00001.parquet:   0%|          | 0.00/622k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1159 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/146 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/144 [00:00<?, ? examples/s]

In [10]:
# Distinct answers present in the validation split only, in sorted order;
# the cell displays how many there are.
unique_answers = sorted({answer for answer in val_dataset_promoter["answer"]})
len(unique_answers)

27

In [11]:
from sklearn.preprocessing import LabelEncoder

# Reference sequences of each split (the dataset ships its own train/val/test
# partition, so no additional splitting is performed here).
train_sequences_promoter = train_dataset_promoter['reference_sequence']
validation_sequences_promoter = val_dataset_promoter['reference_sequence']
test_sequences_promoter = test_dataset_promoter['reference_sequence']

# Raw answers of each split, before integer encoding.
train_labels_promoter = train_dataset_promoter['answer']
validation_labels_promoter = val_dataset_promoter['answer']
test_labels_promoter = test_dataset_promoter['answer']

# The encoder is fitted on the union of all splits so that every answer gets
# an id, even when it does not occur in a particular split.
all_labels = (
    list(train_labels_promoter)
    + list(validation_labels_promoter)
    + list(test_labels_promoter)
)
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# The classification head was created with `num_labels_promoter` outputs;
# fail early if the data disagrees, rather than hitting an out-of-range
# label id during training.
assert len(label_encoder.classes_) == num_labels_promoter, (
    f"model was built for {num_labels_promoter} classes but the data "
    f"contains {len(label_encoder.classes_)}"
)

# Replace the raw answers of each split by their integer ids.
train_labels_promoter = label_encoder.transform(train_labels_promoter)
validation_labels_promoter = label_encoder.transform(validation_labels_promoter)
test_labels_promoter = label_encoder.transform(test_labels_promoter)

# Save mappings for later use
label2id = {label: idx for idx, label in enumerate(label_encoder.classes_)}
id2label = {idx: label for label, idx in label2id.items()}


In [13]:
# Distinct encoded label ids found in each split.
unique_train = np.unique(train_labels_promoter)
unique_val = np.unique(validation_labels_promoter)
unique_test = np.unique(test_labels_promoter)

# Print the id range and the number of distinct classes per split.
for heading, encoded_ids in (
    ("unique_train min/max:", unique_train),
    ("unique_val  min/max:", unique_val),
    ("unique_test min/max:", unique_test),
):
    print(heading, encoded_ids.min(), encoded_ids.max(), "count:", len(encoded_ids))


unique_train min/max: 0 36 count: 37
unique_val  min/max: 0 36 count: 27
unique_test min/max: 0 36 count: 29


Let us have a look at the data. After label encoding, the training split covers all 37 classes (ids 0 to 36), whereas the validation and test splits contain only 27 and 29 of them, respectively. Each example provides a reference sequence and a variant sequence together with its label.

## 4. Tokenizing the datasets

In [15]:
# Tokenizer published alongside the checkpoint used for the classifier.
tokenizer = AutoTokenizer.from_pretrained(
    "InstaDeepAI/nucleotide-transformer-500m-human-ref"
)

tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

In [26]:
def _pair_dataset(reference_sequences, source_split, encoded_labels):
    """Bundle reference sequences, variant sequences and labels into a Dataset."""
    columns = {
        "ref": reference_sequences,
        "var": source_split['variant_sequence'],
        'labels': encoded_labels,
    }
    return Dataset.from_dict(columns)


ds_train_promoter = _pair_dataset(
    train_sequences_promoter, train_dataset_promoter, train_labels_promoter
)
ds_validation_promoter = _pair_dataset(
    validation_sequences_promoter, val_dataset_promoter, validation_labels_promoter
)
ds_test_promoter = _pair_dataset(
    test_sequences_promoter, test_dataset_promoter, test_labels_promoter
)


In [36]:
def _mean_pooled_embeddings(sequences, chunk_size=8):
    """Embed `sequences` with the encoder of `model` and mean-pool over tokens.

    The sequences are pushed through the encoder in chunks of `chunk_size` so
    that a large `Dataset.map` batch is never sent through the model at once.
    Padding positions are excluded from the mean, so the embedding of a
    sequence does not depend on which other sequences share its chunk.

    Args:
        sequences: non-empty list of raw sequence strings.
        chunk_size: number of sequences per forward pass.

    Returns:
        A CPU tensor with one row per input sequence.
    """
    # The classification wrapper returns logits only; the token-level hidden
    # states come from its underlying encoder.
    encoder = model.base_model
    encoder_device = next(encoder.parameters()).device

    pooled_chunks = []
    for start in range(0, len(sequences), chunk_size):
        tokens = tokenizer(
            sequences[start:start + chunk_size],
            return_tensors="pt",
            truncation=True,
            padding=True,
        ).to(encoder_device)
        hidden = encoder(**tokens).last_hidden_state
        # Zero out padded positions and divide by the number of real tokens.
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        pooled_chunks.append(pooled.cpu())
    return torch.cat(pooled_chunks, dim=0)


def tokenize_function(examples):
    """Turn a batch of reference/variant sequences into pooled embeddings.

    Intended for `Dataset.map(..., batched=True)`: `examples` is a dict with
    the list-valued columns "ref" and "var". The mean-pooled encoder
    embeddings of both sequences are concatenated along the feature axis.

    Returns:
        A dict with a single column "embeddings" (one row per example), as
        `Dataset.map` requires a mapping of column names to values.

    NOTE(review): the Trainer cell below passes the mapped datasets to a
    sequence-classification model, which consumes `input_ids` rather than
    pre-computed embeddings - confirm the intended training setup.
    """
    # The model stays on its current device (no per-batch move to CPU);
    # dropout is switched off so the embeddings are deterministic, and the
    # previous train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            ref_out = _mean_pooled_embeddings(examples["ref"])
            var_out = _mean_pooled_embeddings(examples["var"])
    finally:
        model.train(was_training)

    outputs = torch.cat([ref_out, var_out], dim=1).numpy()

    return {"embeddings": outputs}

In [None]:
def _apply_tokenize_function(split_dataset):
    """Run `tokenize_function` batch-wise over a split and drop its raw columns."""
    return split_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["ref", "var"],
    )


# Creating the processed train / validation / test datasets
tokenized_datasets_train_promoter = _apply_tokenize_function(ds_train_promoter)
tokenized_datasets_validation_promoter = _apply_tokenize_function(ds_validation_promoter)
tokenized_datasets_test_promoter = _apply_tokenize_function(ds_test_promoter)

Map:   0%|          | 0/1159 [00:00<?, ? examples/s]

## 5. Fine-tuning and evaluation

In [None]:
batch_size = 2
model_name = 'nucleotide-transformer'

# Training configuration.
# - Training length is bounded by `max_steps`; it takes precedence over any
#   epoch count, so no `num_train_epochs` is given.
# - `dataloader_drop_last` is left at its default (False): the setting also
#   applies to the evaluation and prediction dataloaders, where dropping the
#   last incomplete batch of 64 would silently exclude validation/test
#   examples from the reported metrics.
# - Evaluation happens every `logging_steps` steps (no explicit `eval_steps`).
args_promoter = TrainingArguments(
    f"{model_name}-finetuned-lora-NucleotideTransformer",
    remove_unused_columns=False,
    eval_strategy="steps",
    save_strategy="steps",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=64,
    max_steps=1000,
    logging_steps=100,
    load_best_model_at_end=True,  # Keep the best model according to the evaluation
    metric_for_best_model="f1_score",
    label_names=["labels"],
)

In [None]:
def compute_metrics_f1_score(eval_pred):
    """Compute the macro-averaged F1 score for multi-class classification.

    Args:
        eval_pred: object exposing `predictions` (class scores, or a tuple
            whose first element holds them) and `label_ids` (reference ids).

    Returns:
        A dict with the single key 'f1_score'.
    """
    logits = eval_pred.predictions
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    references = eval_pred.label_ids
    # zero_division=0 keeps the score unchanged (classes that are never
    # predicted count as 0) but avoids a warning at every evaluation step.
    r = {'f1_score': f1_score(references, predictions, average='macro', zero_division=0)}
    return r

In [None]:
# Trainer for the LoRA-wrapped classifier: trains on the processed training
# split, evaluates on the validation split and reports the macro F1 score.
trainer = Trainer(
    model=lora_classifier,
    args=args_promoter,
    train_dataset=tokenized_datasets_train_promoter,
    eval_dataset=tokenized_datasets_validation_promoter,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_f1_score,
)

In [None]:
# Run the fine-tuning loop; the value returned by `train()` is kept in `train_results`.
train_results = trainer.train()

In [None]:
# Collect [step, validation F1] pairs from every log entry that carries an
# evaluation F1 score, then split them into the two plotting series.
curve_evaluation_f1_score = [
    [entry['step'], entry['eval_f1_score']]
    for entry in trainer.state.log_history
    if 'eval_f1_score' in entry
]
steps = [point[0] for point in curve_evaluation_f1_score]
eval_f1_score = [point[1] for point in curve_evaluation_f1_score]

In [None]:
# Validation F1 score as a function of the training step.
fig, ax = plt.subplots()
ax.plot(steps, eval_f1_score, 'b', label='Validation F1 score')
ax.set_title('Validation F1 score for promoter prediction')
ax.set_xlabel('Number of training steps performed')
ax.set_ylabel('Validation F1 score')
ax.legend()
plt.show()

In [None]:
# Run prediction on the test dataset and report its F1 score
test_metrics = trainer.predict(tokenized_datasets_test_promoter).metrics
print(f"F1 score on the test dataset: {test_metrics['test_f1_score']}")

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Get raw predictions and labels for the test data
preds = trainer.predict(tokenized_datasets_test_promoter)
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=-1)

# All class-averaged metrics share the same settings: macro averaging, and a
# score of 0 (without a warning) for classes that have no predicted or no true
# samples. Previously only precision and recall set `zero_division`.
macro_options = dict(average='macro', zero_division=0)
precision = precision_score(y_true, y_pred, **macro_options)
recall = recall_score(y_true, y_pred, **macro_options)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, **macro_options)

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
