# NT-500M Human Ref Model with Liner Probing

## 1. Installing Dependancies

In [29]:
# Install
!pip install -q biopython transformers datasets huggingface_hub accelerate peft 
!apt install git-lfs

In [31]:
from transformers.utils import send_example_telemetry

send_example_telemetry("nucleotide_transformer_dna_sequence_modeling_with_lora_notebook", framework="pytorch")

## 2. Loading Model 

In [32]:
# Imports
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

In [33]:
from accelerate.test_utils.testing import get_backend

device, _, _ = get_backend()

In [34]:
num_labels_promoter = 37
# Load the model
model = AutoModelForSequenceClassification.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref", num_labels=num_labels_promoter)
model = model.to(device)

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
from peft import LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False, r=1, lora_alpha= 32, lora_dropout=0.1, target_modules= ["query", "value"],
    #modules_to_save=["intermediate"] # modules that are not frozen and updated during the training
)

In [36]:
from peft import get_peft_model

lora_classifier = get_peft_model(model, peft_config) # transform our classifier into a peft model
lora_classifier.print_trainable_parameters()
lora_classifier.to(device) # Put the model on the GPU

trainable params: 1,809,957 || all params: 482,295,595 || trainable%: 0.3753


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): EsmForSequenceClassification(
      (esm): EsmModel(
        (embeddings): EsmEmbeddings(
          (word_embeddings): Embedding(4105, 1280, padding_idx=1)
          (dropout): Dropout(p=0.0, inplace=False)
          (position_embeddings): Embedding(1002, 1280, padding_idx=1)
        )
        (encoder): EsmEncoder(
          (layer): ModuleList(
            (0-23): 24 x EsmLayer(
              (attention): EsmAttention(
                (self): EsmSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1280, out_features=1280, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1280, out_features=1, bias=False)
                    )
                    (lora_B): ModuleDict(
   

## 3.Dataset loading and preparation

In [37]:
from datasets import load_dataset, Dataset

# Load the promoter dataset from the InstaDeep Hugging Face ressources
train_dataset_promoter = load_dataset(
        "wanglab/kegg",
        "default",
        split="train",
        streaming= False,
    )
test_dataset_promoter = load_dataset(
        "wanglab/kegg",
        "default",
        split="test",
        streaming= False,
    )
val_dataset_promoter = load_dataset(
        "wanglab/kegg",
        "default",
        split="val",
        streaming= False,
    )

In [38]:

train_sequences_promoter = [
    ref + var for ref, var in zip(train_dataset_promoter['reference_sequence'],
                                  train_dataset_promoter['variant_sequence'])
]
train_labels_promoter = train_dataset_promoter['answer']

# Validation data
validation_sequences_promoter = [
    ref + var for ref, var in zip(val_dataset_promoter['reference_sequence'],
                                  val_dataset_promoter['variant_sequence'])
]
validation_labels_promoter = val_dataset_promoter['answer']

# Test data
test_sequences_promoter = [
    ref + var for ref, var in zip(test_dataset_promoter['reference_sequence'],
                                  test_dataset_promoter['variant_sequence'])
]
test_labels_promoter = test_dataset_promoter['answer']

from sklearn.preprocessing import LabelEncoder

# Collect all labels across splits
all_labels = (
    list(train_labels_promoter) + 
    list(validation_labels_promoter) + 
    list(test_labels_promoter)
)

# Fit the encoder
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Encode each split
train_labels_promoter = label_encoder.transform(train_labels_promoter)
validation_labels_promoter = label_encoder.transform(validation_labels_promoter)
test_labels_promoter = label_encoder.transform(test_labels_promoter)

# Save mappings for later use
label2id = {label: idx for idx, label in enumerate(label_encoder.classes_)}
id2label = {idx: label for label, idx in label2id.items()}


## 4.Tokenizing the datasets

In [39]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")

In [40]:
# Promoter dataset
ds_train_promoter = Dataset.from_dict({"ref": train_dataset_promoter['reference_sequence'],"var":train_dataset_promoter['variant_sequence'],'labels':train_labels_promoter})
ds_validation_promoter = Dataset.from_dict({"ref":val_dataset_promoter['reference_sequence'],"var":val_dataset_promoter['variant_sequence'],'labels':validation_labels_promoter})
ds_test_promoter = Dataset.from_dict({"ref": test_dataset_promoter['reference_sequence'],"var":test_dataset_promoter['variant_sequence'],'labels':test_labels_promoter})


In [43]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)  # move model to GPU once
encoder = model.base_model

def tokenize_function(examples):
    # Tokenize and move to GPU
    ref_tokens = tokenizer(examples["ref"], truncation=True, padding=True, return_tensors="pt").to(device)
    var_tokens = tokenizer(examples["var"], truncation=True, padding=True, return_tensors="pt").to(device)

    with torch.no_grad():
        # ref_out = encoder(**ref_tokens).last_hidden_state.mean(dim=1)
        var_out = encoder(**var_tokens).last_hidden_state.mean(dim=1)
    # Concatenate embeddings
    # outputs = torch.cat([ref_out, var_out], dim=1)  # (batch, 2*hidden)

    # Return as numpy for Hugging Face Dataset
    return {"embeddings": var_out.cpu().numpy()}


In [44]:
tokenized_datasets_train_promoter = ds_train_promoter.map(
    tokenize_function,
    batched=True,
    batch_size=4,
    remove_columns=["ref","var"],
)
tokenized_datasets_validation_promoter = ds_validation_promoter.map(
    tokenize_function,
    batched=True,
    batch_size=4,
    remove_columns=["ref","var"],
)
tokenized_datasets_test_promoter = ds_test_promoter.map(
    tokenize_function,
    batched=True,
    batch_size=4,
    remove_columns=["ref","var"],
)

Map:   0%|          | 0/1159 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

In [45]:
tokenized_datasets_train_promoter.save_to_disk("train_embeddings")
tokenized_datasets_validation_promoter.save_to_disk("val_embeddings")
tokenized_datasets_test_promoter.save_to_disk("test_embeddings")

Saving the dataset (0/1 shards):   0%|          | 0/1159 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/144 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/146 [00:00<?, ? examples/s]

In [46]:
len(tokenized_datasets_train_promoter['embeddings'][0])

1280

## 5. Training and evaluation

In [47]:
import torch.nn as nn

hidden_dim = 1280  # assuming ref_out and var_out are same size

class DualSequenceClassifier(nn.Module):
    def __init__(self, input_dim=hidden_dim, num_classes=37):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        return self.classifier(x)


In [48]:
import torch
from torch.utils.data import Dataset, DataLoader

class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

train_dataset = EmbeddingDataset(tokenized_datasets_train_promoter["embeddings"], train_labels_promoter)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_dataset = EmbeddingDataset(tokenized_datasets_validation_promoter["embeddings"], validation_labels_promoter)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)

In [49]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_clf = DualSequenceClassifier().to(device)
optimizer = optim.Adam(model_clf.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 1000

for epoch in range(1, num_epochs+1):

    model_clf.train()
    train_loss = 0
    correct_train = 0
    total_train = 0

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        logits = model_clf(batch_x)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * batch_x.size(0)
        _, preds = torch.max(logits, dim=1)
        correct_train += (preds == batch_y).sum().item()
        total_train += batch_x.size(0)

    avg_train_loss = train_loss / total_train
    train_acc = correct_train / total_train

    model_clf.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for val_x, val_y in val_loader:
            val_x, val_y = val_x.to(device), val_y.to(device)
            logits = model_clf(val_x)
            loss = criterion(logits, val_y)

            val_loss += loss.item() * val_x.size(0)
            _, preds = torch.max(logits, dim=1)
            correct_val += (preds == val_y).sum().item()
            total_val += val_x.size(0)

    avg_val_loss = val_loss / total_val
    val_acc = correct_val / total_val

    print(f"Epoch {epoch:02d}: "
          f"Train Loss = {avg_train_loss:.4f}, Train Acc = {train_acc:.4f} | "
          f"Val Loss = {avg_val_loss:.4f}, Val Acc = {val_acc:.4f}")


Epoch 01: Train Loss = 2.7102, Train Acc = 0.2623 | Val Loss = 2.3364, Val Acc = 0.3889
Epoch 02: Train Loss = 2.1736, Train Acc = 0.4323 | Val Loss = 1.8810, Val Acc = 0.5000
Epoch 03: Train Loss = 1.7881, Train Acc = 0.5168 | Val Loss = 1.5897, Val Acc = 0.5347
Epoch 04: Train Loss = 1.5390, Train Acc = 0.5643 | Val Loss = 1.3854, Val Acc = 0.6042
Epoch 05: Train Loss = 1.3620, Train Acc = 0.6238 | Val Loss = 1.2018, Val Acc = 0.6667
Epoch 06: Train Loss = 1.1956, Train Acc = 0.6462 | Val Loss = 1.1188, Val Acc = 0.6806
Epoch 07: Train Loss = 1.0971, Train Acc = 0.6773 | Val Loss = 1.0172, Val Acc = 0.7014
Epoch 08: Train Loss = 0.9784, Train Acc = 0.7170 | Val Loss = 0.9016, Val Acc = 0.7639
Epoch 09: Train Loss = 0.9112, Train Acc = 0.7274 | Val Loss = 0.8423, Val Acc = 0.7361
Epoch 10: Train Loss = 0.8481, Train Acc = 0.7541 | Val Loss = 0.7682, Val Acc = 0.7569
Epoch 11: Train Loss = 0.7432, Train Acc = 0.7817 | Val Loss = 0.7473, Val Acc = 0.7708
Epoch 12: Train Loss = 0.7071, T

In [50]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader

# Assume test_embeddings and test_labels are tensors or numpy arrays
test_dataset = EmbeddingDataset(tokenized_datasets_test_promoter["embeddings"], test_labels_promoter)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model_clf.eval()
all_preds = []
all_labels = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        logits = model_clf(batch_x)
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average="macro")
recall = recall_score(all_labels, all_preds, average="macro")
f1 = f1_score(all_labels, all_preds, average="macro")

print(f"Test Accuracy:  {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall:    {recall:.4f}")
print(f"Test F1-score:  {f1:.4f}")


Test Accuracy:  0.9041
Test Precision: 0.7564
Test Recall:    0.7696
Test F1-score:  0.7499


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
