In [1]:
%%capture
! pip install bitsandbytes datasets evaluate peft

In [1]:
import torch

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Using device:", device)

Using device: cuda


The pretrained gLM2 checkpoints are published by Tatta Bio on the Hugging Face Hub: https://huggingface.co/tattabio

In [2]:
# Hugging Face Hub identifier of the pretrained checkpoint; reused below for
# both the tokenizer and the model weights.
model_name = "tattabio/gLM2_150M"

## Data preprocessing

In [3]:
from datasets import load_dataset

# Fetch the "promoter_tata" task of the revised Nucleotide Transformer
# downstream benchmark. The revision is pinned to an exact commit, which keeps
# the download reproducible while `trust_remote_code=True` is enabled.
dataset = load_dataset(
    path="InstaDeepAI/nucleotide_transformer_downstream_tasks_revised",
    name="promoter_tata",
    revision="c8c94743d3d2838b943398ee676247ac2f774122",
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
from datasets import DatasetDict

# Carve a validation set (15%) out of the original training split; the seed is
# fixed so the partition is the same on every run.
train_valid_split = dataset['train'].train_test_split(test_size=0.15, seed=42)

# Final layout: train / validation come from the split above, test is the
# benchmark's own untouched test split.
ds = DatasetDict(
    train=train_valid_split['train'],
    validation=train_valid_split['test'],
    test=dataset['test'],
)
ds

DatasetDict({
    train: Dataset({
        features: ['sequence', 'name', 'label'],
        num_rows: 4302
    })
    validation: Dataset({
        features: ['sequence', 'name', 'label'],
        num_rows: 760
    })
    test: Dataset({
        features: ['sequence', 'name', 'label'],
        num_rows: 212
    })
})

In [5]:
# Peek at one raw training record (before tokenization).
ds['train'][0]

{'sequence': 'AATAACTTCACCTAAAAACCAAACGGAAGCATTCACAGACAATTCTTAGTGATCATTGGTTTGAACTAACAGAGCTGAACATTCCTTTAGATGGAGCAGTTTCCAAACCCACTTTCTGTAGAATCTGCAAGTGGATATTTGGACTTCTCTGAGGATTTCGTTGGAAACGGGATAAACTTCCCAGAACTAAACGGAAGCATTCTGAGAAACTTCTTTGTGATGTTTGCATTCAACTCACAGAGTTGAACCTTGCTTTCATAGTTCAGCTTTCAAACACTCTTTTTGTAGAATCTGCAAGTG',
 'name': 'chr11:53429000-53429300|0',
 'label': 0}

In [6]:
from transformers import AutoTokenizer

# No `revision` variable is defined by the preceding cells, so the former
# `revision=revision or "main"` raised NameError on a fresh kernel. The default
# branch it fell back to is requested explicitly instead.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, revision="main")

def tokenize(example):
    """Tokenize one dataset row for the gLM2 model.

    The nucleotide sequence is lower-cased and prefixed with the ``<+>`` token;
    the classifier defined later in this notebook reads the hidden state at
    that first position as its sequence summary.

    Args:
        example: A dataset row providing the raw string under ``'sequence'``.

    Returns:
        The tokenizer output for the prefixed, lower-cased sequence.
    """
    sequence = '<+>' + example['sequence'].lower()

    return tokenizer(sequence)

In [7]:
# Tokenize every split, drop the raw text columns, expose the target under the
# name "labels", and have rows returned as torch tensors on `device`.
tokenized_ds = (
    ds.map(tokenize, remove_columns=['sequence', 'name'])
    .rename_column("label", "labels")
    .with_format("torch", device=device)
)

In [8]:
# Show the tokenized splits and the columns they now carry.
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4302
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 760
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 212
    })
})

In [9]:
# Number of token ids in the first tokenized training example.
tokenized_ds['train'][0]['input_ids'].shape

torch.Size([301])

In [10]:
# Token ids of the first tokenized training example.
tokenized_ds['train'][0]['input_ids']

tensor([33, 29, 29, 30, 29, 29, 31, 30, 30, 31, 29, 31, 31, 30, 29, 29, 29, 29,
        29, 31, 31, 29, 29, 29, 31, 32, 32, 29, 29, 32, 31, 29, 30, 30, 31, 29,
        31, 29, 32, 29, 31, 29, 29, 30, 30, 31, 30, 30, 29, 32, 30, 32, 29, 30,
        31, 29, 30, 30, 32, 32, 30, 30, 30, 32, 29, 29, 31, 30, 29, 29, 31, 29,
        32, 29, 32, 31, 30, 32, 29, 29, 31, 29, 30, 30, 31, 31, 30, 30, 30, 29,
        32, 29, 30, 32, 32, 29, 32, 31, 29, 32, 30, 30, 30, 31, 31, 29, 29, 29,
        31, 31, 31, 29, 31, 30, 30, 30, 31, 30, 32, 30, 29, 32, 29, 29, 30, 31,
        30, 32, 31, 29, 29, 32, 30, 32, 32, 29, 30, 29, 30, 30, 30, 32, 32, 29,
        31, 30, 30, 31, 30, 31, 30, 32, 29, 32, 32, 29, 30, 30, 30, 31, 32, 30,
        30, 32, 32, 29, 29, 29, 31, 32, 32, 32, 29, 30, 29, 29, 29, 31, 30, 30,
        31, 31, 31, 29, 32, 29, 29, 31, 30, 29, 29, 29, 31, 32, 32, 29, 29, 32,
        31, 29, 30, 30, 31, 30, 32, 29, 32, 29, 29, 29, 31, 30, 30, 31, 30, 30,
        30, 32, 30, 32, 29, 30, 32, 30, 

## Modeling

In [11]:
# Keyword arguments shared by `from_pretrained`: which checkpoint to load,
# permission to run the repository's custom model code, and bfloat16 weights.
load_kwargs = dict(
    pretrained_model_name_or_path=model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

In [12]:
! wget -t 0 https://huggingface.co/tattabio/gLM2_150M/resolve/main/modeling_glm2.py
! wget -t 0 https://huggingface.co/tattabio/gLM2_150M/resolve/main/configuration_glm2.py
! sed -i -e 's/.configuration_glm2/configuration_glm2/g' modeling_glm2.py

In [13]:
import torch
import torch.nn as nn
from transformers.modeling_outputs import (
    BaseModelOutput,
    SequenceClassifierOutput,
)

from typing import Optional, Union, Tuple
# The gLM2 source files are downloaded into the working directory and a
# notebook cell has no parent package, so they are imported as top-level
# modules; a relative `from .module import ...` raises ImportError here.
from configuration_glm2 import gLM2Config
from modeling_glm2 import gLM2Model, gLM2PreTrainedModel

from transformers import PretrainedConfig
from typing import List

class gLM2ClassicationConfig(gLM2Config):
    """gLM2 configuration extended with the size of a classification head.

    Args:
        num_classes: Number of classes the classification head predicts.
        **kwargs: Passed through unchanged to ``gLM2Config``.
    """

    def __init__(self, num_classes: int = 2, **kwargs):
        super().__init__(**kwargs)
        self.num_classes = num_classes
        # Register the classification model in the auto-class mapping.
        self.auto_map.update(
            {'AutoModelForSequenceClassification': "extension_glm2.gLM2ForSequenceClassification"}
        )

class gLM2ForSequenceClassification(gLM2PreTrainedModel):
    """gLM2 encoder topped with a linear sequence-classification head.

    The hidden state at position 0 serves as the sequence summary and is
    projected to ``config.num_classes`` logits by a bias-free linear layer.
    Inputs are therefore expected to start with the ``<+>`` token.
    """

    config_class = gLM2ClassicationConfig

    def __init__(self, config: gLM2ClassicationConfig):
        super().__init__(config)

        # Pretrained encoder backbone.
        self.glm2 = gLM2Model(config)

        # Classification head: hidden size -> number of classes, no bias.
        self.score = nn.Linear(config.dim, config.num_classes, bias=False)

        self.post_init()

    def get_input_embeddings(self):
        """Return the encoder's token-embedding module."""
        return self.glm2.tok_embeddings

    def set_input_embeddings(self, value):
        """Replace the encoder's token-embedding module with ``value``."""
        self.glm2.tok_embeddings = value

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        """Encode the batch and classify each sequence from its first token.

        Args:
            input_ids: Token ids; indexed below as (batch, position).
            attention_mask: Optional mask forwarded to the encoder.
            labels: Optional class indices. When given, a cross-entropy loss
                against the logits is computed.
            output_hidden_states: Forwarded to the encoder.
            return_dict: Return a ``SequenceClassifierOutput`` instead of a
                tuple. Defaults to ``config.use_return_dict``.
            **kwargs: Accepted and ignored, so extra tokenizer columns such as
                ``token_type_ids`` can be passed without error.

        Returns:
            A ``SequenceClassifierOutput`` with ``loss``, ``logits`` and
            ``hidden_states``; or, in tuple mode, ``(loss?, logits, *extras)``
            where ``extras`` are the encoder's non-None outputs that follow
            the last hidden state.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.glm2(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        token_embeddings = outputs[0]

        # use <+> as CLS token
        cls_token = token_embeddings[:, 0, :]

        logits = self.score(cls_token)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)

            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_classes), labels.view(-1))

        if not return_dict:
            # Index 0 is the last hidden state and the encoder has no pooled
            # output at index 1, so the optional extras start at index 1.
            # Slicing from 2 silently dropped the requested hidden states.
            # None placeholders are filtered so the tuple only carries values.
            extras = tuple(item for item in outputs[1:] if item is not None)
            output = (logits,) + extras
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )

In [14]:
# Load the pretrained encoder weights into the classification model; the
# `score` head has no counterpart in the checkpoint.
model = gLM2ForSequenceClassification.from_pretrained(**load_kwargs)

# TODO: figure out why the classifier weights are not properly initialized at construction
# Until that is resolved, draw the head weights by hand from N(0, 0.02^2).
nn.init.normal_(model.score.weight, mean=0.0, std=0.02)

Some weights of gLM2ForSequenceClassification were not initialized from the model checkpoint at tattabio/gLM2_150M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameter containing:
tensor([[ 0.0193, -0.0251,  0.0003,  ..., -0.0018, -0.0002,  0.0215],
        [-0.0145, -0.0098,  0.0063,  ...,  0.0053,  0.0131,  0.0057]],
       dtype=torch.bfloat16, requires_grad=True)

Number of parameters in the model:

In [15]:
# Total parameter count as reported by the model.
model.num_parameters()

152434560

Bytes in memory:

In [16]:
# Print the value returned by the model's `get_memory_footprint()`.
print(model.get_memory_footprint())

304873080


In [17]:
# Move the model to the selected device; the returned module is shown as the
# cell output, which doubles as an architecture summary.
model.to(device)

gLM2ForSequenceClassification(
  (glm2): gLM2Model(
    (tok_embeddings): Embedding(37, 640)
    (encoder): TransformerLayers(
      (layers): ModuleList(
        (0-29): 30 x TransformerBlock(
          (attention): Attention(
            (wqkv): Linear(in_features=640, out_features=1920, bias=False)
            (wo): Linear(in_features=640, out_features=640, bias=False)
            (rotary_emb): RotaryEmbedding()
          )
          (feed_forward): FeedForward(
            (w1): Linear(in_features=640, out_features=1792, bias=False)
            (w2): Linear(in_features=1792, out_features=640, bias=False)
            (w3): Linear(in_features=640, out_features=1792, bias=False)
          )
          (attention_norm): RMSNorm()
          (ffn_norm): RMSNorm()
        )
      )
    )
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (score): Linear(in_features=640, out_features=2, bias=False)
)

Sanity check forward pass

In [18]:
# Push the first three training examples through the model to confirm that
# the forward pass and the loss computation work end to end.
sanity_batch = tokenized_ds['train'][:3]
model(
    input_ids=sanity_batch['input_ids'],
    labels=sanity_batch['labels'],
    attention_mask=sanity_batch['attention_mask'],
)

SequenceClassifierOutput(loss=tensor(0.9414, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>), logits=tensor([[ -7.4688,  -9.6250],
        [-12.1875, -15.7500],
        [-16.5000, -13.8750]], device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<MmBackward0>), hidden_states=None, attentions=None)

Configure parameter-efficient fine-tuning

In [19]:
from peft import LoraConfig, get_peft_model

# LoRA adapters are attached to the `wqkv` projection of the attention blocks;
# the `score` head is listed in `modules_to_save` so it is trained and stored
# together with the adapter.
peft_config = LoraConfig(
    target_modules=['wqkv'],
    modules_to_save=['score'],
    r=16,
    lora_alpha=8,
    lora_dropout=0.5,
    bias="none",
    inference_mode=False,
)

# Wrap the base model and report how many parameters remain trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,230,080 || all params: 153,664,640 || trainable%: 0.8005


Sanity check forward pass of LoRA model

In [20]:
# Repeat the three-example forward pass, this time through the LoRA-wrapped model.
lora_sanity_batch = tokenized_ds['train'][:3]
model(
    input_ids=lora_sanity_batch['input_ids'],
    labels=lora_sanity_batch['labels'],
    attention_mask=lora_sanity_batch['attention_mask'],
)

SequenceClassifierOutput(loss=tensor(0.0615, device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>), logits=tensor([[ -7.2500,  -9.5000],
        [-12.1875, -15.9375]], device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<MmBackward0>), hidden_states=None, attentions=None)

Define evaluation metrics

In [21]:
import numpy as np
import evaluate

# F1 metric object, loaded once and reused for every evaluation call.
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predictions hold one
            score per class along axis 1.

    Returns:
        The result of ``f1.compute`` on the arg-max class ids and the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return f1.compute(predictions=predicted_ids, references=references)

Train

In [22]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

training_args = TrainingArguments(
    output_dir="tata_promoter",
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=1e-3,
    weight_decay=0.1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=256,
    bf16=True,
    # Step-based logging, evaluation and checkpointing.
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    # Restore the checkpoint with the best validation F1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    # Hand the dataset columns to the model as they are; "labels" is the target.
    remove_unused_columns=False,
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,F1
100,2.0541,0.48745,0.783479
200,0.7138,0.55708,0.772152
300,0.4886,0.369943,0.839806
400,0.362,0.280925,0.891892
500,0.2265,0.366166,0.892121
600,0.1688,0.221445,0.942857
700,0.1812,0.418254,0.891041
800,0.1412,0.112928,0.961644
900,0.0953,0.159377,0.959049
1000,0.0787,0.219075,0.954008


TrainOutput(global_step=1350, training_loss=0.342308561846062, metrics={'train_runtime': 361.5204, 'train_samples_per_second': 118.997, 'train_steps_per_second': 3.734, 'total_flos': 1.19369991831552e+16, 'train_loss': 0.342308561846062, 'epoch': 10.0})

Evaluate on test set

In [23]:
# Score the held-out test split with the same trainer and `compute_metrics`.
trainer.evaluate(tokenized_ds['test'])

{'eval_loss': 0.053944606333971024,
 'eval_f1': 0.9811320754716981,
 'eval_runtime': 0.5779,
 'eval_samples_per_second': 366.814,
 'eval_steps_per_second': 1.73,
 'epoch': 10.0}