In [1]:
! pip install transformers evaluate datasets requests pandas scikit-learn


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import h5py
import numpy as np
import pandas as pd
import os
import torch

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


In [3]:
print("GPU name:", torch.cuda.get_device_name(0))


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [5]:
# select esm2 model
esm2 = "facebook/esm2_t33_650M_UR50D"

# set tokeniser
tokeniser = AutoTokenizer.from_pretrained(esm2)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]



special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# select files
from google.colab import drive
drive.mount('/content/drive')


KeyboardInterrupt: 

In [7]:
# data setup
df_train = pd.read_csv("train_trimmed.csv")

print(f'Train DF length: {len(df_train)}')
df_train.head()


FileNotFoundError: [Errno 2] No such file or directory: 'train_trimmed.csv'

In [9]:
# testing for longest sequence in train.csv:
# Compute string lengths
df_train["sequence_lengths"] = df_train["sequence"].astype(str).str.len()

# sort in new df by seq length
df_sorted = df_train.sort_values("sequence_lengths", ascending=False)

df_sorted["sequence_lengths"].head(25)


5598     34350
11943    18562
2346     18141
10752    13100
8148      8886
4725      8545
7632      8081
9759      7570
193       7388
5742      7158
4135      6548
4883      6298
11577     5634
12188     5596
10881     5495
192       5430
9760      5379
4678      5327
12447     5207
13080     5147
3047      5146
4645      5100
7004      5088
642       5038
628       5037
Name: sequence_lengths, dtype: int64

In [8]:
# keep just the Id and sequence in the data frame
df_seq_id = df_train[["Id", "sequence", "partition"]]
df_seq_id.head()


Unnamed: 0,Id,sequence,partition
0,0,MMRFMLLFSRQGKLRLQKWYLATSDKERKKMVRELMQVVLARKPKM...,0
1,1,MSATYTNTITQRRKTAKVRQQQQHQWTGSDLSGESNERLHFRSRST...,0
2,2,MPRGDSEQVRYCARFSYLWLKFSLIIYSTVFWLIGALVLSVGIYAE...,3
3,3,MGRSLTCPFGISPACGAQASWSIFGVGTAEVPGTHSHSNQAAAMPH...,0
4,4,MDGQKKNWKDKVVDLLYWRDIKKTGVVFGASLFLLLSLTVFSIVSV...,1


In [35]:
# check partition values
df_seq_id.iloc[:,2].unique()


array([0, 3, 1, 2])

In [None]:
# truncate function to 512aa either end
def truncate_prot(seq, chunk_len = 256):
  seq = str(seq)
  if len(seq) <= chunk_len*2:

    # short seq, return whole
    return seq

  else:
    # take first 2048 and last 2048 amino acids of sequence
    return seq[:chunk_len] + seq[-chunk_len:]


In [23]:
# sanity check that this is sequence column
df_seq_id.iloc[:, 1]


0        MMRFMLLFSRQGKLRLQKWYLATSDKERKKMVRELMQVVLARKPKM...
1        MSATYTNTITQRRKTAKVRQQQQHQWTGSDLSGESNERLHFRSRST...
2        MPRGDSEQVRYCARFSYLWLKFSLIIYSTVFWLIGALVLSVGIYAE...
3        MGRSLTCPFGISPACGAQASWSIFGVGTAEVPGTHSHSNQAAAMPH...
4        MDGQKKNWKDKVVDLLYWRDIKKTGVVFGASLFLLLSLTVFSIVSV...
                               ...                        
13393    MESLPARLFPGLSIKIQRSNGLIHSANISTVNVEKSCVSVEWIEGG...
13394    MESLRGYTHSDIGYRSLAVGEDIEEVNDEKLTVTSLMARGGEDEEN...
13395    MESLVDGDGFPDLEEDEDIDQFNDDTFGAGAVDDDWREEHERLAEM...
13396    MESNFNQEGVPRPSYVFSADPIARPSEINFDGIKLDLSHEFSLVAP...
13397    MESKALLVLTLAVWLQSLTASRGGVAAADQRRDFIDIESKFALRTP...
Name: sequence, Length: 13398, dtype: str

## Fine tuning ESM2


In [None]:
df_seq_id

# need to space separate the sequences for tokeniser per sequence
def space_aa(seq):
    return " ".join(list(seq))

# apply space join and add to list, so end with list of space separated and truncated seqs
seqs = (
    df_seq_id.iloc[:, 1]
    .dropna()
    .astype(str)

    # apply truncate sequence function
    .apply(truncate_prot)

    # apply space function to space out seqs
    .apply(space_aa)
    .tolist()
) 

# sanity check
seqs[23]


'M E S P F S P V L P H G P G E D W E S T L F A E L G Y F T D T D D V Q F D A A H E T Y E N N F D H L N F D L D L M P W E S D I W S P S S H F C S D I K A E P Q P L S P A S S S C S V S S P R S T D S C S S T Q H V P E E L D L L S S S Q S P L S L Y G D S C H S P S S A E P L K E E K P V T G P G N K T E H G L T P K K K I Q M S S K P S V Q P K P L L L P A A P K T P A N A S V P A K T I I I Q T L P A L M P L A K Q Q S I I S I Q P A P T K G Q T V L L S Q P A V V Q L Q T P G V L P S A Q P V L A V T G G A T Q L P N H V V N V V P A P V V N S P V N G K L C V T K P V L Q S S T R S T G S D I A V L R R Q Q R M I K N R E S A C Q S R K K K K E Y M L G L E A R L K A A L S E N E Q L K K E N G S L K R Q L D Q V V S E N Q R L K V P S P K R R A V C V M I V L A F I M L N Y G P M S M L E Q D S R R V K P S V S P A N Q R R H L L E F S A K E V K D T S D G D N Q K N S Y R Y D H S V S N D K A L M V L S E E P L L Y I P P P P C Q P L I N T T E S L R L N H E L R G W V H R H E V E R T K S R R M T N S Q Q K T R I L Q G A

In [17]:
# tokenise the seqs
tokenised_seqs = tokeniser(seqs,
                           padding=True,
                           return_tensors="pt")


In [18]:
tokenised_seqs.keys()


KeysView({'input_ids': tensor([[ 0, 20, 20,  ...,  1,  1,  1],
        [ 0, 20,  8,  ...,  1,  1,  1],
        [ 0, 20, 14,  ...,  1,  1,  1],
        ...,
        [ 0, 20,  9,  ...,  1,  1,  1],
        [ 0, 20,  9,  ...,  5, 15,  2],
        [ 0, 20,  9,  ...,  1,  1,  1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])})

In [19]:
# checking shape
tokenised_seqs["input_ids"].shape


torch.Size([13398, 1026])

In [28]:
# now we want to turn this data into a dataset that PyTorch can load samples from
# can use huggingface Dataset class for this
from datasets import Dataset
dataset = Dataset.from_dict(tokenised_seqs)

print(dataset)


Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 13398
})


In [29]:
# now need to add the labels as a single column length 6 vector to dataset:

# set targte cols as before
target_cols = ['cytoplasm', 'nucleus', 'extracellular', 'cell_surface', 'mitochondrion', 'endom']

# convert 6 one-hot columns (labels in data) to a single labels column:
labels = df_train[target_cols].values.tolist()

# and partitions
partitions = df_train["partition"].tolist()

# now all labels are one column but as length 6 vector
# add these labels to the hugging face Dataset 

dataset = dataset.add_column("labels", labels)
dataset = dataset.add_column("partition", partitions)


# sanity check
dataset.column_names


['input_ids', 'attention_mask', 'labels', 'partition']

In [30]:
dataset


Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'partition'],
    num_rows: 13398
})

### Model loading
Now we want to load the model and ensure the same one is loaded at the tokeniser (set in `esm2`)

Also need to set the size of our classifier head (fine tuning aspect of this is our head which is like the trainable part for our task)

`model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=6)`

The job of this is to take the encoder's embeddings and map them to our labels  
Fine tuning updates the head weights, and so the model learns our specific multi-label task.

```
What happens during fine-tuning

Input sequence → encoder → hidden embeddings

Embeddings → classifier head → logits (6-length vector)

Compute loss against your labels (BCEWithLogitsLoss)

Backprop → update head weights (and optionally encoder weights)
```

In [31]:
# num_labels tells AutoModelForSequenceClassification how big the classifier head is
num_labels = len(labels[0])

# load esm2 model
model = AutoModelForSequenceClassification.from_pretrained(esm2, num_labels=num_labels)

# do config for 6 logits per sequence output
config = AutoConfig.from_pretrained(esm2)
config.num_labels = num_labels
config.problem_type = "multi_label_classification"


model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

```Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
````

This essentially means that the main esm2 encoder has loaded but the classifier head (aka my new part) does not exist in the checkpoint as it is not included in the pre-trained model, but all fine.


Now we want to initialise our `TrainingArguments`. These control various training hyperparameters and will be passed to our `Trainer`.

In [None]:
# for saving checkpoints to a folder defined in args
model_name = esm2.split("/")[-1]

# adjust for GPU
batch_size = 16

args = TrainingArguments(
    f"{model_name}-finetuned-localization",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    push_to_hub=False,
)


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=1.1.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=1.1.0'`

Next define metric we will use to evaluate our models and write a `compute_metrics` function which we can load from `evaluate` library

In [None]:
#!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
from evaluate import load
import numpy as np

metric = load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


In [None]:
print(dataset)


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 13398
})


In [38]:
# now i need to split the dataset 80/20 training/validation(test)
from datasets import DatasetDict

# set partition 3 to the validation set, and 0-2 as the training set
# and then remove partition column
train_dataset = dataset.filter(lambda x: x['partition'] in [0,1,2]).remove_columns("partition")
val_dataset   = dataset.filter(lambda x: x['partition'] == 3).remove_columns("partition")


Filter:   0%|          | 0/13398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13398 [00:00<?, ? examples/s]

In [None]:
train_dataset


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10162
})

In [40]:
# check lengths of train and validation
train_dataset_size = len(train_dataset)
val_dataset_size = len(val_dataset)


print(f"Training length: {train_dataset_size}, Validation length: {val_dataset_size}")


Training length: 10162, Validation length: 3236


And finally we can initialise our `Trainer`  

This essentially does the training for us, including handling forward passes, computing loss, and backpropogation.  It knows how to train it from our `args` and for other stuff. Like a training manager.

So it loops over epochs and batches, feeds inputs into the model, calculates loss and updates weights, runs evaluations, saves model checkpoints automatically, and tracks metrics like f1 for best model selection.


In [43]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokeniser)

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)


NameError: name 'model' is not defined

In [None]:
# clear gpu cache
torch.cuda.empty_cache()

# and actually fine tuning/training our model:
trainer.train()


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/parallel/parallel_apply.py", line 99, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 918, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/esm/modeling_esm.py", line 913, in forward
    outputs = self.esm(
              ^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 1064, in wrapper
    outputs = func(self, *args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/esm/modeling_esm.py", line 748, in forward
    encoder_outputs = self.encoder(
                      ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/utils/generic.py", line 918, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/esm/modeling_esm.py", line 556, in forward
    hidden_states = layer_module(
                    ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_layers.py", line 94, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/esm/modeling_esm.py", line 526, in forward
    layer_output = self.feed_forward_chunk(attention_output)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/esm/modeling_esm.py", line 531, in feed_forward_chunk
    intermediate_output = self.intermediate(attention_output_ln)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/esm/modeling_esm.py", line 461, in forward
    hidden_states = gelu(hidden_states)
                    ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/esm/modeling_esm.py", line 60, in gelu
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
                      ~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 80.00 MiB. GPU 0 has a total capacity of 14.56 GiB of which 55.81 MiB is free. Including non-PyTorch memory, this process has 14.51 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 69.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [None]:
trainer.save_model("esm2_finetuned_localisation")
tokeniser.save_pretrained("esm2_finetuned_localisation")
