In [1]:
import peft
import requests
from io import BytesIO
import pandas as pd
import torch

import transformers
import evaluate 
import datasets
import requests
import pandas
import sklearn
from datasets import Dataset
from transformers import TrainingArguments, Trainer


The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
2024-04-19 16:10:20.364962: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-19 16:10:21.611944: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-04-19 16:10:21.611975: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-04-19 16:10:24.610872: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loa

In [2]:
transformers.__version__

'4.39.0'

In [3]:
model_checkpoint = 'facebook/esm2_t6_8M_UR50D' # This is the smallest of the ESM2 models: 6 layers, 8M params. 
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Let's download some data for a protein binary classification problem. In this case, we will attempt to predict whether a protein lives iinside a cell or on its membrane. 

In [4]:
query_url ="https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Csequence%2Ccc_subcellular_location&format=tsv&query=%28%28organism_id%3A9606%29%20AND%20%28reviewed%3Atrue%29%20AND%20%28length%3A%5B80%20TO%20500%5D%29%29"
uniprot_request = requests.get(query_url)
bio = BytesIO(uniprot_request.content)
df = pandas.read_csv(bio, compression='gzip', sep='\t')
df['seq_len'] = list(map(len, df.Sequence))
df = df.dropna()
df.sort_values('seq_len', ascending = False)
df['ind'] = list(df.index)
cytosolic = df['Subcellular location [CC]'].str.contains("Cytosol") | df['Subcellular location [CC]'].str.contains("Cytoplasm")
membrane = df['Subcellular location [CC]'].str.contains("Membrane") | df['Subcellular location [CC]'].str.contains("Cell membrane")
cytosolic_df = df[cytosolic & ~membrane]
cytosolic_df['label'] = 0
membrane_df = df[membrane & ~cytosolic]
membrane_df['label'] = 1
df = pd.concat([cytosolic_df, membrane_df]).sort_values('ind').sample(frac = 1)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cytosolic_df['label'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  membrane_df['label'] = 1


Unnamed: 0,Entry,Sequence,Subcellular location [CC],seq_len,ind,label
3619,Q14714,MGKNKQPRGQQRQGGPPAADAAGPDDMEPKKGTGAPKECGEEEPRT...,SUBCELLULAR LOCATION: Cell membrane {ECO:00002...,243,3619,1
907,O95971,MLLEPGRGCCALAILLAIVDIQSGGCINITSSASQEGTRLNLICTV...,SUBCELLULAR LOCATION: [CD160 antigen]: Cell me...,181,907,1
5146,Q8TDB4,MYLRRAVSKTLALPLRAPPNPAPLGKDASLRRMSSNRFPGSSGSNM...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00002...,240,5146,0
11492,Q8ND94,MLGSPCLLWLLAVTFLVPRAQPLAPQDFEEEEADETETAWPPLPAV...,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,238,11492,1
1594,P14550,MAASCVLLHTGQKMPLIGLGTWKSEPGQVKAAVKYALSVGYRHIDC...,"SUBCELLULAR LOCATION: Cytoplasm, cytosol {ECO:...",325,1594,0


In [5]:
len(df)

5149

Let's try passing a sequence through the pre-trained model

# PEFT using GaLoRE

In [6]:
df = df[['Sequence','label']]

In [7]:
df.groupby('label').size()

label
0    2599
1    2550
dtype: int64

In [8]:
import torch
import datasets
from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM
# import trl

target_modules = []
for layer in range(6):
    for elem in ['query','key','value']:
        target_modules.append("esm.encoder.layer."+str(layer)+".attention.self." + elem)
model_name = model_checkpoint.split('/')[-1]
batch_size = 8
args = TrainingArguments(
    f"{model_name}-lora-finetuned-localization",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    optim="galore_adamw",
    optim_target_modules=target_modules
)




In [9]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = len(set(df.label)))

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Let us create a training and test dataset from df, and also let us tokenize it

In [10]:
len(df), df.columns

(5149, Index(['Sequence', 'label'], dtype='object'))

In [11]:
sequences = list(df.Sequence)
labels = list(df.label)

# Quick check to make sure we got it right
len(sequences) == len(labels)

True

In [12]:
from sklearn.model_selection import train_test_split

train_sequences, test_sequences, train_labels, test_labels = train_test_split(sequences, labels, test_size=0.25, shuffle=True)

In [13]:
type(train_sequences), type(test_sequences), type(train_labels), type(test_labels)

(list, list, list, list)

In [14]:
len(train_sequences), len(test_sequences), len(train_labels), len(test_labels)

(3861, 1288, 3861, 1288)

In [15]:
train_tokenized = tokenizer(train_sequences)
test_tokenized = tokenizer(test_sequences)

In [16]:
z = tokenizer(train_sequences[0])
type(z), len(z)

(transformers.tokenization_utils_base.BatchEncoding, 2)

In [17]:
len(z['input_ids'])

340

In [18]:
train_dataset = Dataset.from_dict(train_tokenized)
test_dataset = Dataset.from_dict(test_tokenized)

In [19]:
train_dataset = train_dataset.add_column("labels", train_labels)
test_dataset = test_dataset.add_column("labels", test_labels)
train_dataset, test_dataset

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 3861
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 1288
 }))

In [20]:
model_name = model_checkpoint.split("/")[1]
model_name

'esm2_t6_8M_UR50D'

In [21]:
from evaluate import load
import numpy as np

metric = load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [22]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.train()

Activated GaLoRE fine-tuning, depending on your model size and hardware, the training might take a while before starting. Please be patient !


Epoch,Training Loss,Validation Loss
