In [7]:
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, RandomSampler , Dataset
from tqdm import tqdm
import pandas as pd
from torchmetrics import Accuracy
from transformers import BertConfig, BertModelWithHeads
import pandas as pd
model_name = "Rostlab/prot_bert"
adapter_name = "sequence_localization"
max_length=1024
batch_size = 2
loader_workers=2

In [17]:

label2id= {
    "Cell.membrane": 0,
    "Cytoplasm-Nucleus": 1,
    "Cytoplasm": 2,
    "Endoplasmic.reticulum": 3,
    "Golgi.apparatus": 4,
    "Lysosome/Vacuole": 5,
    "Mitochondrion": 6,
    "Nucleus": 7,
    "Peroxisome": 8,
    "Plastid": 9,
    "Extracellular": 10
  }
id2label={
    "0": "Cell.membrane",
    "1": "Cytoplasm-Nucleus",
    "2": "Cytoplasm",
    "3": "Endoplasmic.reticulum",
    "4": "Golgi.apparatus",
    "5": "Lysosome/Vacuole",
    "6": "Mitochondrion",
    "7": "Nucleus",
    "8": "Peroxisome",
    "9": "Plastid",
    "10": "Extracellular"
  }

vocab_size = len(id2label.keys())

In [29]:

class ProteinSequenceDataset(Dataset):
    def __init__(self, sequence, targets, tokenizer, max_len):
        self.sequence = sequence
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sequence)

    def __getitem__(self, item):
        sequence = str(self.sequence[item])
        target = self.targets[item]
        
        target_number = label2id[target]
        encoding = self.tokenizer.encode_plus(
            sequence,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
          'protein_sequence': sequence,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': torch.tensor(target_number, dtype=torch.long)
        }

In [30]:
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)

loading file vocab.txt from cache at /Users/ahora/.cache/huggingface/hub/models--Rostlab--prot_bert/snapshots/3d05bf06e79014892defacad82e0efd06e977ff6/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /Users/ahora/.cache/huggingface/hub/models--Rostlab--prot_bert/snapshots/3d05bf06e79014892defacad82e0efd06e977ff6/special_tokens_map.json
loading file tokenizer_config.json from cache at /Users/ahora/.cache/huggingface/hub/models--Rostlab--prot_bert/snapshots/3d05bf06e79014892defacad82e0efd06e977ff6/tokenizer_config.json
loading configuration file config.json from cache at /Users/ahora/.cache/huggingface/hub/models--Rostlab--prot_bert/snapshots/3d05bf06e79014892defacad82e0efd06e977ff6/config.json
Model config BertConfig {
  "_name_or_path": "Rostlab/prot_bert",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,

In [31]:
df_train = pd.read_csv('./data/deeploc_per_protein_train.csv',names=['input','loc','membrane'],skiprows=1)
df_test = pd.read_csv('./data/deeploc_per_protein_test.csv',names=['input','loc','membrane'],skiprows=1)

seq_train = list(df_train['input'])
label_train = list(df_train['loc'])

seq_test = list(df_test['input'])
label_test = list(df_test['loc'])

train_dataset = ProteinSequenceDataset(seq_train , label_train ,tokenizer , max_length )
test_dataset = ProteinSequenceDataset(seq_test , label_test ,tokenizer , max_length )

In [32]:

train_loader =  DataLoader(
            dataset=train_dataset,
            batch_size=batch_size , 
            num_workers=loader_workers
        )

test_loader =  DataLoader(
            dataset=train_dataset,
            batch_size=batch_size , 
            num_workers=loader_workers
        )

In [33]:

config = BertConfig.from_pretrained(
    model_name , 
    id2label=id2label,
)
model = BertModelWithHeads.from_pretrained(
  model_name , 
    config=config,
)

model.add_adapter(adapter_name , config="houlsby")
model.add_classification_head(adapter_name , num_labels=vocab_size)
model.set_active_adapters(adapter_name)
model.train_adapter_fusion(adapter_name)

loading configuration file config.json from cache at /Users/ahora/.cache/huggingface/hub/models--Rostlab--prot_bert/snapshots/3d05bf06e79014892defacad82e0efd06e977ff6/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "id2label": {
    "0": "Cell.membrane",
    "1": "Cytoplasm-Nucleus",
    "10": "Extracellular",
    "2": "Cytoplasm",
    "3": "Endoplasmic.reticulum",
    "4": "Golgi.apparatus",
    "5": "Lysosome/Vacuole",
    "6": "Mitochondrion",
    "7": "Nucleus",
    "8": "Peroxisome",
    "9": "Plastid"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 40000,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 30,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1

In [34]:
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=100,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    do_eval=True , 
    gradient_accumulation_steps=4 , 
    seed=47 , 
    fp16=True , 
    report_to="tensorboard" , 
    logging_dir="./logdir" , 
    warmup_ratio=0.2
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [35]:
trainer.train()

***** Running training *****


  Num examples = 6622
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 16555
  Number of trainable parameters = 1060875


  0%|          | 0/16555 [00:00<?, ?it/s]