In [None]:
#Import necessary libraries
import os
import torch
import evaluate
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import StratifiedShuffleSplit
from transformers import DataCollatorWithPadding,DataCollatorForSeq2Seq
from transformers import AutoTokenizer, GPT2LMHeadModel,TrainingArguments, Trainer,GPT2Config,EarlyStoppingCallback

In [None]:
# Expose only GPU 0 to this process; set the value to "-1" instead to run on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
# Read the training table from disk; the 'Seq' and 'Label' columns are used below.
data = pd.read_csv('training.csv')

In [3]:
# Display the loaded DataFrame (the notebook renders a truncated preview).
data

Unnamed: 0,Seq,Label
0,SRPVAVETALLYRTITTGEQGRGRSSVYSCPQDPLGAIYSRDALSK...,1
1,FKAGAERKEAAESTLVAYKSASDIATAELAPTHPIRLGLALNFSVF...,1
2,DEAGDDIKEAPKEVQKVDEQAQPPPSQ,1
3,DMQDDVADDIKEAAPAAAKPADEQQS,1
4,DSGNGGWDNWDNDDSFRSTDMRRNQSAGDFRSSGGRGAPAKSKSSE...,1
...,...,...
253119,NSKNTVEMQSILHNTVLLFMICFEVYMLSVVWRAFVYICDFNMQRQ...,0
253120,DFNMQRQIEKIVQKKSMVKRSFDIEYDLVRNEIIRAEVKANEELV,0
253121,MNLLPYQFVGEVVRGFGRGGKELGCPTANMD,0
253122,ELGCPTANMDGTVVNGLPEGLPVGVYFGTAKLDGKSYKMAMSIGWN...,0


In [4]:
# Check the class balance: number of rows per label (1 = POSITIVE, 0 = NEGATIVE).
data['Label'].value_counts()

0    138488
1    114636
Name: Label, dtype: int64

In [5]:
# Strip every '-' character and every newline from the sequences, in one pass
# over the column (two chained literal replacements).
data['Seq'] = data['Seq'].str.replace('-', '').str.replace('\n', '')

In [6]:
# Load the ProtGPT2 tokenizer and register the BOS / EOS / PAD markers that the
# training prompts rely on.
tokenizer = AutoTokenizer.from_pretrained(
    'nferruz/ProtGPT2',
    bos_token='<startoftext>',
    eos_token='<endoftext>',
    pad_token='<PAD>',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Add the two class words as custom tokens; every training prompt ends with one
# of them (presumably so each label is tokenized as a single unit - verify).
# The call is left as the last expression so the cell shows how many tokens were added.
tokenizer.add_tokens(['POSITIVE','NEGATIVE'])

2

In [8]:
# Inspect which special tokens (bos / eos / unk / pad) the tokenizer now uses.
tokenizer.special_tokens_map

{'bos_token': '<startoftext>',
 'eos_token': '<endoftext>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<PAD>'}

In [10]:
#Map positive/negative labels and prepare prompt for training
class SequenceClassificationDataset(Dataset):
    """Turn (sequence, label) pairs into tokenized causal-LM training prompts.

    Each item is built from the text
    ``<startoftext>{sequence}`` + newline + ``{POSITIVE|NEGATIVE}<endoftext>``
    and tokenized with the supplied tokenizer. The token ids are returned both
    as ``input_ids`` and as ``labels``.

    Args:
        sequences: Indexable collection of sequence strings (``sequences[idx]``).
        labels: Indexable collection of integer labels; 1 maps to ``POSITIVE``
            and 0 maps to ``NEGATIVE``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt')``
            that returns a mapping with ``input_ids`` and ``attention_mask``.
        dtype: Free-form tag naming the split (default ``'Train'``); it is only
            stored on the instance.
    """

    def __init__(self, sequences, labels, tokenizer, dtype='Train'):
        self.sequences = sequences
        self.labels = labels
        self.tokenizer = tokenizer
        self.map_label = {1: 'POSITIVE', 0: 'NEGATIVE'}
        # Keep the tag the caller passed in (it used to be hard-coded to 'Train').
        self.dtype = dtype

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the tokenized prompt for item ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels`` (a copy of ``input_ids``).

        Raises:
            KeyError: if the label at ``idx`` is neither 0 nor 1.
        """
        sequence = self.sequences[idx]
        label = self.labels[idx]
        prep_txt1 = f'<startoftext>{sequence}\n{self.map_label[label]}<endoftext>'
        encoding1 = self.tokenizer(prep_txt1, return_tensors='pt')
        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse a one-token encoding into a 0-d tensor.
        input_ids = encoding1['input_ids'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': encoding1['attention_mask'].squeeze(0),
            # Independent copy so in-place edits to one tensor cannot leak into the other.
            'labels': input_ids.clone(),
        }

In [12]:
# Re-index both columns from 0 so the integer lookups done by the dataset
# (sequences[idx] / labels[idx]) line up with row positions.
train_texts, train_labels = (
    data[column].reset_index(drop=True) for column in ('Seq', 'Label')
)

In [13]:
# Wrap the sequences and labels in the prompt-building dataset used for training.
train_dataset = SequenceClassificationDataset(
    sequences=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    dtype='Train',
)

In [15]:
# Load the configuration (not the weights) of the pre-trained ProtGPT2 checkpoint;
# the model itself is instantiated from this config in a later cell.
model_config = GPT2Config.from_pretrained('nferruz/ProtGPT2')

In [16]:
# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    output_dir='/media/8TB_hardisk/results-Prompt4/',  # output directory
    num_train_epochs=200,  # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='logs/',  # directory for training logs
    save_steps=500,  # checkpoint interval, in steps
    logging_steps=500,  # logging interval, in steps (the comma was missing here -> SyntaxError)
    save_total_limit=10,  # no. of models to save in the output directory
)

In [17]:
# Instantiate ProtGPT2 with the config loaded earlier, then resize its embedding
# matrix to the tokenizer's current vocabulary size (which includes the added tokens).
model = GPT2LMHeadModel.from_pretrained(
    'nferruz/ProtGPT2',
    config=model_config,
    ignore_mismatched_sizes=True,
)
model.resize_token_embeddings(len(tokenizer))

Embedding(50262, 1280)

In [18]:
# Build the Trainer. Only a training dataset is supplied; no eval dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # Items have different lengths and each carries its own 'labels' tensor, so the
    # collator pads every batch to its longest item (padding='longest').
    # NOTE(review): labels are presumably padded with the collator's default
    # label pad id - confirm against the transformers docs.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer,padding='longest')
)

In [None]:
# Start fine-tuning with the settings defined in training_args.
trainer.train()