In [88]:
## in this we will fine-tune sentiment analysis on sentiment_analysis using lora
## We will load model and model config from hub 

In [1]:
import datasets
import random
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments,Trainer

from peft import LoraConfig



  from .autonotebook import tqdm as notebook_tqdm
2024-05-09 01:10:35.901423: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-09 01:10:36.174312: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-09 01:10:36.174362: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-09 01:10:36.219988: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-09 01:10:36.322868: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-09 01:10:36.324948:

### first we will train the model on pretrained model; and then only on lora adapters; we will observe the time difference; accuracy; model_parameters

In [2]:
seed=101


In [3]:
###
peft_model_name = 'roberta-base-peft'
modified_base = 'roberta-base-modified'
base_model = 'roberta-base'

In [4]:
tokenizer=AutoTokenizer.from_pretrained(pretrained_model_name_or_path=base_model)

In [5]:
dataset=datasets.load_dataset('ag_news')

In [6]:
import numpy as np
num_labels=np.unique(dataset['train']['label']).shape[0]

In [7]:
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}

In [8]:
def tokenize_example(example_dataset):
    text=example_dataset['text']
    return tokenizer(text,padding=True,truncation=True)


In [9]:
tokenize_dataset=dataset.map(tokenize_example,
                             batched=True,
                            remove_columns=['text'])

In [10]:
num_labels = dataset['train'].features['label'].num_classes
classnames=tokenize_dataset['train'].features['label'].names
print(f"number of labels: {num_labels}")
print(f"the labels: {classnames}")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [11]:
id2label={i:label for i,label in enumerate(classnames)}
print(f'id2label: {id2label}')

id2label: {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}


In [16]:
from transformers import DataCollatorWithPadding

data_collator=DataCollatorWithPadding(tokenizer=tokenizer,padding=True,return_tensors='pt') ## we are again padding even though there is already padding in above tokenizer.map(batched=True); This is because
# in map padding is done in a fixed batched size; however; during training using Trainer().train() different batch_size would have been present so we wanna make sure it is agiain padded when we are generating batches
##during training

In [17]:
tokenize_dataset['train'].features

{'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [18]:
#tokenize_dataset.rename_column('label','labels') ## Hugging model take labels instead

In [19]:
train_dataset=tokenize_dataset['train'].shuffle(seed=seed).select(range(1000))
eval_dataset=tokenize_dataset['test'].shuffle(seed=seed).select(range(1000))

In [20]:
## 

training_args=TrainingArguments(output_dir='../saved_weight/roberta_base_seq_cls/')

In [21]:
## defining the compute_metrics
import evaluate

metrics=evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits,label=eval_pred
    pred_label=np.argmax(logits,axis=-1)
    return metrics.compute(predictions=pred_label,references=label)


In [22]:
## defining rhe model
model=AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=base_model,id2label=id2label)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [24]:
total_trainable_parameters=sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'total trainable parameters: {total_trainable_parameters}')


total trainable parameters: 124648708


In [25]:
trainer=Trainer(model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                data_collator=data_collator,
                compute_metrics=compute_metrics)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [26]:
trainer.train() ## training on small dataset

Step,Training Loss


TrainOutput(global_step=375, training_loss=0.3586973876953125, metrics={'train_runtime': 99.8682, 'train_samples_per_second': 30.04, 'train_steps_per_second': 3.755, 'total_flos': 472773833807808.0, 'train_loss': 0.3586973876953125, 'epoch': 3.0})

In [27]:
trainer.evaluate()

{'eval_loss': 0.4470037817955017,
 'eval_accuracy': 0.903,
 'eval_runtime': 8.6739,
 'eval_samples_per_second': 115.288,
 'eval_steps_per_second': 14.411,
 'epoch': 3.0}

#### Now we will utilize LoRA adapter to train the adapter only and re train and evaluate again 



In [29]:
from peft import get_peft_model
from peft import LoraConfig

In [43]:
model=AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=base_model,id2label=id2label)
lora_config=LoraConfig(peft_type='SEQ_CLS',inference_mode=False,r=8,lora_alpha=16,lora_dropout=0.1)
peft_model=get_peft_model(model=model,peft_config=lora_config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
peft_model

PeftModel(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                    

In [45]:
peft_model.print_trainable_parameters() ## here only 23% datas are trainable

trainable params: 294,912 || all params: 124,943,620 || trainable%: 0.23603606170527155


In [46]:
args_lora=TrainingArguments(output_dir='../saved_weight/roberta_base_seq_cls_lora/')

In [47]:
lora_trainer=Trainer(model=peft_model,
                     args=args_lora,
                     train_dataset=train_dataset,
                     eval_dataset=eval_dataset,
                     compute_metrics=compute_metrics,
                     data_collator=data_collator)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [48]:
lora_trainer.train()

Step,Training Loss


TrainOutput(global_step=375, training_loss=1.39217041015625, metrics={'train_runtime': 89.8275, 'train_samples_per_second': 33.397, 'train_steps_per_second': 4.175, 'total_flos': 474401705580480.0, 'train_loss': 1.39217041015625, 'epoch': 3.0})

In [42]:
lora_trainer.evaluate()

{'eval_runtime': 9.1545,
 'eval_samples_per_second': 109.236,
 'eval_steps_per_second': 13.654,
 'epoch': 3.0}

In [55]:
###saving the model and peft
tokenizer_saved_dire='../saved_weight/'+modified_base
peft_model_saved_dire='../saved_weight/roberta_base_lora'

In [52]:

tokenizer.save_pretrained(tokenizer_saved_dire)

('../saved_weight/roberta-base-modified/tokenizer_config.json',
 '../saved_weight/roberta-base-modified/special_tokens_map.json',
 '../saved_weight/roberta-base-modified/vocab.json',
 '../saved_weight/roberta-base-modified/merges.txt',
 '../saved_weight/roberta-base-modified/added_tokens.json',
 '../saved_weight/roberta-base-modified/tokenizer.json')

In [53]:
peft_model.save_pretrained(peft_model_saved_dire)

In [56]:
## loading the saved model
inf_model=AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=peft_model_saved_dire,id2label=id2label)
tokenizer=AutoTokenizer.from_pretrained(tokenizer_saved_dire)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
### test the model

def classify(text):
    inputs=tokenizer(text,padding=True,truncation=True,return_tensors='pt')
    output=inf_model(**inputs)
    prediction=output.logits.argmax(dim=-1).item()
    print(f'text: {text} | Class: {prediction}, label={id2label[prediction]}')

In [60]:
classify( "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")

text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ... | Class: 1, label=Sports


In [65]:
classify( "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.") ## only trained with 1000 sample so 

text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlinand of ultra-cynics, are seeing green again. | Class: 1, label=Sports
