In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,DataCollatorWithPadding, TrainingArguments,Trainer
import torch
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate

In [2]:
# Hugging Face Hub id of the base checkpoint; the model and the tokenizer
# are both loaded from this name.
model_checkpoint = "distilbert-base-uncased"

In [3]:
# Class index <-> sentiment name lookups handed to the model configuration.
# label2id is derived from id2label so the two can never drift apart.
id2label = dict(enumerate(("Negative", "Positive")))
label2id = {label: index for index, label in id2label.items()}

# Load DistilBERT with a two-class sequence-classification head.
# The classifier / pre_classifier weights are not part of the checkpoint and
# start out newly initialized (see the warning printed below this cell).
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model

In [4]:
# Echo the model so the notebook renders its module tree.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [5]:
# Fetch the truncated IMDB dataset from the Hub; the bare name on the last
# line makes the notebook display its splits and columns.
dataset = load_dataset(path="shawhin/imdb-truncated")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [6]:
# Show the raw text of the first training example (row first, then column).
dataset["train"][0]["text"]

'. . . or type on a computer keyboard, they\'d probably give this eponymous film a rating of "10." After all, no elephants are shown being killed during the movie; it is not even implied that any are hurt. To the contrary, the master of ELEPHANT WALK, John Wiley (Peter Finch), complains that he cannot shoot any of the pachyderms--no matter how menacing--without a permit from the government (and his tone suggests such permits are not within the realm of probability). Furthermore, the elements conspire--in the form of an unusual drought and a human cholera epidemic--to leave the Wiley plantation house vulnerable to total destruction by the Elephant People (as the natives dub them) to close the story. If you happen to see the current release EARTH, you\'ll detect the Elephant People are faring less well today.'

In [7]:
# Fraction of training examples with label 1 (the mean of the 0/1 labels).
np.mean(dataset["train"]["label"])

0.5

In [8]:
# Create the DistilBERT tokenizer from the same checkpoint as the model.
# NOTE(review): add_prefix_space is normally a byte-level BPE option; confirm
# it has any effect for this checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

In [9]:
# Make sure the tokenizer has a padding token, because batches are padded
# later by the data collator.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    # The vocabulary just grew, so the model's embedding matrix has to grow
    # with it. The method is `resize_token_embeddings`; the earlier spelling
    # `resize_token_embedings` would raise AttributeError when this branch ran.
    model.resize_token_embeddings(len(tokenizer))

In [10]:
# Two sentences of different length, tokenized without padding, to show the
# raw tokenizer output (lists of unequal length).
sentences = [
    "Hello, how are you?",
    "I am fine, thank you!",
]
input_sentence = tokenizer(sentences)
input_sentence

{'input_ids': [[101, 7592, 1010, 2129, 2024, 2017, 1029, 102], [101, 1045, 2572, 2986, 1010, 4067, 2017, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [11]:
# Same sentences again, this time padded to a common length and returned as
# PyTorch tensors.
tokenized_inputs_padded = tokenizer(
    sentences,
    return_tensors="pt",
    truncation=True,
    padding=True,
)
tokenized_inputs_padded

{'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102,    0],
        [ 101, 1045, 2572, 2986, 1010, 4067, 2017,  999,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
def tokenize_function(examples):
    """Tokenize a batch of reviews, truncating over-long ones from the left.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw review
            strings of the batch (this function is applied through
            ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, requested as NumPy data and
        truncated to at most 512 tokens per review.
    """
    text = examples["text"]
    # The tokenizer attribute is `truncation_side`. The earlier spelling
    # `truncation_size` only created an unused attribute, so the requested
    # left-truncation was never applied.
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512,
    )
    return tokenized_inputs

# Tokenize every split batch-wise; the result gains `input_ids` and
# `attention_mask` columns next to the original `label` and `text`.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [13]:
# Data collator handed to the Trainer; it pads the examples of a batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
## Lora config

## LORA Config

Type of Task= Sequence Classification

r = the rank of the LoRA update: each adapted weight matrix gets a pair of small trainable matrices of rank r, so r controls how many parameters are trained

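lora_alpha, lora_dropout = the scaling factor applied to the LoRA update (this is not the learning rate, which is set later in the training arguments) and the dropout rate applied inside the LoRA layers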

target_modules = the layers to which LoRA is applied; in our case, the linear query projection (`q_lin`) of the attention blocks

Hence we set up a LoRA adapter for parameter-efficient fine-tuning of the base model on a sequence classification task. The adapter modifies only the `q_lin` modules.

In [15]:
# LoRA settings for the sequence-classification fine-tune.
peft_config = LoraConfig(
    task_type="SEQ_CLS",       # sequence classification
    r=4,                       # rank of the low-rank update matrices
    lora_alpha=32,             # scaling factor of the LoRA update
    lora_dropout=0.01,         # dropout inside the LoRA layers
    target_modules=["q_lin"],  # adapt only the attention query projections
)

In [16]:
# Wrap the base model with the LoRA adapter and print how many parameters remain trainable.
# Note: this rebinds `model`, so re-running the cell passes the already-wrapped model
# back into get_peft_model; re-create the base model first when re-running.
model = get_peft_model(model,peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


## Sample results on the pretrained model (before fine-tuning)


In [17]:
# Example reviews to classify before any fine-tuning has happened.
text_list = ["It was good.","Not a fan, don't recommend.","Better than the first one.","This is not worth watching even once.","This one is a pass."]

# Inference only: switch off dropout and follow whatever device the model is on.
model.eval()
device = next(model.parameters()).device

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)
    # No gradients are needed for prediction, so skip building the autograd graph.
    with torch.no_grad():
        logits = model(inputs).logits
    # Take the arg-max along the class axis; without `dim` the logits are
    # flattened first, which is only correct for a batch of exactly one.
    predictions = torch.argmax(logits, dim=-1)

    print(text + " - " + id2label[predictions.item()])

It was good. - Negative
Not a fan, don't recommend. - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative


## Evaluating a model

In [24]:
# Load the accuracy metric through the `evaluate` library; compute_metrics uses it.
accuracy = evaluate.load("accuracy")

In [25]:
def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds one row of
            per-class scores per example; ``labels`` holds the reference
            class ids.

    Returns:
        dict: The accuracy metric's result, i.e. ``{"accuracy": <value>}``.
        The result is returned as-is: wrapping it in a second ``"accuracy"``
        key produced the nested ``{'accuracy': {'accuracy': ...}}`` entries
        seen in the training log instead of a plain number.
    """
    predictions, labels = p
    # Highest-scoring class per example.
    predictions = np.argmax(predictions, axis=1)

    return accuracy.compute(predictions=predictions, references=labels)

## Train the Model

In [18]:
# Number of examples in the training split.
dataset["train"].num_rows

1000

In [29]:
# Training hyperparameters consumed by the TrainingArguments cell.
lr = 0.001        # learning rate
batch_size = 4    # per-device batch size, used for both training and evaluation
num_epochs = 20   # number of passes over the training split

In [30]:
# Define the training arguments.
training_args = TrainingArguments(
    # Keep a separator between the checkpoint name and the suffix; plain
    # concatenation produced "distilbert-base-uncasedlora-text-classification".
    output_dir = model_checkpoint + "-lora-text-classification",
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    weight_decay = 0.01,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical because the best checkpoint is reloaded at the end.
    # NOTE(review): newer transformers releases call the first argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)



In [31]:
# Assemble the Trainer from the LoRA-wrapped model, the tokenized splits and
# the helpers defined above, then run the fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Left as the last expression so the notebook displays the returned TrainOutput.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.436604,{'accuracy': 0.886}
2,0.336100,0.599062,{'accuracy': 0.874}
3,0.336100,0.762921,{'accuracy': 0.882}
4,0.207600,0.897397,{'accuracy': 0.874}
5,0.207600,1.078079,{'accuracy': 0.871}
6,0.124800,1.028684,{'accuracy': 0.873}
7,0.124800,1.333653,{'accuracy': 0.872}
8,0.062600,1.298608,{'accuracy': 0.878}
9,0.062600,1.095446,{'accuracy': 0.882}
10,0.062900,1.318514,{'accuracy': 0.874}




TrainOutput(global_step=5000, training_loss=0.09064136281013489, metrics={'train_runtime': 442.6766, 'train_samples_per_second': 45.18, 'train_steps_per_second': 11.295, 'total_flos': 2227175752044000.0, 'train_loss': 0.09064136281013489, 'epoch': 20.0})

## Inference with the fine-tuned model on example sentences

In [32]:
# Inference only: switch off dropout and run on whatever device the model
# ended up on, instead of hard-coding "cuda" (which fails on CPU-only or
# MPS machines).
model.eval()
device = next(model.parameters()).device

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)
    # No gradients are needed for prediction.
    with torch.no_grad():
        logits = model(inputs).logits
    # Index of the highest-scoring class for each row of the batch.
    predictions = torch.argmax(logits, dim=1)
    print(text + " - " + id2label[predictions.tolist()[0]])

It was good. - Positive
Not a fan, don't recommend. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Positive
