In [1]:

from datasets import load_dataset

In [2]:
# !pip install --upgrade transformers
# !pip install -U datasets fsspec

In [3]:
dataset = load_dataset("rotten_tomatoes")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [5]:
from transformers import(
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

In [6]:
model_name = 'distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels =  2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': ['pad']})
  model.resize_token_embeddings(len(tokenizer))
else:
  print('pad')

pad


In [8]:
def fun(example):
  tokenizer.truncattion_side = 'left'

  text = example['text']
  return tokenizer(text, return_tensors='np', truncation=True, max_length=512)


In [9]:
tokenized_data = dataset.map(fun, batched=True)

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [10]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [11]:
from transformers import DataCollatorWithPadding

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
!pip install evaluate



In [14]:

import evaluate
accuracy = evaluate.load('accuracy')

In [15]:
from peft import LoraConfig, get_peft_model
peft_config =  LoraConfig(
    task_type='SEQ_CLS',
    r =4,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=['q_lin']  #query for bert model and q_lin for distilbert
)
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=4, target_modules={'q_lin'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [16]:
# model.print_trainable_parameters()
model = get_peft_model(model,peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [17]:
def compute_matrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis = 1)
  return {'accuracy': accuracy.compute(predictions=predictions, references=labels)}

In [18]:

training_arguments = TrainingArguments(
    output_dir=model_name + '-lora',
    learning_rate= 1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # evaluation_strategy="epoch",
    # save_strategy='epoch',
    # load_best_model=True
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_matrics
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33masif-cs-ai[0m ([33masif-cs-ai-north-south-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.4253
1000,0.3619
1500,0.3136
2000,0.2692
2500,0.2475
3000,0.2147
3500,0.1959
4000,0.1667
4500,0.1442
5000,0.1275


TrainOutput(global_step=5340, training_loss=0.2393012214689219, metrics={'train_runtime': 279.1545, 'train_samples_per_second': 305.566, 'train_steps_per_second': 19.129, 'total_flos': 1092417303283200.0, 'train_loss': 0.2393012214689219, 'epoch': 10.0})

In [21]:
import torch
import numpy as np # Import numpy for argmax

# Define id2label based on the dataset labels
# Assuming label 0 is negative and label 1 is positive based on common sentiment datasets
id2label = {0: "NEGATIVE", 1: "POSITIVE"}


# Get the device of the model
device = model.device
print(f"Model is on device: {device}")

# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("trained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text and move to the model's device
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)
    # compute logits
    with torch.no_grad(): # Use no_grad for inference to save memory and speed up
        logits = model(inputs).logits
    # convert logits to label on the model's device, then move to cpu for printing
    predictions = torch.argmax(logits, dim=-1).cpu().tolist()


    print(text + " - " + id2label[predictions[0]])

Model is on device: cuda:0
trained model predictions:
----------------------------
It was good. - POSITIVE
Not a fan, don't recommed. - NEGATIVE
Better than the first one. - NEGATIVE
This is not worth watching even once. - NEGATIVE
This one is a pass. - POSITIVE


In [22]:
for name, param in model.named_parameters():
    print(name)

base_model.model.distilbert.embeddings.word_embeddings.weight
base_model.model.distilbert.embeddings.position_embeddings.weight
base_model.model.distilbert.embeddings.LayerNorm.weight
base_model.model.distilbert.embeddings.LayerNorm.bias
base_model.model.distilbert.transformer.layer.0.attention.q_lin.base_layer.weight
base_model.model.distilbert.transformer.layer.0.attention.q_lin.base_layer.bias
base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_A.default.weight
base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_B.default.weight
base_model.model.distilbert.transformer.layer.0.attention.k_lin.weight
base_model.model.distilbert.transformer.layer.0.attention.k_lin.bias
base_model.model.distilbert.transformer.layer.0.attention.v_lin.weight
base_model.model.distilbert.transformer.layer.0.attention.v_lin.bias
base_model.model.distilbert.transformer.layer.0.attention.out_lin.weight
base_model.model.distilbert.transformer.layer.0.attention.out_lin.bias
base