# Importing Libraries and modules

In [2]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd

In [4]:
import pyarrow.dataset as ds
import pyarrow as pa

# Base Model 

Next, we load in our base model. The base model here is a relatively small one, but there are several other (larger) ones that we could have used (e.g. roberta-base, llama2, gpt2). A full list is available here.

https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForSequenceClassification

In [5]:
model_checkpoint = 'distilbert-base-uncased'

# define label maps
id2label = {0: "Zero", 1: "One" , 2: "Two"}
label2id = {"Zero":0, "One":1 , "Two": 2}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Load Data

In [7]:
# Load data set from huggingface
df = pd.read_csv('ratings.csv' ,  encoding='unicode_escape')
df.head()

Unnamed: 0,Topic,text,label,label_str
0,cup,cup (s),0,Zero
1,cup,bottle,0,Zero
2,cup,We calibrate with it,1,One
3,cup,It is placed with a cup of coffee,1,One
4,cup,We drink in it,2,Two


In [8]:
# remove columns
df=df.drop(['Topic','label_str'],axis=1)

In [9]:
df.head()

Unnamed: 0,text,label
0,cup (s),0
1,bottle,0
2,We calibrate with it,1
3,It is placed with a cup of coffee,1
4,We drink in it,2


# Convert our Dataset to Parquet

In [10]:
df.to_parquet('train.parquet', engine='fastparquet')
df.to_parquet('test.parquet', engine='fastparquet')
df.to_parquet('validation.parquet', engine='fastparquet')

In [12]:
# convert to dataset object
# dataset = ds.dataset(pa.Table.from_pandas(df).to_batches())
# dataset = Dataset(pa.Table.from_pandas(df))
# dataset

Dataset({
    features: ['Definition of the topic', 'Rating'],
    num_rows: 480
})

In [11]:
# # load dataset
# dataset = load_dataset("alaatiger989/rating_system")
dataset = load_dataset("data")
dataset

Generating train split: 480 examples [00:00, 96213.43 examples/s]
Generating validation split: 480 examples [00:00, 160113.40 examples/s]
Generating test split: 480 examples [00:00, 120108.93 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 480
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 480
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 480
    })
})

# Preprocess data

Next, we need to preprocess our data so that it can be used for training. This consists of using a tokenizer to convert the text into an integer representation understood by the base model.

In [12]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

To apply the tokenizer to the dataset, we use the .map() method. This takes in a custom function that specifies how the text should be preprocessed. In this case, that function is called tokenize_function(). In addition to translating text to integers, this function truncates integer sequences such that they are no longer than 512 numbers to conform to the base model’s max input length.

In [13]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [14]:
# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [15]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 480/480 [00:00<00:00, 22869.19 examples/s]
Map: 100%|██████████| 480/480 [00:00<00:00, 25258.33 examples/s]
Map: 100%|██████████| 480/480 [00:00<00:00, 28221.11 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 480
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 480
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 480
    })
})

At this point, we can also create a data collator, which will dynamically pad examples in each batch during training such that they all have the same length. This is computationally more efficient than padding all examples to be equal in length across the entire dataset.

In [16]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation metrics

We can define how we want to evaluate our fine-tuned model via a custom function. Here, we define the compute_metrics() function to compute the model’s accuracy.

In [17]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [18]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, 
                                          references=labels)}

# Untrained model performance

Before training our model, we can evaluate how the base model with a randomly initialized classification head performs on some example inputs.

In [19]:
# define list of examples
text_list = [" It is placed with a cup of coffee.", "Something for a girl to cover her hair with.", 
"pet that we can raise at home.", "He runs and drinks.", 
"Floating."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
 It is placed with a cup of coffee. - Zero
Something for a girl to cover her hair with. - Zero
pet that we can raise at home. - Zero
He runs and drinks. - Zero
Floating. - Zero


As expected, the model performance is equivalent to random guessing. Let’s see how we can improve this with fine-tuning.

# Fine-tuning with LoRA

To use LoRA for fine-tuning, we first need a config file. This sets all the parameters for the LoRA algorithm. See comments in the code block for more details.

In [20]:
peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
                        r=4, # intrinsic rank of trainable weight matrix
                        lora_alpha=32, # this is like a learning rate
                        lora_dropout=0.01, # probablity of dropout
                        target_modules = ['q_lin']) # we apply lora to query layer only

We can then create a new version of our model that can be trained via PEFT. Notice that the scale of trainable parameters was reduced by about 100x.

In [21]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 629,763 || all params: 67,585,542 || trainable%: 0.9318013607111414


Next, we define hyperparameters for model training.

In [22]:
# hyperparameters
lr = 1e-3 # size of optimization step 
batch_size = 4 # number of examples processed per optimziation step
num_epochs = 10 # number of times model runs through training data

# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

Finally, we create a trainer() object and fine-tune the model!

In [23]:
# creater trainer object
trainer = Trainer(
    model=model, # our peft model
    args=training_args, # hyperparameters
    train_dataset=tokenized_dataset["train"], # training data
    eval_dataset=tokenized_dataset["validation"], # validation data
    tokenizer=tokenizer, # define tokenizer
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics, # evaluates model using compute_metrics() function from before
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [24]:
# train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.907722,{'accuracy': 0.5666666666666667}
2,No log,0.746477,{'accuracy': 0.65}
3,No log,0.483096,{'accuracy': 0.8333333333333334}
4,No log,0.286496,{'accuracy': 0.8958333333333334}
5,0.839900,0.221522,{'accuracy': 0.9125}
6,0.839900,0.170207,{'accuracy': 0.94375}
7,0.839900,0.092014,{'accuracy': 0.9625}
8,0.839900,0.095509,{'accuracy': 0.9604166666666667}
9,0.307500,0.067267,{'accuracy': 0.975}
10,0.307500,0.065783,{'accuracy': 0.9770833333333333}


TrainOutput(global_step=1200, training_loss=0.5085150941212973, metrics={'train_runtime': 808.148, 'train_samples_per_second': 5.94, 'train_steps_per_second': 1.485, 'total_flos': 13236732637920.0, 'train_loss': 0.5085150941212973, 'epoch': 10.0})

# Trained model performance

To see how the model performance has improved, let’s apply it to the same 5 examples from before.

In [25]:
# If you have an NVIDIA GPU attached, use 'cuda'
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # If Apple Silicon, set to 'mps' - otherwise 'cpu' (not advised)
    try:
        device = torch.device('mps')
    except Exception:
        device = torch.device('cpu')

In [27]:
model.to(device) # moving to mps for Mac (can alternatively do 'cpu')

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [28]:
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(device) # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
 It is placed with a cup of coffee. - One
Something for a girl to cover her hair with. - Two
pet that we can raise at home. - Two
He runs and drinks. - Zero
Floating. - Zero


The fine-tuned model improved significantly from its prior random guessing, correctly classifying all but one of the examples in the above code. This aligns with the ~90% accuracy metric we saw during training.