# Fine-tuning Sandbox

Code authored by: Shawhin Talebi <br>
Blog link: https://medium.com/towards-data-science/fine-tuning-large-language-models-llms-23473d763b91

In [32]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

### dataset

In [33]:
# # how dataset was generated

# # load imdb data
# imdb_dataset = load_dataset("imdb")

# # define subsample size
# N = 1000
# # generate indexes for random subsample
# rand_idx = np.random.randint(24999, size=N)

# # extract train and test data
# x_train = imdb_dataset['train'][rand_idx]['text']
# y_train = imdb_dataset['train'][rand_idx]['label']

# x_test = imdb_dataset['test'][rand_idx]['text']
# y_test = imdb_dataset['test'][rand_idx]['label']

# # create new dataset
# dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
#                              'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

In [34]:
# load dataset
dataset = load_dataset('shawhin/imdb-truncated')
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [35]:
# display % of training data with label=1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.5

### model

In [36]:
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}
label2id_new = {"LABEL_0":0, "LABEL_1":1}


# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# display architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### preprocess data

In [38]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [39]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [40]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [41]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### evaluation

In [42]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [43]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

### Apply untrained model to text

In [44]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


### Train model

In [45]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [46]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules=['q_lin'], lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [47]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,221,124 || all params: 67,584,004 || trainable%: 1.8068239934408148


In [48]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [49]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [50]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.33337879180908203, 'eval_accuracy': {'accuracy': 0.892}, 'eval_runtime': 60.9859, 'eval_samples_per_second': 16.397, 'eval_steps_per_second': 4.099, 'epoch': 1.0}
{'loss': 0.3999, 'learning_rate': 0.0008, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.38497304916381836, 'eval_accuracy': {'accuracy': 0.892}, 'eval_runtime': 21.9338, 'eval_samples_per_second': 45.592, 'eval_steps_per_second': 11.398, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.43824976682662964, 'eval_accuracy': {'accuracy': 0.895}, 'eval_runtime': 21.974, 'eval_samples_per_second': 45.508, 'eval_steps_per_second': 11.377, 'epoch': 3.0}
{'loss': 0.2004, 'learning_rate': 0.0006, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5517623424530029, 'eval_accuracy': {'accuracy': 0.895}, 'eval_runtime': 22.0502, 'eval_samples_per_second': 45.351, 'eval_steps_per_second': 11.338, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6260715126991272, 'eval_accuracy': {'accuracy': 0.899}, 'eval_runtime': 22.0984, 'eval_samples_per_second': 45.252, 'eval_steps_per_second': 11.313, 'epoch': 5.0}
{'loss': 0.0674, 'learning_rate': 0.0004, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.835743248462677, 'eval_accuracy': {'accuracy': 0.892}, 'eval_runtime': 21.9099, 'eval_samples_per_second': 45.641, 'eval_steps_per_second': 11.41, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.830268919467926, 'eval_accuracy': {'accuracy': 0.901}, 'eval_runtime': 22.1276, 'eval_samples_per_second': 45.193, 'eval_steps_per_second': 11.298, 'epoch': 7.0}
{'loss': 0.0301, 'learning_rate': 0.0002, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.8756471872329712, 'eval_accuracy': {'accuracy': 0.894}, 'eval_runtime': 22.0016, 'eval_samples_per_second': 45.451, 'eval_steps_per_second': 11.363, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.8779169917106628, 'eval_accuracy': {'accuracy': 0.897}, 'eval_runtime': 22.2146, 'eval_samples_per_second': 45.015, 'eval_steps_per_second': 11.254, 'epoch': 9.0}
{'loss': 0.0028, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.8839448094367981, 'eval_accuracy': {'accuracy': 0.901}, 'eval_runtime': 21.9464, 'eval_samples_per_second': 45.565, 'eval_steps_per_second': 11.391, 'epoch': 10.0}
{'train_runtime': 968.8455, 'train_samples_per_second': 10.322, 'train_steps_per_second': 2.58, 'train_loss': 0.14010267643928528, 'epoch': 10.0}


TrainOutput(global_step=2500, training_loss=0.14010267643928528, metrics={'train_runtime': 968.8455, 'train_samples_per_second': 10.322, 'train_steps_per_second': 2.58, 'train_loss': 0.14010267643928528, 'epoch': 10.0})

### Generate prediction

In [51]:
model.to('mps') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices
    print(predictions.tolist())
    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
[1]
It was good. - Positive
[0]
Not a fan, don't recommed. - Negative
[1]
Better than the first one. - Positive
[0]
This is not worth watching even once. - Negative
[0]
This one is a pass. - Negative


### Optional: push model to hub

In [52]:
# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login() # ensure token gives write access

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [53]:
hf_name = 'StringCheese' # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want

In [54]:
model.push_to_hub(model_id) # save model

adapter_model.safetensors:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/StringCheese/distilbert-base-uncased-lora-text-classification/commit/2a1df1dfc3bab75f192283bbe648f2cbf8b8dd63', commit_message='Upload model', commit_description='', oid='2a1df1dfc3bab75f192283bbe648f2cbf8b8dd63', pr_url=None, pr_revision=None, pr_num=None)

In [55]:
trainer.push_to_hub(model_id) # save trainer

training_args.bin:   0%|          | 0.00/4.16k [00:00<?, ?B/s]

'https://huggingface.co/StringCheese/distilbert-base-uncased-lora-text-classification/tree/main/'

### Optional: load peft model

In [56]:
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)er_model.safetensors:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
print(model)
# Access the model configuration
model_config = model.config
# Print the model configuration
print(model_config)

: 

In [2]:
from transformers import Conversation, pipeline
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import gradio as gr

# Initialize a chatbot pipeline
chatbot = pipeline(task="conversational", model=model, tokenizer=tokenizer, token="")
# Initialize a sentiment analysis pipeline
# base_sentiment_model = "shawhin/distilbert-base-uncased-lora-text-classification"
base_sentiment_model = "StringCheese/distilbert-base-uncased-lora-text-classification"
sentiment_analyzer = pipeline(task="sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
message_list = []
response_list = []

def vanilla_chatbot(message, history):
    conversation = Conversation(text=message, past_user_inputs=message_list, generated_responses=response_list)
    conversation = chatbot(conversation)
    reply = conversation.generated_responses[-1]
    sentiment = sentiment_analyzer(message)
    prediction = sentiment[0]['label']

    additional_text = "Btw, you are nice :)"
    if prediction in 'LABEL_0':
        additional_text = "Btw, you are rude.."
    return f"{reply} -- {additional_text}"

bot = gr.ChatInterface(vanilla_chatbot, title="Alex's Bot", description="Say something.")
bot.launch()


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [2]:
bot.close()

Closing server running on port: 7860
