# Text Classification on Twitter Complaints Dataset using Prompt Tuning on Mistral-7B
* Notebook by Adam Lang
* Date: 12/16/2024

# Overview
* In this notebook we will perform text classification on a twitter complaints dataset using an LLM by leveraging the power of PEFT methods specifically "Prompt Tuning" which is a sub category of Prompt Learning PEFT techniques.

# LLM we are prompt tuning
* We will perform prompt tuning on the Mistral-7B model.
* Here is the model card: https://huggingface.co/mistralai/Mistral-7B-v0.1

# Imports

In [1]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
import wandb

## setttings
wandb.init(project="prompt_learning_methods", name="prompt_tuning")
seed = 42
device = "cuda"
## load model and tokenizer from HF
model_name_or_path = "mistralai/Mistral-7B-v0.1"
tokenizer_name_or_path = "mistralai/Mistral-7B-v0.1"
## datset inormation
dataset_name = "twitter_complaints"
text_column = "Tweet text"
label_column = "text_label"

## other settings
max_length = 64 ## max length of a tweet we will classify
lr = 1e-4
num_epochs = 10
batch_size = 8
set_seed(seed)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33madam-m-lang[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Dataset Preparation
* We first need to prepare the dataset.

## Load the dataset from hugging face
* https://huggingface.co/davidschulte/ESM_ought__raft_twitter_complaints

In [3]:
from datasets import load_dataset

## load dataset
dataset = load_dataset("ought/raft", dataset_name)

## setup train split
classes = [k.replace("_", " ") for k in dataset['train'].features['Label'].names]
print(classes)

## map to dataset
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},
    batched=True,
    num_proc=1,
)
## print dataset and first index
print(dataset)
dataset["train"][0]

['Unlabeled', 'complaint', 'no complaint']


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3399 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Tweet text', 'ID', 'Label', 'text_label'],
        num_rows: 50
    })
    test: Dataset({
        features: ['Tweet text', 'ID', 'Label', 'text_label'],
        num_rows: 3399
    })
})


{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

In [4]:
# lets count the train labels
from collections import Counter 

Counter(dataset["train"]["Label"])

Counter({2: 33, 1: 17})

Summary
* As with most classification tasks we can see the target class is skewed.
* "1" is a complaint with 17 total
* "2" is non-complaint with about twice as many labels.


# Data Preprocessing

In [6]:
# load tokenizer for Mistral-7B model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) # add hf_token if not using cli
## if text doesn't have pad token --> add eos token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# get max length of tokens
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])
print(f"{target_max_length=}")


## data preprocessing function
def preprocess_function(examples):
    #1. get batch_size
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x}\nLabel : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets, add_special_tokens=False)  # don't add bos token because we concatenate with inputs
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
        # print(i, sample_input_ids, label_input_ids)
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
    # print(model_inputs)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

target_max_length=4


Apply preprocessing function to dataset

In [7]:
## map function to dataset
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

## set train and validation data -- same for simplicity 
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["train"]



## setup dataloader
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
next(iter(train_dataloader))

Running tokenizer on dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

{'input_ids': tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     1,   320, 12523,  2245,   714,   802,   642, 28711,   311,
           8503,  1080,  6304,   272,  9827,   354,   528,    13,  4565,   714,
          28705,   708, 22105,     2],
         [    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     1,   320, 12523,  2245,
            714,   802,  2707,  6024, 28754,   525, 28709, 28743,  4585, 15359,
           8196,   354,  1558,  4089, 28725,   829,   347,   586,  7865,   562,
           3062,  2368, 28742, 28707,  1709,   298,   347,  2739,   

Summary
* If you see -100 it means the token is ignored. 

## Preprocess Test Dataset

In [8]:
## function to preprocess test dataset
def test_preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x}\nLabel : " for x in examples[text_column]]
    model_inputs = tokenizer(inputs)
    # print(model_inputs)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
    return model_inputs


## map function to test data
test_dataset = dataset["test"].map(
    test_preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

## setup test data loader
test_dataloader = DataLoader(test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
next(iter(test_dataloader))

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

{'input_ids': tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     1,   320, 12523,  2245,   714,   802, 28760,   324, 14233,
           2328,  8868,   354, 10313,   586,  7416,  2169,   395,   272,  4908,
           8147,  1309,   356,   378, 28808, 11936, 28723, 22747, 28723,   675,
          28748, 28710, 28737, 28734, 11788, 28744, 28762, 28779, 28779, 28750,
             13,  4565,   714, 28705],
         [    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
              2,     2,     2,     2,     2,     2,     1,   320, 12523,  2245,
            714,   334,   855,   288,   582,   356,   422,  3836,  3957,  1829,
          19653, 28808,   415,   905,   590,  3088,   354,  1167,  4370,   568,
           1371,   272,  8710,   472, 28705, 29137, 29137, 29274, 30

# Create PEFT Model, Optimizer and Learning Rate Scheduler

## Prompt Tuning Config
* We need to setup the prompt tuning config first. 

In [9]:
## first you need the prompt instructing the model to classify tweets
prompt_tuning_init_text="Classify if the tweet is a complaint or no complaint.\n"

## config -- init soft prompt token with embeddings for the prompt above
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM, #predicting next word as complaint vs. no complaint
    prompt_tuning_init=PromptTuningInit.TEXT, ## instruction prompt to classify text
    num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]),
    prompt_tuning_init_text=prompt_tuning_init_text,
    tokenizer_name_or_path=model_name_or_path,
)

## Create Model Checkpoints

In [10]:
# creating model
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16) #load in half-precision
model = get_peft_model(model, peft_config) ##base model and peft_config
model.print_trainable_parameters()
## enable gradient checkpointing
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant":False})
model = model.to(device)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

trainable params: 61,440 || all params: 7,241,793,536 || trainable%: 0.0008


### Summary
* We can see the trainable params are only 61,440 compared to the full 7.2B params in the FULL Mistral model.
* And that works out to be 0.008%

In [11]:
## lets see what the model parameters are
model

PeftModelForCausalLM(
  (base_model): MistralForCausalLM(
    (model): MistralModel(
      (embed_tokens): Embedding(32000, 4096)
      (layers): ModuleList(
        (0-31): 32 x MistralDecoderLayer(
          (self_attn): MistralSdpaAttention(
            (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
            (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
            (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
            (rotary_emb): MistralRotaryEmbedding()
          )
          (mlp): MistralMLP(
            (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
            (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
            (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
  

Summary
* We can see the `prompt_encoder` above has 15 embeddings which are for the input prompt which will be part of the soft prompt finetuning.

## Optimizer and Learning Rate Scheduler
* Here just as with any deep learning model, we setup the optimizer and learning rate scheduler.

In [12]:
# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.1)

## learning_rate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0, #no warm up due to only updating subset of weights
    num_training_steps=(len(train_dataloader) * num_epochs),
)

# Qualitative Evaluation on Test Samples BEFORE finetuning

In [13]:
model.eval()
i = 33 ## random index 33
## tokenize input
inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]}\nLabel : ', return_tensors="pt")

## get prediction without gradients 
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=20, eos_token_id=tokenizer.eos_token_id
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Tweet text : @TommyHilfiger Dramatic shopping exp. ordered 6 jeans same size (30/32) 2 fits / 2 too large / 2 too slim : same brand &gt; different sizing
Label : 1





















### Summary
* What we see above is 2 things:
1. The label assigned to this tweet we are not even sure if that is correct or not.
2. The model is continuing causal generation rather than classification. This should happen because the base model Mistral is not pretrained for classifying tweets it is a generative AI decoder model, hence why we are going to fine tune it. 

# Training and Evaluation Loop
* Here I am using a custom written PyTorch training and test loops rather than the Trainer API templates from hugging face.
* This is the "boiler plate code" approach.

In [15]:
# training and evaluation
for epoch in range(num_epochs):
    ## 1. train
    model.train()
    total_loss = 0
    ## 2. Forward Pass
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.autocast(dtype=torch.float16, device_type="cuda"):
            outputs = model(**batch)
    ## 3. Calculate loss
        loss = outputs.loss
        total_loss += loss.detach().float()
    ## 4. Backpropagation
        loss.backward()
    ## 5. Optimizer step -- update model params after gradients computed
        optimizer.step()
        lr_scheduler.step()
    ## 5. Zero out gradients -- avoid gradient accumulation
        optimizer.zero_grad()

    # Testing loop
    # 1. eval
    model.eval()
    eval_loss = 0
    eval_preds = []
    ## 2. forward pass 
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
    ## 3. Calculate loss
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    ## print out whats happening!!
    print(f"{epoch=}: {train_ppl=} | {train_epoch_loss=} {eval_ppl=} | {eval_epoch_loss=}")
    wandb.log({"train": {"perplexity": train_ppl, "loss": train_epoch_loss, "epoch": epoch}, 
               "val": {"perplexity": eval_ppl, "loss": eval_epoch_loss, "epoch": epoch}})
    

100%|██████████| 7/7 [00:02<00:00,  3.32it/s]
100%|██████████| 7/7 [00:00<00:00, 10.14it/s]


epoch=0: train_ppl=tensor(1.0870, device='cuda:0') | train_epoch_loss=tensor(0.0834, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  3.35it/s]
100%|██████████| 7/7 [00:00<00:00, 10.07it/s]


epoch=1: train_ppl=tensor(1.0717, device='cuda:0') | train_epoch_loss=tensor(0.0692, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  3.38it/s]
100%|██████████| 7/7 [00:00<00:00,  9.96it/s]


epoch=2: train_ppl=tensor(1.0751, device='cuda:0') | train_epoch_loss=tensor(0.0724, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  3.36it/s]
100%|██████████| 7/7 [00:00<00:00, 10.03it/s]


epoch=3: train_ppl=tensor(1.0929, device='cuda:0') | train_epoch_loss=tensor(0.0889, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  3.20it/s]
100%|██████████| 7/7 [00:00<00:00,  9.97it/s]


epoch=4: train_ppl=tensor(1.0831, device='cuda:0') | train_epoch_loss=tensor(0.0798, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  3.29it/s]
100%|██████████| 7/7 [00:00<00:00,  9.96it/s]


epoch=5: train_ppl=tensor(1.0709, device='cuda:0') | train_epoch_loss=tensor(0.0685, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  3.32it/s]
100%|██████████| 7/7 [00:00<00:00,  9.95it/s]


epoch=6: train_ppl=tensor(1.0696, device='cuda:0') | train_epoch_loss=tensor(0.0673, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  3.32it/s]
100%|██████████| 7/7 [00:00<00:00,  9.91it/s]


epoch=7: train_ppl=tensor(1.0713, device='cuda:0') | train_epoch_loss=tensor(0.0689, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  3.32it/s]
100%|██████████| 7/7 [00:00<00:00,  9.84it/s]


epoch=8: train_ppl=tensor(1.0763, device='cuda:0') | train_epoch_loss=tensor(0.0735, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  3.09it/s]
100%|██████████| 7/7 [00:00<00:00,  9.92it/s]

epoch=9: train_ppl=tensor(1.0698, device='cuda:0') | train_epoch_loss=tensor(0.0675, device='cuda:0') eval_ppl=tensor(1.0709, device='cuda:0') | eval_epoch_loss=tensor(0.0685, device='cuda:0')





# Qualitative Evaluation on Test Samples AFTER Fine Tuning

In [16]:
model.eval()
i = 33
inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]}\nLabel : ', return_tensors="pt")
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=5, eos_token_id=tokenizer.eos_token_id
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=False)[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> Tweet text : @TommyHilfiger Dramatic shopping exp. ordered 6 jeans same size (30/32) 2 fits / 2 too large / 2 too slim : same brand &gt; different sizing
Label :  complaint</s>


### Summary
* Now let's compare the fine tuned model to the original tweet #33.
* We can see that it has labeled the same tweet as a "complaint" which previously the model was not able to do. 

# Option 1 - Save Model and Push to hugging face hub

In [18]:
from huggingface_hub import notebook_login, logout

logout() # Log out first
notebook_login() # Then log in again

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
# model.push_to_hub(
#     f"mistral_prompt_tuning",
#     token = "hf_..."
# )


# # save our model to the Hugging Face hub
# model_upload_url = model.push_to_hub(
#     commit_message="Uploading PEFT Prompt Tuned Mistral-7B for Text Classification",
#     token = "",
# )
# # print if successful upload
# print(f"[INFO] Model successfully uploaded to the Hugging Face Hub with URL: {model_upload_url}")
     


# Option 2 - Save model local

In [28]:
# saving model
peft_model_id = "mistral_prompt_tuning"
model.push_to_hub(peft_model_id, private=True)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/adamNLP/mistral_prompt_tuning/commit/b6330b2d3089ac752a7190126716592d8355248e', commit_message='Upload model', commit_description='', oid='b6330b2d3089ac752a7190126716592d8355248e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/adamNLP/mistral_prompt_tuning', endpoint='https://huggingface.co', repo_type='model', repo_id='adamNLP/mistral_prompt_tuning'), pr_revision=None, pr_num=None)

In [29]:
!nvidia-smi

Mon Dec 16 19:21:02 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


|   0  NVIDIA RTX A6000               On  |   00000000:29:00.0 Off |                  Off |
| 30%   43C    P2             76W /  300W |   15042MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
+-----------------------------------------------------------------------------------------+


# Load the model Checkpoint and Perform Qualitative Analysis on Test Samples
* We can load it from our own huggingface space. 

In [34]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch

## load same dataset we fine tuned on 
dataset = load_dataset("ought/raft", "twitter_complaints")

## load custom model that I finetuned from HF
peft_model_id = "adamNLP/mistral_prompt_tuning"
device = "cuda" ## put on GPU
text_column = "Tweet text"
label_column = "text_label"

## load config
config = PeftConfig.from_pretrained(peft_model_id)

## load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Run Test Inference

Test Example 1

In [35]:
model.to(device)
model.eval()
i = 36
inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]}\nLabel : ', return_tensors="pt")
# print(dataset["test"][i]["Tweet text"])

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=tokenizer.eos_token_id
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Tweet text : @virginmedia Instead of spending money on advertising, why not fix the slow speeds in the RG2 area. CLOWNS
Label :  complaint


Summary:
* Clearly the model was able to classify this tweet as a complaint.

Test Example 2

In [37]:
model.to(device)
model.eval()
i = 22
inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]}\nLabel : ', return_tensors="pt")
# print(dataset["test"][i]["Tweet text"])

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=tokenizer.eos_token_id
    )
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Tweet text : @NFLUK @Patriots That's right, #OnlyInTheNFL will the refs call an obvious TD incomplete and ruin a great game.
Label :  no complaint


Summary:
* Clearly this tweet is not a complaint.

In [36]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Mon Dec 16 19:26:53 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               On  |   00000000:29:00.0 Off |                  Off |
| 30%   41C    P2             86W /  300W |   42210MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                