In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, set_seed


# Speciy model alias for HF
alias ="google/flan-t5-base"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(alias, trust_remote_code=True)

#Quantization Config
quant_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=False
)

# Load Model
model = AutoModelForSeq2SeqLM.from_pretrained(
    alias,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype="auto",
    quantization_config=quant_config
)

In [20]:
#load data
import pandas as pd
from datasets import Dataset
from string import Template


data = pd.read_csv("ST1_data_processed_train.csv")

starting_text = "Please Identify different types of claims(per_exp, claim_per_exp, question, claim, none) in the text:"

instruction_template = Template(
"""
# Instructions
Given a text that may contain descriptions of personal experiences, questions, and claims, classify each segment into appropriate categories: 'per_exp' for personal experiences, 'claim_per_exp' for claims based on personal experiences, 'question' for any questions posed by the author, 'claim' for factual or general claims, and 'none' for segments that do not fit into the other categories. Analyze and categorize the text based on these labels to better understand the different elements of media communication."

Example: 
Burning in chest with deep breath?\nHey everyone. Diagnosed with gerd a few years back. \nLatest issue i am having is it feels like someone is sitting on my chest at the top and the really annoying thing is burning in my chest with deep breaths. The burning is kinds constant in my throat but the deep breaths burning has me spooked.\nHad an ekg a few months back, in 30 and relatively good health but sometimes these symptoms are so alarming especially to someone who has health anxiety. \nI take rabeprazole when needed and gaviscon but this burning with deep breaths is what's scaring me. Anyone have that before and what did you find worked best?\nThanks for any and all help!

Answer:
{\'per_exp\': [\' Diagnosed with gerd a few years back. \\nLatest issue i am having is it feels like someone is sitting on my chest at the top and the really annoying thing is burning in my chest with deep breaths. The burning is kinds constant in my throat but the deep breaths burning has me spooked.\\n\', "\\nI take rabeprazole when needed and gaviscon but this burning with deep breaths is what\'s scaring me"], \'claim_per_exp\': [], \'question\': [\' Anyone have that before and what did you find worked best?\\n\'], \'claim\': [], \'none\': [\'Burning in chest with deep breath?\\nHey everyone\', \'ad an ekg a few months back, in 30 and relatively good health but sometimes these symptoms are so alarming especially to someone who has health anxiety.\', \'hanks for any and all help\']}


Input
$question
Answer:
"""
)

data["text"] = data["text"].apply(
    lambda x: instruction_template.substitute({"question" : x})
)


data = Dataset.from_pandas(data)
xx =data[0]
# Define a size for your train set 
train_dataset = data.shuffle(seed=42).select(range(4556))
test_dataset = data.shuffle(seed=42).select(range(4546, 4556))
val_dataset = data.shuffle(seed=42).select(range(5126, 5695))




In [3]:
from dataclasses import dataclass
from transformers import AutoTokenizer, BatchEncoding
from ast import literal_eval


# The collator is responsible for ensuring the generated batches have a fixed dimension as the 
#input will be tensor. 

@dataclass
class SimpleCollator:
    tokenizer: AutoTokenizer
    config: dict 
    
    def __call__(self, examples: list) -> dict:
        batch = BatchEncoding(
            {
                k: [examples[i][k] for i in range(len(examples))]
                for k, v in examples[0].items()
            }
        )

        encoded_inputs = self.tokenizer(
            batch[self.config["input_column"]], 
            max_length = 128, 
            padding=True, 
            truncation=True,
            return_tensors="pt"
        )

        encoded_targets = self.tokenizer(
            batch[self.config["output_column"]], max_length = 128, padding=True, truncation=True,
            return_tensors="pt"
        )
        encoded_inputs["labels"] = encoded_targets["input_ids"]

        return encoded_inputs

collator = SimpleCollator(tokenizer, {"input_column": "text", "output_column": "output_with_sentence"})

In [4]:
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, 
    inference_mode=False, 
    target_modules=["q", "k", "v"],
    r=4, 
    lora_alpha=32, 
    lora_dropout=0.5
)

model = get_peft_model(model, peft_config)

In [5]:
# Note that the trainable parameters are signifacntly smaller. We only training 24% of the model!
model.print_trainable_parameters()

trainable params: 663,552 || all params: 248,241,408 || trainable%: 0.2673


In [6]:
from torch.utils.data import DataLoader

# Prepare Dataloaders
train_dl = DataLoader(
    train_dataset, 
    batch_size=4,
    pin_memory=True,
    shuffle=False,
    collate_fn=collator
)

val_dl = DataLoader(
    val_dataset,
    batch_size=16,
    pin_memory=True,
    shuffle=True,
    collate_fn=collator
)

test_dl = DataLoader(
    test_dataset, 
    batch_size=16,
    pin_memory=True,
    shuffle=False,
    collate_fn=collator
)

In [7]:
import tqdm.notebook as tqdm



all_preds = []
for batch in tqdm.tqdm(test_dl, total = len(test_dl)):
    
    preds = model.generate(**batch, max_new_tokens=128)
    outputs = tokenizer.batch_decode(preds, skip_special_tokens=True)
    all_preds.extend(outputs)

# Note the FlAN T5 model ignores our instruction format and procuces the letters for prediction
all_preds

  0%|          | 0/1 [00:00<?, ?it/s]



['one', 'one']

In [8]:
import lightning as pl
from torch.optim import AdamW
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

class PeftCALMT5(pl.LightningModule):

    def __init__(self, model_alias: str, tokenizer_alias: str):

        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_alias)

        self.peft_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM, 
            inference_mode=False, 
            target_modules=["q", "k", "v"],
            r=4, 
            lora_alpha=32, 
            lora_dropout=0.5
        )

        model = AutoModelForSeq2SeqLM.from_pretrained(model_alias)
        self.model = get_peft_model(model, self.peft_config)
        

    def training_step(self, batch, batch_idx): 
        outputs = self.model.forward(**batch, return_dict=True)
        loss = outputs["loss"]  
        
        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=True)     
        return loss
    
    def validation_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        loss = outputs["loss"]  
        
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True) 
        
    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=5e-4)
        return optimizer


model = PeftCALMT5(model_alias=alias, tokenizer_alias=alias)

In [9]:
trainer = pl.Trainer(
  max_epochs=1,
  devices=1, 
  accelerator="gpu",
  accumulate_grad_batches=3#Note we accumlate batches to effective form larger training batches 
)

trainer.fit(model, train_dl, val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/zixian/.conda/envs/ctr-ai_training/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 4060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/gener

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/zixian/.conda/envs/ctr-ai_training/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/home/zixian/.conda/envs/ctr-ai_training/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/zixian/.conda/envs/ctr-ai_training/lib/python3.8/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [21]:
ft_preds = []
test_dl = DataLoader(
    test_dataset, 
    batch_size=16,
    pin_memory=True,
    shuffle=False,
    collate_fn=collator
)
source = []
for batch in tqdm.tqdm(test_dl, total = len(test_dl)):
    preds = model.model.generate(**batch, max_new_tokens=256)
    outputs = tokenizer.batch_decode(preds, skip_special_tokens=True)
    outputs_source = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

    ft_preds.extend(outputs)
    source.extend(outputs_source)
# ft_preds[:]
print(source[:])
print("*"*20)
print(ft_preds[:])
# source['labels']

  0%|          | 0/1 [00:00<?, ?it/s]

["'per_exp': [' recently been diagnosed and just finishing my 3rd flare (2nd in 5 weeks) nI have been prescribed 100mg of Allo', 'nI am currently at the stage where my big toe just feels bruised which lasted around another 2 weeks after i considered my last flare finished last time.'], 'claim_per_exp': [], 'question': [' when is best to start it', 'nDo I need to wait until its completely gone?'], ", '\'per_exp\': [\'Cough induced Lower Oblique Hematoma - who knew!!!!!\', "nI had a pretty decent exacerbation the first of the month. During a coughing fit and felt a hot sharp pain in my lower right side. It gradually went away, then during airway clearance this last weekend it happened again only this time with a vengeance. a couple days later I noticed a HUGE deep black bruise on my side. Google Cough Induced Lower Oblique Hematoma', '\'per_exp\': [" my doctor mentioned recently that compression shorts also could help, and I\'m a little bit skeptical because I always thought it needed to