In [1]:
pip install peft

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.14.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
import torch
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model
import pandas as pd
import nltk

In [3]:
model_name = "potsawee/t5-large-generation-squad-QuestionAnswer"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank of low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["q", "v"],  # Fine-tune attention layers
    lora_dropout=0.1,
    bias="none"
)

tokenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [4]:
# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Check trainable parameters
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 742,386,688 || trainable%: 0.6356


In [5]:
model.config.ignore_pad_token_for_loss = True

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

In [6]:
# Load dataset using pandas
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet'
}
train_df = pd.read_parquet("hf://datasets/allenai/sciq/" + splits["train"])
validation_df = pd.read_parquet(
    "hf://datasets/allenai/sciq/" + splits["validation"])

In [7]:
# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [8]:
def preprocess_function_with_nltk(df):
    inputs = []
    count = 1
    for text in df["support"].tolist():
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
        pos_str = " ".join([f"{word}/{pos}" for word, pos in pos_tags])
        ner_tree = nltk.ne_chunk(pos_tags)
        ner_tags = " ".join([f"{' '.join(c[0] for c in subtree)}({subtree.label()})" if isinstance(subtree, nltk.Tree) else f"{subtree[0]}" for subtree in ner_tree])
        enriched_input = f"{text}\nNER: {ner_tags}\nPOS: {pos_str}"
        if count<=5:
            print(enriched_input)
            print('_'*40)
            count+=1
        inputs.append(enriched_input)
    
    targets = [q + " <sep> " + a for q, a in zip(df["question"], df["correct_answer"])]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [9]:
train_data = preprocess_function_with_nltk(train_df)
validation_data = preprocess_function_with_nltk(validation_df)

Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.
NER: Mesophiles grow best in moderate temperature , typically between 25°C and 40°C ( 77°F and 104°F ) . Mesophiles are often found living in or on the bodies of humans or other animals . The optimal growth temperature of many pathogenic mesophiles is 37°C ( 98°F ) , the normal human body temperature . Mesophilic(ORGANIZATION) organisms have important uses in food preparation , including cheese , yogurt , beer and wine .
POS: Mesophiles/NNS grow/VBP best/JJS in/IN moderate/JJ temperature/NN ,/, typically/RB between/IN 25°C/CD and/CC 40°C/CD (/( 77°F/CD and/CC 104°F/CD )/) ./. Mesophiles/NNS are/VBP o

In [10]:
class SciQDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx]),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx]),
            "labels": torch.tensor(self.encodings["labels"][idx]),
        }

train_dataset = SciQDataset(train_data)
validation_dataset = SciQDataset(validation_data)

In [11]:
# Print a sample from the dataset
print("Sample from train_dataset:", train_dataset[0])

Sample from train_dataset: {'input_ids': tensor([10162, 21144,    15,     7,  1604,   200,    16,  8107,  2912,     6,
         3115,   344,   944,  1956,   254,    11,  1283,  1956,   254,    41,
         4013,  1956,   371,    11,     3, 15442,  1956,   371,   137, 10162,
        21144,    15,     7,    33,   557,   435,   840,    16,    42,    30,
            8,  5678,    13,  6917,    42,   119,  3127,     5,    37,  6624,
         1170,  2912,    13,   186,  2071, 20853,   140,     7, 21144,    15,
            7,    19,  6862,  1956,   254,    41,  3916,  1956,   371,   201,
            8,  1389,   936,   643,  2912,     5, 10162, 21144,   447,  9329,
            7,    43,   359,  2284,    16,   542,  4537,     6,   379,  3285,
            6, 19168,     6,  6061,    11,  2013,     5,     3, 18206,    10,
        10162, 21144,    15,     7,  1604,   200,    16,  8107,  2912,     3,
            6,  3115,   344,   944,  1956,   254,    11,  1283,  1956,   254,
           41,     3,  

In [12]:
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_lora_sciq",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=6,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    lr_scheduler_type="linear",
    warmup_steps=500,
    label_names=["labels"],
)



In [13]:
# Custom data collator to handle labels
class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    def __call__(self, features):
        batch = super().__call__(features)
        if "labels" in batch:
            batch["labels"] = torch.tensor(batch["labels"])
        return batch

data_collator = CustomDataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
)

  trainer = Seq2SeqTrainer(


In [15]:
# Debugging Data Collator
sample_batch = [train_dataset[i] for i in range(1)]
print("Sample batch:", sample_batch)
collated_batch = data_collator(sample_batch)
print("Collated batch:", collated_batch)

Sample batch: [{'input_ids': tensor([10162, 21144,    15,     7,  1604,   200,    16,  8107,  2912,     6,
         3115,   344,   944,  1956,   254,    11,  1283,  1956,   254,    41,
         4013,  1956,   371,    11,     3, 15442,  1956,   371,   137, 10162,
        21144,    15,     7,    33,   557,   435,   840,    16,    42,    30,
            8,  5678,    13,  6917,    42,   119,  3127,     5,    37,  6624,
         1170,  2912,    13,   186,  2071, 20853,   140,     7, 21144,    15,
            7,    19,  6862,  1956,   254,    41,  3916,  1956,   371,   201,
            8,  1389,   936,   643,  2912,     5, 10162, 21144,   447,  9329,
            7,    43,   359,  2284,    16,   542,  4537,     6,   379,  3285,
            6, 19168,     6,  6061,    11,  2013,     5,     3, 18206,    10,
        10162, 21144,    15,     7,  1604,   200,    16,  8107,  2912,     3,
            6,  3115,   344,   944,  1956,   254,    11,  1283,  1956,   254,
           41,     3,  4013,  1956,

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"])


In [16]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  batch["labels"] = torch.tensor(batch["labels"])
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1797,0.174001
2,0.1681,0.16995
3,0.1511,0.168197
4,0.1452,0.167968
5,0.1467,0.167461
6,0.1297,0.168105


  batch["labels"] = torch.tensor(batch["labels"])
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  batch["labels"] = torch.tensor(batch["labels"])
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  batch["labels"] = torch.tensor(batch["labels"])
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  batch["labels"] = torch.tensor(batch["labels"])
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  batch["labels"] = torch.tensor(batch["labels"])
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  batch["labels"] = torch.tensor(batch["labels"])
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=8760, training_loss=0.3485445708444674, metrics={'train_runtime': 21211.9745, 'train_samples_per_second': 3.304, 'train_steps_per_second': 0.413, 'total_flos': 1.5272928291166618e+17, 'train_loss': 0.3485445708444674, 'epoch': 6.0})

In [17]:
# Save the model and tokenizer
model.save_pretrained("./t5squad_ner-pos_finetuned_sciq")
tokenizer.save_pretrained("./t5squad_ner-pos_finetuned_sciq")

('./t5squad_ner-pos_finetuned_sciq/tokenizer_config.json',
 './t5squad_ner-pos_finetuned_sciq/special_tokens_map.json',
 './t5squad_ner-pos_finetuned_sciq/spiece.model',
 './t5squad_ner-pos_finetuned_sciq/added_tokens.json',
 './t5squad_ner-pos_finetuned_sciq/tokenizer.json')

In [18]:
# Zip the saved model
!zip -r t5squad_ner-pos_finetuned_sciq.zip ./t5squad_ner-pos_finetuned_sciq
from IPython.display import FileLink
FileLink(r't5squad_ner-pos_finetuned_sciq.zip')

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: t5squad_ner-pos_finetuned_sciq/ (stored 0%)
  adding: t5squad_ner-pos_finetuned_sciq/README.md (deflated 66%)
  adding: t5squad_ner-pos_finetuned_sciq/adapter_model.safetensors (deflated 7%)
  adding: t5squad_ner-pos_finetuned_sciq/tokenizer_config.json (deflated 95%)
  adding: t5squad_ner-pos_finetuned_sciq/adapter_config.json (deflated 53%)
  adding: t5squad_ner-pos_finetuned_sciq/special_tokens_map.json (deflated 86%)
  adding: t5squad_ner-pos_finetuned_sciq/added_tokens.json (stored 0%)
  adding: t5squad_ner-pos_finetuned_sciq/tokenizer.json (deflated 74%)
  adding: t5squad_ner-pos_finetuned_sciq/spiece.model (deflated 48%)
