In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, PeftModel
import pandas as pd
import torch

# Load the model and tokenizer
model_name = "potsawee/t5-large-generation-squad-QuestionAnswer"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank of low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["q", "v"],  # Fine-tune attention layers
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Check trainable parameters
model.print_trainable_parameters()

# Load dataset using pandas
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet'
}
train_df = pd.read_parquet("hf://datasets/allenai/sciq/" + splits["train"])
validation_df = pd.read_parquet("hf://datasets/allenai/sciq/" + splits["validation"])

# Tokenize dataset
def preprocess_function(df):
    inputs = df["support"].tolist()
    targets = [q + " <sep> " + a for q, a in zip(df["question"], df["correct_answer"])]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_data = preprocess_function(train_df)
validation_data = preprocess_function(validation_df)

# Convert to PyTorch dataset
from torch.utils.data import Dataset

class SciQDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

train_dataset = SciQDataset(train_data)
validation_dataset = SciQDataset(validation_data)

# Define training arguments
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_lora_sciq",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    fp16=False,
    report_to="none"
)

# Define the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer
)


  from .autonotebook import tqdm as notebook_tqdm


trainable params: 4,718,592 || all params: 742,386,688 || trainable%: 0.6356


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# Train the model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1747,No log
2,0.1745,No log
3,0.1582,No log


TrainOutput(global_step=8760, training_loss=0.21424704109697038, metrics={'train_runtime': 14030.0211, 'train_samples_per_second': 2.497, 'train_steps_per_second': 0.624, 'total_flos': 7.636464145583309e+16, 'train_loss': 0.21424704109697038, 'epoch': 3.0})

In [None]:
model.save_pretrained("./t5_squad_finetuned_sciq")
tokenizer.save_pretrained("./t5_squad_finetuned_sciq")

('./t5_finetuned_sciq/tokenizer_config.json',
 './t5_finetuned_sciq/special_tokens_map.json',
 './t5_finetuned_sciq/spiece.model',
 './t5_finetuned_sciq/added_tokens.json',
 './t5_finetuned_sciq/tokenizer.json')

In [None]:
!zip -r t5_finetuned_sciq.zip ./t5_finetuned_sciq

  adding: t5_finetuned_sciq/ (stored 0%)
  adding: t5_finetuned_sciq/tokenizer_config.json (deflated 95%)
  adding: t5_finetuned_sciq/special_tokens_map.json (deflated 86%)
  adding: t5_finetuned_sciq/tokenizer.json (deflated 74%)
  adding: t5_finetuned_sciq/adapter_config.json (deflated 52%)
  adding: t5_finetuned_sciq/README.md (deflated 66%)
  adding: t5_finetuned_sciq/adapter_model.safetensors (deflated 7%)
  adding: t5_finetuned_sciq/added_tokens.json (stored 0%)
  adding: t5_finetuned_sciq/spiece.model (deflated 48%)


In [None]:
from google.colab import files
files.download("t5_squad_finetuned_sciq.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model and tokenizer
model_dir = "./t5_squad_finetuned_sciq"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

In [3]:
# Load test data
test_df = pd.read_parquet("hf://datasets/allenai/sciq/" + splits["test"])

# Tokenize the test dataset
test_data = preprocess_function(test_df)

# Convert to PyTorch dataset
test_dataset = SciQDataset(test_data)

In [4]:
from torch.utils.data import DataLoader

# Define test DataLoader
test_loader = DataLoader(test_dataset, batch_size=4)

# Generate predictions
model.eval()
predictions = []
references = []

for batch in test_loader:
    input_ids = batch["input_ids"].to("cuda") if torch.cuda.is_available() else batch["input_ids"]
    attention_mask = batch["attention_mask"].to("cuda") if torch.cuda.is_available() else batch["attention_mask"]
    
    with torch.no_grad():
        outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=128)
    
    predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    references.extend(tokenizer.batch_decode(batch["labels"], skip_special_tokens=True))

# Compare predictions with references
for pred, ref in zip(predictions[:5], references[:5]):
    print(f"Prediction: {pred}")
    print(f"Reference: {ref}\n")

Prediction: Compounds that are capable of accepting electrons, such as o 2 or f2, are called what?oxidants
Reference: Compounds that are capable of accepting electrons, such as o 2 or f2, are called what? oxidants

Prediction: What is the term for a genetically exact copy of an organism? cloning
Reference: What term in biotechnology means a genetically exact copy of an organism? clone

Prediction: Vertebrata are characterized by the presence of a what? backbone
Reference: Vertebrata are characterized by the presence of what? backbone

Prediction: What is the height above or below sea level? elevation
Reference: What is the height above or below sea level called? elevation

Prediction: Tree rings, ice cores, and varves indicate the environmental conditions at the time they were made. environmental conditions
Reference: Ice cores, varves and what else indicate the environmental conditions at the time of their creation? tree rings



In [13]:
import evaluate

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Compute the metric
results = rouge.compute(predictions=predictions, references=references)

print(results)

{'rouge1': 0.5156088212587002, 'rouge2': 0.3121115526052973, 'rougeL': 0.4490558848933188, 'rougeLsum': 0.4487582680075713}


In [4]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Example function to extract NER/POS features
def add_ner_pos_tags(text):
    doc = nlp(text)
    ner_tags = [f"{ent.text} ({ent.label_})" for ent in doc.ents]
    pos_tags = [f"{token.text} ({token.pos_})" for token in doc]
    return " ".join(ner_tags), " ".join(pos_tags)

# Apply to dataset
test_df["ner_tags"], test_df["pos_tags"] = zip(*test_df["support"].apply(add_ner_pos_tags))

# Updated input combining NER/POS
test_df["augmented_support"] = test_df.apply(
    lambda row: row["support"] + " " + row["ner_tags"] + " " + row["pos_tags"], axis=1
)