In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import LoraConfig, get_peft_model, PeftModel
import pandas as pd
import torch
import spacy
from torch.utils.data import Dataset
import evaluate
import numpy as np

In [5]:
# Load the model and tokenizer
model_name = "potsawee/t5-large-generation-squad-QuestionAnswer"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [6]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank of low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["q", "v"],  # Fine-tune attention layers
    lora_dropout=0.2,
    bias="none"
)

In [7]:
# Apply LoRA to the model
model = get_peft_model(model, lora_config)

In [8]:
# Check trainable parameters
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 740,027,392 || trainable%: 0.3188


In [10]:
# Load dataset using pandas
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet'
}
train_df = pd.read_parquet("hf://datasets/allenai/sciq/" + splits["train"])
validation_df = pd.read_parquet("hf://datasets/allenai/sciq/" + splits["validation"])

In [12]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz (125.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy<3.1.0,>=3.0.1 (from en_core_sci_md==0.4.0)
  Downloading spacy-3.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting thinc<8.1.0,>=8.0.3 (from spacy<3.1.0,>=3.0.1->en_core_sci_md==0.4.0)
  Downloading thinc-8.0.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wasabi<1.1.0,>=0.8.1 (from spacy<3.1.0,>=3.0.1->en_core_sci_md==0.4.0)
  Downloading wasabi-0.10.1-py3-none-any.whl.metadata (28 kB)
Collecting typer<0.4.0,>=0.3.0 (from spacy<3.1.0,>=3.0.1->en_core_sci_md==0.4.0)
  Downloading typer-0.

In [11]:
# Load SciSpacy model
nlp = spacy.load("en_core_sci_md")

def preprocess_with_ner_pos(df):
    # Check if required columns are present
    required_columns = {"support", "question", "correct_answer"}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"The DataFrame must contain the following columns: {required_columns}")

    inputs = []
    targets = []

    for idx, row in df.iterrows():
        # Original input and target
        support_text = row["support"]
        question = row["question"]
        correct_answer = row["correct_answer"]

        # Perform NER and POS tagging
        doc = nlp(support_text)
        tokens = [token.text for token in doc]
        pos_tags = [token.pos_ for token in doc]

        # Align NER tags with tokens
        ner_tags = ["O"] * len(doc)
        for ent in doc.ents:
            for i in range(ent.start, ent.end):
                ner_tags[i] = ent.label_

        # Format the input with NER and POS tags
        annotated_support = (
            f"{support_text} "
            f"[NER] {' '.join(ner_tags)} "
            f"[POS] {' '.join(pos_tags)}"
        )

        # Append to inputs and targets
        inputs.append(annotated_support)
        targets.append(f"{question} <sep> {correct_answer}")

    # Tokenize inputs and targets
    model_inputs = tokenizer(
        inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt"
    )
    labels = tokenizer(
        targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

OSError: [E050] Can't find model 'en_core_sci_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
train_data = preprocess_with_ner_pos(train_df)
validation_data = preprocess_with_ner_pos(validation_df)

In [None]:
# Convert to PyTorch dataset
class SciQDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: val[idx].clone().detach() for key, val in self.encodings.items()}

In [None]:
train_dataset = SciQDataset(train_data)
validation_dataset = SciQDataset(validation_data)

In [None]:
# Load metrics
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects line-separated text
    decoded_preds = ["\n".join(pred.split()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.split()) for label in decoded_labels]

    # Compute metrics
    rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Extract a single value for each metric
    result = {
        "rouge1": rouge_result["rouge1"].mid.fmeasure,
        "rouge2": rouge_result["rouge2"].mid.fmeasure,
        "rougeL": rouge_result["rougeL"].mid.fmeasure
    }
    return result

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [None]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_lora_sciq",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    eval_accumulation_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",  # Use one of the computed metrics
    greater_is_better=True,          # Ensure this aligns with the chosen metric
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none"
)

# Define the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [None]:
# Train the model
trainer.train()

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss




ValueError: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['what', 'is', 'the', ..., 'the', 'the', 'the'],
Input references: [['who', 'proposed', 'the', 'theory', 'of', 'evolution', 'by', 'natural', 'selection?', '<', 'sep', '>', 'darwin']]

In [None]:
model.save_pretrained("./t5_squad_ner-ft_sciq")
tokenizer.save_pretrained("./t5_squad_ner-ft_sciq")

In [None]:
!zip -r t5_finetuned_sciq.zip ./t5_finetuned_sciq

In [None]:
from google.colab import files
files.download("t5_squad_ner-ft_sciq.zip")