In [1]:
pip install fsspec==2024.9.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers datasets torch accelerate

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

In [4]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/allenai/sciq/" + splits["train"])

In [5]:
df.head()

Unnamed: 0,question,distractor3,distractor1,distractor2,correct_answer,support
0,What type of organism is commonly used in prep...,viruses,protozoa,gymnosperms,mesophilic organisms,"Mesophiles grow best in moderate temperature, ..."
1,What phenomenon makes global winds blow northe...,tropical effect,muon effect,centrifugal effect,coriolis effect,Without Coriolis Effect the global winds would...
2,Changes from a less-ordered state to a more-or...,endothermic,unbalanced,reactive,exothermic,Summary Changes of state are examples of phase...
3,What is the least dangerous radioactive decay?,zeta decay,beta decay,gamma decay,alpha decay,All radioactive decay is dangerous to living t...
4,Kilauea in hawaii is the world’s most continuo...,magma,greenhouse gases,carbon and smog,smoke and ash,Example 3.5 Calculating Projectile Motion: Hot...


In [6]:
from datasets import load_dataset

# Load the SciQ dataset
sciq = load_dataset("sciq")

# Inspect the structure
print(sciq["train"][0])  # Example record

{'question': 'What type of organism is commonly used in preparation of foods such as cheese and yogurt?', 'distractor3': 'viruses', 'distractor1': 'protozoa', 'distractor2': 'gymnosperms', 'correct_answer': 'mesophilic organisms', 'support': 'Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.'}


In [7]:
def preprocess_data(example):
    return {
        "input_text": f"Generate question and answer: {example['support']}",
        "target_text": f"{example['question']} <sep> {example['correct_answer']}"
    }

train_data = sciq["train"].map(preprocess_data, remove_columns=sciq["train"].column_names)
validation_data = sciq["validation"].map(preprocess_data, remove_columns=sciq["validation"].column_names)
test_data = sciq["test"].map(preprocess_data, remove_columns=sciq["test"].column_names)


In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("potsawee/t5-large-generation-race-QuestionAnswer")
model = AutoModelForSeq2SeqLM.from_pretrained("potsawee/t5-large-generation-race-QuestionAnswer")

In [9]:
def tokenize_data(batch):
    inputs = tokenizer(batch["input_text"], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(batch["target_text"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_train = train_data.map(tokenize_data, batched=True)
tokenized_validation = validation_data.map(tokenize_data, batched=True)
tokenized_test = test_data.map(tokenize_data, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
from torch.utils.data import DataLoader

train_loader = DataLoader(tokenized_train, batch_size=8, shuffle=True)
validation_loader = DataLoader(tokenized_validation, batch_size=8)

In [11]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./t5_finetuned_sciq",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer
)




In [12]:
trainer.train()

  0%|          | 0/4380 [00:00<?, ?it/s]

: 

In [None]:
results = trainer.evaluate(tokenized_test)
print(results)

In [None]:
model.save_pretrained("./t5_finetuned_sciq")
tokenizer.save_pretrained("./t5_finetuned_sciq")

In [None]:
def generate_qa(context):
    input_text = f"Generate question and answer: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
    question_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    question, answer = question_answer.split("<sep>")
    return question.strip(), answer.strip()

# Example context
new_context = "Photosynthesis is the process by which green plants use sunlight to synthesize foods with the help of chlorophyll."
question, answer = generate_qa(new_context)
print("Generated Question:", question)
print("Generated Answer:", answer)
