In [None]:
!pip install transformers sentencepiece datasets

In [None]:
# Read data from training session

import json

# Define the path to your JSON file
json_file_path = '/content/training.json'

# Load the training records. Each failure mode prints a readable hint and then
# re-raises: every later cell needs `train_data`, so continuing after a failed
# load would only surface as a confusing NameError further down.
try:
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)
except FileNotFoundError:
    print(f"Error: The file '{json_file_path}' was not found.")
    raise
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from '{json_file_path}'. Check file format.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    print("Data loaded successfully from:", json_file_path)
    # Show only a small preview; printing the whole file floods the cell output.
    if isinstance(train_data, list):
        print(f"{len(train_data)} records loaded; showing the first {min(3, len(train_data))}:")
        for record in train_data[:3]:
            print(record)
    else:
        print(f"Top-level JSON value is a {type(train_data).__name__}, not a list of records.")

In [None]:
# Convert to dataset
from datasets import Dataset
# Fail early with a clear message instead of an obscure tokenizer error later:
# the preprocessing step reads the "input_text" and "target_text" fields of
# every record, so each record must be a dict that provides both.
if not isinstance(train_data, list) or not train_data:
    raise ValueError("train_data must be a non-empty list of records.")
required_keys = {"input_text", "target_text"}
bad_rows = [
    idx for idx, record in enumerate(train_data)
    if not isinstance(record, dict) or not required_keys <= record.keys()
]
if bad_rows:
    raise ValueError(
        f"{len(bad_rows)} record(s) lack {sorted(required_keys)}; "
        f"first offending index: {bad_rows[0]}"
    )
train_dataset = Dataset.from_list(train_data)

In [None]:
# Preprocess data
from transformers import T5Tokenizer

# Tokenizer for the "t5-small" checkpoint; shared by preprocessing and generation.
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path="t5-small",
)

def preprocess(batch):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        batch: Mapping with the keys "input_text" and "target_text", each
            holding the texts for the batch (this function is applied with
            ``batched=True``).

    Returns:
        The tokenizer output for the inputs (padded/truncated to 128 tokens)
        with an added "labels" entry: the target token ids (padded/truncated
        to 64 tokens) where every padding position is replaced by -100.
    """
    model_inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=64)

    # -100 is the index the loss ignores. Leaving the pad id in place would
    # train the model to predict padding and skew the reported loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the dataset batch-wise; the result carries the tokenizer outputs
# plus the "labels" column produced by preprocess.
train_dataset = train_dataset.map(
    function=preprocess,
    batched=True,
)

In [None]:
# Load model and trainer
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Start from the pretrained "t5-small" weights; fine-tuning updates this object in place.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Training configuration: checkpoints go to ./qg_model every 200 steps and the
# loss is logged every 10 steps.
# No evaluation set is used. The evaluation strategy is deliberately left at
# its default ("no") instead of passing `evaluation_strategy="no"`: newer
# transformers releases renamed that argument to `eval_strategy` and reject the
# old keyword, so omitting it keeps this cell working across versions.
training_args = TrainingArguments(
    output_dir="./qg_model",
    per_device_train_batch_size=4,
    num_train_epochs=4,
    logging_steps=10,
    save_steps=200,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [None]:
# Train the model
# Runs fine-tuning with the settings held by `trainer` (its TrainingArguments
# and train_dataset). As the last expression of the cell, the value returned
# by train() is shown as the cell output.
trainer.train()


In [None]:
# Save model
# Write the fine-tuned model first, then the tokenizer, into the same folder
# so the pair can be reloaded together from "qg_t5_model".
model.save_pretrained(save_directory="qg_t5_model")
tokenizer.save_pretrained(save_directory="qg_t5_model")

In [None]:
# Test / Generate question
def generate_question(paragraph):
    """Generate a question for ``paragraph`` with the notebook's global model.

    Args:
        paragraph: Source text as a string. It is prefixed with
            "generate question: " and truncated to 256 tokens.

    Returns:
        The decoded text of the top beam, with special tokens removed.

    Note:
        Calls ``model.eval()``, so the global model is left in inference mode.
    """
    # NOTE(review): preprocess tokenizes `input_text` as-is (max 128 tokens);
    # confirm the training inputs already carry this same prefix.
    text = "generate question: " + paragraph
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=256, truncation=True)
    # The encoded ids are created on CPU, while training may have moved the
    # model to an accelerator; generate() needs both on the same device.
    inputs = inputs.to(model.device)

    # Training leaves the model in train mode (dropout active); switch to
    # inference mode so beam search is not perturbed by dropout.
    model.eval()
    output = model.generate(
        inputs,
        max_length=64,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test example
# Smoke test: generate a question for one sample paragraph and print it.
paragraph = "The cheetah is the fastest land animal and can run at speeds up to 120 km/h."
generated = generate_question(paragraph)
print(generated)