In [None]:
pip install transformers==4.50

In [None]:
import torch
from datasets import DatasetDict, concatenate_datasets, load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    PegasusXConfig,
    PegasusXForConditionalGeneration,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Pick the compute target: prefer the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Set PYTORCH_CUDA_ALLOC_CONF for this kernel process (requests the CUDA
# allocator's "expandable_segments" mode — presumably to limit memory
# fragmentation with long inputs; verify against the PyTorch CUDA notes).
# NOTE(review): this cell runs after `import torch` above — confirm the
# allocator still picks the setting up, or move this cell before the imports.
# Keep comments off the %env line itself: trailing text would become part of the value.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
# Set the activation function to test.
# This name is read by the configuration cell below (passed as
# `activation_function`), so change it here rather than editing later cells.
activation = "relu"
print(f"Using activation function: {activation}")

In [None]:
# Configure PEGASUS-X model with specified activation function.
# Start from the checkpoint's own configuration so that the architecture sizes
# (hidden size, layer counts, vocabulary, ...) describe the pretrained weights
# used for fine-tuning; a bare PegasusXConfig(...) would fall back to the
# library defaults instead. Only the two settings below are overridden.
config = PegasusXConfig.from_pretrained(
    "google/pegasus-x-base",
    max_position_embeddings=12288,         # Maximum sequence length for government reports
    activation_function=activation         # Dynamic activation function selection
)
print(f"Model configuration: max_position_embeddings={config.max_position_embeddings}, activation_function={config.activation_function}")

# Fine-Tuning

In [None]:
# Locations of the two GovReport parquet shards on Kaggle
path1 = "/kaggle/input/govreport/1.parquet"
path2 = "/kaggle/input/govreport/2.parquet"

# Read every shard on its own; each load yields a DatasetDict with a single 'train' split
ds1, ds2 = (load_dataset('parquet', data_files=shard) for shard in (path1, path2))

# Stitch the 'train' splits of both shards into one dataset
combined_dataset = concatenate_datasets([part['train'] for part in (ds1, ds2)])

In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-x-base")

# Token budgets used when truncating reports (inputs) and summaries (labels).
# NOTE(review): reports are cut to 1024 tokens even though the model
# configuration above allows 12288 positions — confirm this is intentional
# (e.g. a memory constraint) before relying on long-document behaviour.
MAX_SOURCE_LENGTH = 1024
MAX_TARGET_LENGTH = 1024

def preprocess_function(examples):
    """
    Preprocess and tokenize government reports and their corresponding summaries.

    Args:
        examples: A batch from the dataset (as passed by ``Dataset.map`` with
            ``batched=True``) containing the text columns ``'report'`` and
            ``'summary'``.

    Returns:
        The tokenized reports (as produced by the tokenizer) with an extra
        ``'labels'`` entry holding the token ids of the tokenized summaries.
        No padding is applied here; sequences are only truncated.
    """
    # Tokenize the reports (input)
    model_inputs = tokenizer(
        examples['report'],
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
    )

    # Tokenize the summaries (labels). `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['summary'],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    # Add the labels to the model inputs
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the preprocessing function to the entire (combined) dataset.
# Previously only `ds1` was mapped, so the second parquet file never reached
# training. The combined data is wrapped in a DatasetDict so that
# `tokenized_dataset` keeps its original {'train': ...} shape.
tokenized_dataset = DatasetDict({'train': combined_dataset}).map(preprocess_function, batched=True)
train_dataset_split = tokenized_dataset['train']

In [None]:
# Load the pretrained model, applying the settings chosen in `config`.
# Keyword arguments that match configuration attributes override the values
# stored with the checkpoint, so the activation function under test (and the
# position limit) actually take effect; without them `config` was never used.
model = PegasusXForConditionalGeneration.from_pretrained(
    "google/pegasus-x-base",
    activation_function=config.activation_function,
    max_position_embeddings=config.max_position_embeddings,
).to(device)

# Initialize data collator for sequence-to-sequence tasks
# (pads inputs and labels per batch, since preprocessing only truncates)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",               # Output directory for model checkpoints
    per_device_train_batch_size=1,        # Batch size per device during training
    num_train_epochs=3,                   # Total number of training epochs
    weight_decay=0.01,                    # Weight decay for regularization
    logging_dir='./logs',                 # Directory for storing logs
    logging_steps=10,                     # Log every N steps
    save_strategy="epoch",                # Save checkpoint at the end of each epoch
    report_to="tensorboard"               # Report metrics to TensorBoard
)

# Initialize the Trainer.
# `processing_class` is the current name for the argument formerly called
# `tokenizer` (deprecated in the pinned transformers release).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_split,
    processing_class=tokenizer,
    data_collator=data_collator
)

In [None]:
# Run fine-tuning; the two status lines bracket the Trainer call so the
# start and end of the run are visible in the cell output
print("Starting model training...")
train_output = trainer.train()
print("Training completed!")

In [None]:
# Save the fine-tuned model.
# The directory is named after the activation function under test so that runs
# with different activations do not overwrite (or get mislabelled as) each other;
# with activation = "relu" this is the same path as before.
output_dir = f"/kaggle/working/{activation}"
trainer.save_model(output_dir)
print(f"Model saved to {output_dir}")