In [None]:
pip install transformers==4.50

In [None]:
import torch
from transformers import (
    PegasusXConfig, 
    PegasusXForConditionalGeneration, 
    AutoTokenizer,
    TrainingArguments, 
    Trainer, 
    DataCollatorForSeq2Seq
)
from datasets import load_dataset, concatenate_datasets

# Pick the compute device: the GPU when PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Set PYTORCH_CUDA_ALLOC_CONF for this kernel process via the %env magic.
# NOTE(review): presumably this has to be in place before PyTorch performs its
# first CUDA allocation in order to take effect — confirm.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
# Activation function under test. The same value is passed to the model
# configuration as `activation_function` and is used as the name of the
# directory under /kaggle/working that the fine-tuned model is saved to.
activation = "gelu"
print(f"Using activation function: {activation}")

In [None]:
# Configure PEGASUS-X model with specified activation function.
# Start from the configuration published with the checkpoint that is fine-tuned
# below, so that every architectural dimension matches the pretrained weights,
# and override only the settings under test. A bare PegasusXConfig() would use
# the library's default dimensions instead; together with
# ignore_mismatched_sizes=True at load time, any weight whose shape does not
# match would be re-initialised rather than loaded from the checkpoint.
config = PegasusXConfig.from_pretrained(
    "google/pegasus-x-base",
    max_position_embeddings=16384,         # Maximum sequence length for long documents
    activation_function=activation         # Dynamic activation function selection
)
print(f"Model configuration: max_position_embeddings={config.max_position_embeddings}, activation_function={config.activation_function}")

# Fine Tuning

In [None]:
# Locations of the two BigPatent training shards (parquet)
path1, path2 = (
    "/kaggle/input/bigpatent/train_1.parquet",
    "/kaggle/input/bigpatent/train_2.parquet",
)

# Read each shard on its own; load_dataset exposes the rows under the 'train' key
ds1, ds2 = (
    load_dataset('parquet', data_files=shard_path)
    for shard_path in (path1, path2)
)

# Stack the two 'train' splits into one dataset
ds = concatenate_datasets([shard['train'] for shard in (ds1, ds2)])

In [None]:
# Load the tokenizer published with the "google/pegasus-x-base" checkpoint.
# It is used by the preprocessing function, the data collator and the Trainer.
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-x-base")

def preprocess_function(examples, max_input_length=256, max_target_length=256):
    """
    Tokenize a batch of descriptions and their corresponding abstracts.

    Args:
        examples: Batch mapping with a 'description' entry (the model inputs)
            and an 'abstract' entry (the summarisation targets), each a list
            of strings.
        max_input_length: Token limit for each description; longer inputs are
            truncated.
        max_target_length: Token limit for each abstract; longer targets are
            truncated.

    Returns:
        The tokenizer output for the descriptions, with an added 'labels'
        entry holding the token ids of the abstracts.
    """
    # Tokenize the descriptions
    model_inputs = tokenizer(
        examples['description'],
        max_length=max_input_length,
        truncation=True,
    )

    # Tokenize the abstracts as targets. The `text_target` argument replaces
    # the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['abstract'],
        max_length=max_target_length,
        truncation=True,
    )

    # Add the labels to the model inputs
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply the preprocessing function to the entire dataset, in batches.
# `ds` is a single Dataset (the result of concatenate_datasets), so the mapped
# result already is the training set; indexing it with 'train' would look up a
# column of that name and fail.
tokenized_dataset = ds.map(preprocess_function, batched=True)
train_dataset_split = tokenized_dataset

In [None]:
# Instantiate PEGASUS-X from the pretrained checkpoint using the custom config,
# then move it to the selected device. ignore_mismatched_sizes=True lets
# loading continue when a checkpoint tensor's shape differs from the config.
model = PegasusXForConditionalGeneration.from_pretrained(
    "google/pegasus-x-base",
    config=config,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

# Batch collator for sequence-to-sequence training
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# Hyperparameters and bookkeeping options for the Trainer
training_args = TrainingArguments(
    # Where outputs go
    output_dir="./results",
    logging_dir='./logs',
    # Optimisation settings
    num_train_epochs=3,
    per_device_train_batch_size=8,
    weight_decay=0.01,
    # Logging and checkpointing cadence
    logging_steps=10,
    save_strategy="epoch",
    report_to="tensorboard",
)

In [None]:
# Initialize the Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in the transformers release pinned at the top of this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_split,
    processing_class=tokenizer,
    data_collator=data_collator
)

In [None]:
# Start the fine-tuning process: run the Trainer built in the previous cell,
# printing a status message before and after the run.
print("Starting model training...")
trainer.train()
print("Training completed!")

In [None]:
# Create ./results if it is missing (-p: no error when it already exists).
# NOTE(review): presumably here so that the `rm -r results` in the following
# cell cannot fail on a missing directory — confirm.
!mkdir -p results

In [None]:
# Recursively delete ./results, which is the Trainer's output_dir, together
# with everything stored inside it.
# NOTE(review): presumably done to free disk space before the final model is
# saved — confirm that nothing in ./results is still needed.
!rm -r results

In [None]:
# Save the fine-tuned model into a directory under /kaggle/working that is
# named after the activation function under test.
output_dir = f"/kaggle/working/{activation}"
trainer.save_model(output_dir)
print(f"Model saved to {output_dir}")