In [None]:
pip install transformers==4.50

In [None]:
import torch
from transformers import PegasusXConfig, PegasusXForConditionalGeneration, pipeline, AutoTokenizer
# Pick the compute target once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Export PYTORCH_CUDA_ALLOC_CONF for this kernel process, requesting the
# `expandable_segments:True` option of PyTorch's CUDA caching allocator.
# NOTE(review): this cell runs after `import torch` in an earlier cell; presumably the
# setting still applies because no CUDA memory has been allocated yet — confirm.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
# Start from the checkpoint's own configuration and override only the two fields under
# experiment. A bare `PegasusXConfig(...)` falls back to the library defaults for every
# other field (hidden size, layer count, ...), which do not match
# `google/pegasus-x-base`; combined with `ignore_mismatched_sizes=True` at load time the
# mismatched tensors would be freshly initialised, so training would not actually start
# from the pretrained weights.
config = PegasusXConfig.from_pretrained(
    "google/pegasus-x-base",
    max_position_embeddings=512,
    activation_function="silu",
)

# Fine Tuning

In [None]:
from datasets import load_dataset

# Location of the CNN/DailyMail training shard on the Kaggle input mount.
ds_path = "/kaggle/input/cnndailymail/train.parquet"

# Read the parquet file through the generic `parquet` loader.
ds = load_dataset(
    "parquet",
    data_files=ds_path,
)


In [None]:
from transformers import AutoTokenizer

# Tokenizer published alongside the PEGASUS-X base checkpoint; used for both the
# articles and the reference summaries.
tokenizer = AutoTokenizer.from_pretrained(
    "google/pegasus-x-base",
)

# Preprocessing: turn raw article/summary text into model inputs and labels
def preprocess_function(examples, max_input_length=128, max_target_length=128):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping that provides an 'article' column (source text)
            and a 'highlights' column (reference summaries).
        max_input_length: Token limit for each article; longer articles are
            truncated. Defaults to 128.
        max_target_length: Token limit for each summary; longer summaries are
            truncated. Defaults to 128.

    Returns:
        The tokenizer's encoding of the articles, extended with a 'labels'
        entry that holds the token ids of the tokenized summaries.
    """
    # Encode the source articles (encoder input). No padding is requested here.
    model_inputs = tokenizer(
        examples['article'],
        max_length=max_input_length,
        truncation=True,
    )

    # Encode the summaries as targets. `text_target=` is the supported replacement
    # for the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['highlights'],
        max_length=max_target_length,
        truncation=True,
    )

    # Attach the summary token ids as the training labels
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize the loaded dataset in batches, then keep a handle on its 'train' split.
tokenized_dataset = ds.map(
    preprocess_function,
    batched=True,
)
train_dataset_split = tokenized_dataset["train"]

In [None]:
import torch
from transformers import PegasusXForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Build PEGASUS-X from the base checkpoint using the custom `config`.
# `ignore_mismatched_sizes=True` lets loading proceed when `config` implies tensor
# shapes that differ from the checkpoint; such tensors are newly initialised rather
# than loaded.
model = PegasusXForConditionalGeneration.from_pretrained(
    "google/pegasus-x-base",
    config=config,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

# Seq2seq collator built from the tokenizer and the model; it assembles each batch
# from the tokenized (unpadded) examples.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# Hyper-parameters and bookkeeping for the training run
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./results",
    logging_dir='./logs',
    # log every 10 steps, save a checkpoint once per epoch
    logging_steps=10,
    save_strategy="epoch",
    # send metrics to TensorBoard
    report_to="tensorboard",
    # schedule: three epochs, one example per device per step
    num_train_epochs=3,
    per_device_train_batch_size=1,
)





In [None]:
# Cap the training set at 16,000 examples (presumably to bound training time).
# `min(...)` keeps the selection valid when the split holds fewer rows than the cap.
train_dataset_split = train_dataset_split.select(
    range(min(16000, len(train_dataset_split)))
)

# Assemble the Trainer. `processing_class=` is the current name for what used to be
# passed as `tokenizer=`, which is deprecated in the pinned transformers release.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_split,
    processing_class=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Start fine-tuning!
# Runs the training loop of the `trainer` built above; the call's return value is
# left as the cell's last expression so the notebook displays it.
trainer.train()

In [None]:
# Create ./results if it is missing (`-p` makes this a no-op when it already exists).
# NOTE(review): presumably here so that the directory removal in the following cell
# never runs against a missing path — confirm.
!mkdir -p results

In [None]:
!rm -r results

In [None]:
# Write the trained model to the Kaggle working directory via the Trainer.
output_dir = "/kaggle/working/silu"
trainer.save_model(
    output_dir,
)

# Testing

In [None]:
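# Inference sketch (left disabled): reload the saved checkpoint and summarize a sample.
# NOTE: `test_text` is not defined in the visible cells — assign an article string to it
# before enabling this cell.
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM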
# model_ckpt = "/kaggle/working/silu/"
# tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

# pipe = pipeline('summarization', model = model_ckpt)
# pipe_out = pipe(test_text)
# print(pipe_out)