In [1]:
!pip install datasets



In [2]:
!pip install transformers==4.30.0

Collecting transformers==4.30.0
  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
     ---------------------------------------- 0.0/113.6 kB ? eta -:--:--
     --- ------------------------------------ 10.2/113.6 kB ? eta -:--:--
     --- ------------------------------------ 10.2/113.6 kB ? eta -:--:--
     ------ ------------------------------ 20.5/113.6 kB 129.6 kB/s eta 0:00:01
     ---------- -------------------------- 30.7/113.6 kB 145.2 kB/s eta 0:00:01
     ------------- ----------------------- 41.0/113.6 kB 178.6 kB/s eta 0:00:01
     -------------------- ---------------- 61.4/113.6 kB 217.9 kB/s eta 0:00:01
     -------------------------- ---------- 81.9/113.6 kB 254.2 kB/s eta 0:00:01
     -----------------------------------  112.6/113.6 kB 311.2 kB/s eta 0:00:01
     ------------------------------------ 113.6/113.6 kB 300.4 kB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.0)
  Downloading tokenizers-0.13.3.tar.gz (314 k

  error: subprocess-exited-with-error
  
  Building wheel for tokenizers (pyproject.toml) did not run successfully.
  exit code: 1
  
  [49 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build\lib.win-amd64-cpython-312\tokenizers
  copying py_src\tokenizers\__init__.py -> build\lib.win-amd64-cpython-312\tokenizers
  creating build\lib.win-amd64-cpython-312\tokenizers\models
  copying py_src\tokenizers\models\__init__.py -> build\lib.win-amd64-cpython-312\tokenizers\models
  creating build\lib.win-amd64-cpython-312\tokenizers\decoders
  copying py_src\tokenizers\decoders\__init__.py -> build\lib.win-amd64-cpython-312\tokenizers\decoders
  creating build\lib.win-amd64-cpython-312\tokenizers\normalizers
  copying py_src\tokenizers\normalizers\__init__.py -> build\lib.win-amd64-cpython-312\tokenizers\normalizers
  creating build\lib.win-amd64-cpython-312\tokenizers\pre_tokenizers
  copying py_src\tokenizers\pre_tokenizers\__init__.py -> build\lib.win-a

In [3]:
!pip install protobuf==3.20.3




In [4]:
import os
import sys
# import logging
import torch
from dataclasses import dataclass
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)
from datasets import load_from_disk
from typing import Optional




In [None]:
@dataclass
class ModelTrainerConfig:
    """Paths and hyperparameters for the fine-tuning run.

    Every field has a default, so ``ModelTrainerConfig()`` yields a usable
    configuration; pass keyword arguments to override individual values.
    """

    # Output directory for the training run; the final model and tokenizer
    # are saved in sub-directories of it.
    root_dir: str = "../artifacts/model_trainer"
    # Location of the dataset that is read with ``load_from_disk``.
    data_path: str = r"../artifacts/data_transformation/samsum_dataset"
    # Checkpoint identifier used to load both the tokenizer and the model.
    model_ckpt: str = "google/pegasus-cnn_dailymail"
    num_train_epochs: int = 1
    warmup_steps: int = 500
    per_device_train_batch_size: int = 1
    per_device_eval_batch_size: int = 1
    weight_decay: float = 0.01
    logging_steps: int = 10
    evaluation_strategy: str = "steps"
    eval_steps: int = 500
    # A step count, so it is stored as an int (same value as the former 1e6).
    save_steps: int = 1_000_000
    gradient_accumulation_steps: int = 16
    save_total_limit: Optional[int] = 2  # Limit the number of saved checkpoints

# Model Trainer Class
class ModelTrainer:
    """Fine-tunes the configured seq2seq checkpoint and saves the result to disk."""

    def __init__(self, config: "Optional[ModelTrainerConfig]" = None):
        """Store the configuration used by ``initiate_training``.

        Args:
            config: Settings to train with. When omitted, a default
                ``ModelTrainerConfig()`` is created, which matches the
                previous zero-argument behaviour.
        """
        self.model_trainer = config if config is not None else ModelTrainerConfig()

    def initiate_training(self):
        """Load the model and dataset, run training, and save the artefacts.

        Reads every setting from ``self.model_trainer``. After training, the
        model is written to ``<root_dir>/pegasus-samsum-model`` (both via
        ``save_pretrained`` and as a raw ``model_state_dict.bin``) and the
        tokenizer to ``<root_dir>/tokenizer``.

        Returns:
            None. Progress is reported with ``print``; errors from the
            underlying libraries propagate to the caller unchanged.
        """
        cfg = self.model_trainer

        # Set device to GPU if available, else fall back to CPU
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load tokenizer and model from the same checkpoint
        tokenizer = AutoTokenizer.from_pretrained(cfg.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_ckpt).to(device)

        # Data collator for sequence-to-sequence tasks
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

        # Load preprocessed dataset
        print("Loading dataset...")
        dataset_samsum_pt = load_from_disk(cfg.data_path)
        print("Dataset loaded successfully.")

        # Define training arguments using the configuration
        trainer_args = TrainingArguments(
            output_dir=cfg.root_dir,
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            per_device_eval_batch_size=cfg.per_device_eval_batch_size,
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            evaluation_strategy=cfg.evaluation_strategy,
            eval_steps=cfg.eval_steps,
            # int() keeps this working when the config stores the value as a float (e.g. 1e6).
            save_steps=int(cfg.save_steps),
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
            # Declared in the config, so honour it instead of silently ignoring it.
            save_total_limit=cfg.save_total_limit,
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model_pegasus,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_samsum_pt["train"],
            eval_dataset=dataset_samsum_pt["validation"],
        )

        # Train the model
        print("Starting training...")
        trainer.train()

        # Save the fine-tuned model and tokenizer
        model_save_path = os.path.join(cfg.root_dir, "pegasus-samsum-model")
        tokenizer_save_path = os.path.join(cfg.root_dir, "tokenizer")

        model_pegasus.save_pretrained(model_save_path)
        tokenizer.save_pretrained(tokenizer_save_path)

        # Also keep a plain state dict next to the save_pretrained output.
        torch.save(model_pegasus.state_dict(), os.path.join(model_save_path, "model_state_dict.bin"))

        print(f"Model and tokenizer saved to {model_save_path} and {tokenizer_save_path} respectively.")

if __name__ == "__main__":
    # Build the trainer with its default configuration.
    model_trainer_obj = ModelTrainer()

    # Announce the load before it happens so the message matches what is going on,
    # then show the validation split as a quick sanity check of the data on disk.
    print("Dataset loading...")
    dataset = load_from_disk(model_trainer_obj.model_trainer.data_path)
    print(dataset["validation"])

    # Start training
    model_trainer_obj.initiate_training()

Dataset loading...
Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 818
})


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading dataset...
Dataset loaded successfully.


  trainer = Trainer(


Starting training...
