In [1]:
import os

In [2]:
%pwd

'd:\\Aiprojects\\Textsummarization\\text-summarization\\Notebooks'

In [3]:
os.chdir('../')
%pwd

'd:\\Aiprojects\\Textsummarization\\text-summarization'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelTrainerConfig:
    #config 
    root_dir: Path
    data_path: Path
    model_ckpt: str
    #params
    num_train_epochs: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    learning_rate: float
    warmup_steps: int
    weight_decay: float
    eval_strategy: str
    save_strategy: str
    save_total_limit: int
    logging_steps: int
    eval_steps: int
    load_best_model_at_end: bool
    metric_for_best_model: str
    gradient_accumulation_steps: int

In [5]:
from src.constants import * 
from src.utils import read_yaml,create_directory

In [6]:
class ConfigurationManager:
    def __init__(self,config_filepath = CONFIG_FILE_PATH , params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directory([self.config.artifacts_root])


    def get_model_trainer_config(self) ->ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.TrainingParameters

        create_directory([config.root_dir])
        config = ModelTrainerConfig(
                    root_dir = config.root_dir,
                    data_path = config.data_path,
                    model_ckpt = config.model_ckpt,
                    num_train_epochs = params.num_train_epochs,
                    per_device_train_batch_size = params.per_device_train_batch_size,
                    per_device_eval_batch_size = params.per_device_eval_batch_size,
                    learning_rate = params.learning_rate,
                    warmup_steps = params.warmup_steps,
                    weight_decay = params.weight_decay,
                    eval_strategy = params.eval_strategy,
                    save_strategy = params.save_strategy,
                    save_total_limit = params.save_total_limit,
                    logging_steps = params.logging_steps,
                    eval_steps = params.eval_steps,
                    load_best_model_at_end = params.load_best_model_at_end,
                    metric_for_best_model = params.metric_for_best_model,
                    gradient_accumulation_steps = params.gradient_accumulation_steps,
                )
        return config



In [7]:
import torch
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM , Trainer , TrainingArguments , DataCollatorForSeq2Seq
from datasets import load_from_disk
from src.logging.logger import get_logger
from src.Exception import CustomException
import sys
from dataclasses import asdict
from src.utils import get_safe_batch_size
logging = get_logger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
class ModelTrainer:
    def __init__(self,config:ModelTrainerConfig):

        self.config = config


    def train(self):
        device = 'cuda' if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer , model = model_pegasus)
        dataset = load_from_disk(self.config.data_path)

        dic_debug = asdict(self.config)

        for key, value in dic_debug.items():
            print(key, value, type(value))

        training_args = TrainingArguments(
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    learning_rate = self.config.learning_rate,
    warmup_steps = self.config.warmup_steps,
    weight_decay = self.config.weight_decay,
    eval_strategy = self.config.eval_strategy,
    save_strategy = self.config.save_strategy,
    save_total_limit = self.config.save_total_limit,
    logging_steps = self.config.logging_steps,
    eval_steps = self.config.eval_steps,
    load_best_model_at_end = self.config.load_best_model_at_end,
    metric_for_best_model = self.config.metric_for_best_model,
    gradient_accumulation_steps = self.config.gradient_accumulation_steps,
    num_train_epochs = self.config.num_train_epochs,
    output_dir = self.config.root_dir,
    fp16=True
)

        trainer = Trainer(model = model_pegasus , args = training_args ,
                           tokenizer = tokenizer , data_collator=data_collator ,
                           train_dataset=dataset["train"],
                           eval_dataset=dataset["validation"]
                           )
        trainer.train()
        logging.info("model training finished")
        model_pegasus.save_pretrained(os.path.join(self.config.root_dir,"pegasus-samsum-model"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir,"tokenizer"))
        logging.info("model and tokenizer saved")
        


In [9]:
try : 
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()

except Exception as e :
    error = CustomException(e,sys)
    logging.error(error)
    raise error

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model = model_pegasus , args = training_args ,
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


root_dir artifacts/model_trainer <class 'str'>
data_path artifacts/data_transformation/samsum_dataset_transformed <class 'str'>
model_ckpt google/pegasus-cnn_dailymail <class 'str'>
num_train_epochs 1 <class 'int'>
per_device_train_batch_size 1 <class 'int'>
per_device_eval_batch_size 1 <class 'int'>
learning_rate 0.0001 <class 'float'>
warmup_steps 200 <class 'int'>
weight_decay 0.01 <class 'float'>
eval_strategy steps <class 'str'>
save_strategy steps <class 'str'>
save_total_limit 3 <class 'int'>
logging_steps 10 <class 'int'>
eval_steps 100 <class 'int'>
load_best_model_at_end True <class 'bool'>
metric_for_best_model rouge2 <class 'str'>
gradient_accumulation_steps 16 <class 'int'>


Step,Training Loss,Validation Loss
100,1.9226,1.648633


CustomException: Error in [C:\Users\LOQ\AppData\Local\Temp\ipykernel_23336\1696899842.py] , line : [5] : "The `metric_for_best_model` training argument is set to 'eval_rouge2', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_loss']. Consider changing the `metric_for_best_model` via the TrainingArguments." 