In [1]:
import os
%pwd


'g:\\PROJECT\\Text-Summarizer\\research'

In [2]:
# Move from the notebook folder (research/) up to the project root so the
# `src...` imports and the relative config paths used below resolve.
# Guarded so that re-running this cell does not climb one directory higher
# on every execution (the unguarded chdir is not idempotent).
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [29]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class ModelTrainerConfig:
    """Settings for the model-training stage.

    ``ConfigurationManager.get_model_trainer_config`` fills the first three
    fields from the ``model_trainer`` section of the project config and the
    remaining ones from the ``TrainingArguments`` section of the params file.
    The hyperparameter field names match the keyword arguments of
    ``transformers.TrainingArguments``.
    """

    # Stage output directory; also used as the Trainer ``output_dir``.
    root_dir: Path
    # Location of the dataset that is read with ``load_from_disk``.
    data_path: Path
    # Checkpoint handed to ``from_pretrained``. It is stored as given (not
    # wrapped in ``Path``), hence ``str`` rather than ``Path``.
    model_ckpt: str
    num_train_epochs: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    gradient_accumulation_steps: int
    warmup_steps: int
    learning_rate: float
    logging_steps: int
    # Mixed-precision switch: a boolean flag, not a string.
    fp16: bool
    save_strategy: str
    report_to: str
    dataloader_num_workers: int
    


In [30]:
from src.TextSummarizer.constants import *
from src.TextSummarizer.utils.common import read_yaml, create_directories

In [31]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self,
                 config_path=CONFIG_FILE_PATH,
                 params_path=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_path: YAML file with the ``artifacts_root`` entry and the
                per-stage sections.
            params_path: YAML file with the training hyperparameters.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` for the training stage.

        ``root_dir`` and ``data_path`` are wrapped in ``Path``; ``model_ckpt``
        and every hyperparameter are copied through unchanged.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        trainer_section = self.config.model_trainer
        training_args = self.params.TrainingArguments

        # Hyperparameters copied one-to-one, by name, from the params file.
        hyperparameter_names = (
            "num_train_epochs",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "gradient_accumulation_steps",
            "warmup_steps",
            "learning_rate",
            "logging_steps",
            "fp16",
            "save_strategy",
            "report_to",
            "dataloader_num_workers",
        )

        return ModelTrainerConfig(
            root_dir=Path(trainer_section.root_dir),
            data_path=Path(trainer_section.data_path),
            model_ckpt=trainer_section.model_ckpt,
            **{name: getattr(training_args, name) for name in hyperparameter_names},
        )

In [32]:
import os
from src.TextSummarizer.logging import logger
from transformers import AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

#### MODEL TRAINER COMPONENT (MODEL TRAINING)

In [33]:
class ModelTrainer():
    """Fine-tunes a seq2seq checkpoint with the Hugging Face ``Trainer``.

    All paths and hyperparameters are read from the ``ModelTrainerConfig``
    supplied at construction time.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Paths, checkpoint and hyperparameters for this stage.
        """
        self.config = config

    def _build_training_arguments(self, device):
        """Translate ``self.config`` into a ``TrainingArguments`` object.

        Args:
            device: ``"cuda"`` or ``"cpu"``, as selected in ``train``.
        """
        cfg = self.config
        return TrainingArguments(
            output_dir=cfg.root_dir,
            num_train_epochs=cfg.num_train_epochs,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            per_device_eval_batch_size=cfg.per_device_eval_batch_size,
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
            warmup_steps=cfg.warmup_steps,
            learning_rate=cfg.learning_rate,
            logging_steps=cfg.logging_steps,
            # fp16 mixed precision is a GPU feature and may be rejected on a
            # CPU-only host, so it is only enabled when CUDA was selected.
            fp16=bool(cfg.fp16) and device == "cuda",
            save_strategy=cfg.save_strategy,
            report_to=cfg.report_to,
            dataloader_num_workers=cfg.dataloader_num_workers,
        )

    def train(self):
        """Fine-tune the configured checkpoint and save the result.

        Loads the tokenizer and model from ``config.model_ckpt``, reads the
        dataset from ``config.data_path``, trains with hyperparameters taken
        from the config, then writes the model to ``<root_dir>/pegasus-model``
        and the tokenizer to ``<root_dir>/tokenizer``.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        # Dataset previously written to disk; indexed by split name below.
        ds = load_from_disk(self.config.data_path)

        # Hyperparameters come from the config rather than in-code literals,
        # so editing the params file actually changes the run.
        trainer_args = self._build_training_arguments(device)

        # NOTE(review): training uses the "test" split. This looks like a
        # shortcut to keep runs short; switch to ds["train"] for a real
        # fine-tuning run. Confirm intent before changing.
        trainer = Trainer(model=model, args=trainer_args,
                          tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                          train_dataset=ds["test"],
                          eval_dataset=ds["validation"])
        trainer.train()

        model.save_pretrained(os.path.join(self.config.root_dir, "pegasus-model"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))


In [34]:
# Run the training stage end to end: load the YAML configuration, extract the
# model-trainer settings, and hand them to the trainer.
config=ConfigurationManager()
model_trainer_config=config.get_model_trainer_config()
model_trainer=ModelTrainer(model_trainer_config)
# Long-running step: fine-tunes the model, then saves the model and tokenizer
# under the configured root_dir.
model_trainer.train()

[2025-12-10 17:33:42,051: INFO: common: yaml file config\config.yaml loaded successfully:]
[2025-12-10 17:33:42,065: INFO: common: yaml file params.yaml loaded successfully:]
[2025-12-10 17:33:42,067: INFO: common: created directory atartifacts:]


  trainer = Trainer(model=model2, args=trainer_args,
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss


KeyboardInterrupt: 