In [1]:
!pwd

/home/sagemaker-user/Text-Summarizer/notebooks


In [2]:
# Step up from the notebooks/ directory to its parent, the project root
# (presumably so that the `src` package imports below resolve - confirm).
# NOTE: not idempotent - every re-run of this cell moves up one more level,
# so run it exactly once per kernel session.
%cd ..

/home/sagemaker-user/Text-Summarizer


# Configuration

In [3]:
from dataclasses import dataclass

In [4]:
@dataclass
class TrainerConfig:
    """Settings consumed by the fine-tuning step.

    Fields prefixed ``config_`` carry paths and model identifiers; fields
    prefixed ``params_`` carry the values forwarded to ``TrainingArguments``.
    """
    # Output directory of the fine-tuning step: used as the Trainer output
    # directory and as the parent of the saved "model" / "tokenizer" folders.
    config_path: str
    # Identifier handed to AutoModelForSeq2SeqLM.from_pretrained.
    config_model_name: str
    # Identifier handed to AutoTokenizer.from_pretrained.
    config_tokenizer_name: str
    # Location of the tokenized dataset read with datasets.load_from_disk.
    config_tokenized_data_path: str
    # Forwarded as num_train_epochs.
    params_epochs: int
    # Forwarded as warmup_steps.
    params_warmup_steps: int
    # Used for both the per-device train and per-device eval batch size.
    params_batch_size: int
    # Forwarded as weight_decay.
    params_weight_decay: float
    # Forwarded as logging_steps.
    params_logging_steps: int
    # Forwarded as evaluation_strategy.
    params_evaluation_strategy: str
    # Forwarded as eval_steps.
    params_eval_steps: int
    # Forwarded as save_steps (cast to int by the consumer).
    params_save_steps: int
    # Forwarded as gradient_accumulation_steps.
    params_gradient_accumulation_steps: int

# Configuration Manager

In [5]:
from src.config import ConfigManager
from src.utils import create_dirs
from os.path import join

In [6]:
class ConfigManager(ConfigManager):
    """Notebook-local extension of the imported ``ConfigManager``.

    Adds a factory method that produces the ``TrainerConfig`` for the
    fine-tuning step.
    """

    def get_trainer_config(self) -> "TrainerConfig":
        """Assemble the ``TrainerConfig`` for fine-tuning.

        Reads the ``fine_tuning`` and ``tokenization`` sections of
        ``self.config`` and the ``TrainingArgs`` section of ``self.params``,
        creates the step's output directory under ``self.artifacts`` via
        ``create_dirs``, and returns the populated config object.
        """
        fine_tuning = self.config.fine_tuning
        tokenization = self.config.tokenization
        training_args = self.params.TrainingArgs

        output_dir = join(self.artifacts, fine_tuning.folder)
        create_dirs([output_dir])

        # Every name below is read from `training_args` and stored on the
        # TrainerConfig under the same name with a "params_" prefix.
        hyperparameter_names = (
            "epochs",
            "warmup_steps",
            "batch_size",
            "weight_decay",
            "logging_steps",
            "evaluation_strategy",
            "eval_steps",
            "save_steps",
            "gradient_accumulation_steps",
        )

        return TrainerConfig(
            config_path=output_dir,
            config_model_name=fine_tuning.model_name,
            config_tokenizer_name=tokenization.model_name,
            config_tokenized_data_path=join(self.artifacts, tokenization.folder),
            **{
                f"params_{name}": getattr(training_args, name)
                for name in hyperparameter_names
            },
        )

# Model training

In [7]:
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
import torch

[2024-10-08 19:39:59,089: INFO: config: PyTorch version 2.3.1.post300 available.]
[2024-10-08 19:39:59,091: INFO: config: TensorFlow version 2.17.0 available.]


2024-10-08 19:40:03.392873: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-08 19:40:03.405784: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-08 19:40:03.409873: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-08 19:40:03.420348: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
class Model:
    """Fine-tunes a sequence-to-sequence model as described by a ``TrainerConfig``."""

    def __init__(self, config: TrainerConfig):
        """Keep the trainer configuration for later use by ``fit``."""
        self.config = config

    def fit(self):
        """Fine-tune the configured model and persist the result.

        Loads the tokenized dataset from disk, loads the tokenizer and the
        pretrained seq2seq model, trains on the "train" split while evaluating
        on the "validation" split, then writes the model to
        ``<config_path>/model`` and the tokenizer to ``<config_path>/tokenizer``.
        Returns nothing.
        """
        cfg = self.config

        dataset = load_from_disk(cfg.config_tokenized_data_path)
        tokenizer = AutoTokenizer.from_pretrained(cfg.config_tokenizer_name)

        # Use the GPU when PyTorch can see one, otherwise stay on the CPU.
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(cfg.config_model_name).to(device)

        collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=seq2seq_model)

        # The same batch size is applied to training and evaluation.
        training_kwargs = {
            "output_dir": cfg.config_path,
            "num_train_epochs": cfg.params_epochs,
            "warmup_steps": cfg.params_warmup_steps,
            "per_device_train_batch_size": cfg.params_batch_size,
            "per_device_eval_batch_size": cfg.params_batch_size,
            "weight_decay": cfg.params_weight_decay,
            "logging_steps": cfg.params_logging_steps,
            "evaluation_strategy": cfg.params_evaluation_strategy,
            "eval_steps": cfg.params_eval_steps,
            "save_steps": int(cfg.params_save_steps),
            "gradient_accumulation_steps": cfg.params_gradient_accumulation_steps,
        }
        training_args = TrainingArguments(**training_kwargs)

        trainer = Trainer(
            model=seq2seq_model,
            args=training_args,
            tokenizer=tokenizer,
            data_collator=collator,
            train_dataset=dataset["train"],
            eval_dataset=dataset["validation"],
        )
        trainer.train()

        # The fine-tuned model and the tokenizer go to sibling folders under
        # the step's output directory.
        trainer.save_model(join(cfg.config_path, "model"))
        tokenizer.save_pretrained(join(cfg.config_path, "tokenizer"))

# Run the step

In [9]:
# Build the trainer configuration and run fine-tuning. No try/except wrapper:
# catching an exception only to re-raise it changes nothing for the caller, so
# failures are left to propagate with their original traceback.
config = ConfigManager().get_trainer_config()
Model(config=config).fit()

[2024-10-08 19:40:17,383: INFO: utils: The file: params.yaml loaded successfully...]
[2024-10-08 19:40:17,385: INFO: utils: The file: config.yaml loaded successfully...]


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
200,1.8965,1.67744
400,1.7395,1.536679
600,1.6701,1.456872
800,1.6472,1.430026


Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}
