In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, pipeline
import torch
import os

In [None]:
import yaml
import logging
from datetime import datetime

# YAML config
# Read relative to the current working directory. A plain relative name is
# used (instead of a Windows-only ".\" prefix) so the same cell also runs on
# POSIX systems; any failure to open or parse the file propagates unchanged.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Logger
# One log file per run, named after the start timestamp and placed in
# config["log_dir"].
log_dir = config["log_dir"]
if log_dir:
    # logging.basicConfig does not create missing directories.
    os.makedirs(log_dir, exist_ok=True)
# The timestamp is formatted outside the f-string: reusing double quotes inside
# a double-quoted f-string is a SyntaxError on Python versions before 3.12.
log_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s",
    filename=os.path.join(log_dir, f"{log_timestamp}.log"),
    filemode="w"
)
logger = logging.getLogger(__name__)

logger.info("Config file and logger setup completed.")

In [None]:
def initialize_model_tokenizer():
    """
    Loads the pre-trained tokenizer and seq2seq model named in the config.

    The checkpoint identifier is read from ``config["base_model"]``. The model
    is moved to the CUDA device when ``torch.cuda.is_available()`` is true and
    to the CPU otherwise.

    Returns:
        tuple: ``(tokenizer, model)``.

    Raises:
        Exception: Any error raised while loading is logged and re-raised.
    """
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = AutoTokenizer.from_pretrained(config["base_model"])
        model = AutoModelForSeq2SeqLM.from_pretrained(
            config["base_model"]).to(device)
        return tokenizer, model
    except Exception as e:
        logger.error(f"Model and tokenizer creation failed: {e}")
        raise

In [None]:
# Load the tokenizer and model once at module level; tokenize_function and
# train_model read these two globals directly.
tokenizer, model = initialize_model_tokenizer()

In [None]:
def load_and_prepare_data(train_csv, validation_csv, test_csv):
    """
    Loads the three CSV splits and wraps them in a DatasetDict.

    Each CSV is read with pandas and every row containing a missing value is
    dropped before conversion. The pandas index is not kept as a column.

    Args:
        train_csv (str): Path to the training CSV file.
        validation_csv (str): Path to the validation CSV file.
        test_csv (str): Path to the test CSV file.

    Returns:
        DatasetDict: Datasets under the keys "train", "test" and "validation".
            If any CSV is completely empty, all three splits are returned as
            zero-row datasets and a warning is logged.

    Raises:
        FileNotFoundError: If one of the CSV paths does not exist.
        Exception: Any other error is logged and re-raised.
    """
    try:
        df_train = pd.read_csv(train_csv).dropna()
        df_validation = pd.read_csv(validation_csv).dropna()
        df_test = pd.read_csv(test_csv).dropna()
        logger.info(
            f"Data loaded successfully from: {train_csv}, {validation_csv}, {test_csv}")
        return DatasetDict({
            "train": Dataset.from_pandas(df_train, preserve_index=False),
            "test": Dataset.from_pandas(df_test, preserve_index=False),
            "validation": Dataset.from_pandas(df_validation, preserve_index=False)
        })
    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")
        raise
    except pd.errors.EmptyDataError as e:
        logger.warning(f"Empty CSV file encountered: {e}")
        # Dataset has no `Empty()` constructor; build zero-row datasets from
        # an empty mapping instead so this fallback does not itself fail.
        return DatasetDict({
            split: Dataset.from_dict({})
            for split in ("train", "test", "validation")
        })
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
        raise

In [None]:
def tokenize_function(batch, max_length=200):
    """
    Tokenizes dialogues as model inputs and summaries as targets.

    Uses the module-level ``tokenizer``. Sequences are truncated to
    ``max_length`` but deliberately NOT padded here: padding at this stage
    would fill the labels with the pad token id (which the loss does not
    ignore) and, when the whole split is passed as one batch, would pad every
    example to the longest sequence in the split. Padding is left to the
    per-batch data collator used at training time.

    Args:
        batch: Mapping with "dialogue" and "summary" entries, each a list of
            strings (the batch object supplied by ``Dataset.map`` with
            ``batched=True``).
        max_length (int): Maximum token length applied to both inputs and
            targets. Defaults to 200.

    Returns:
        The tokenizer's encoding for the batch, including the target token
        ids produced from ``text_target``.

    Raises:
        Exception: Any tokenization error is logged and re-raised.
    """
    try:
        return tokenizer(
            batch["dialogue"],
            text_target=batch["summary"],
            max_length=max_length,
            truncation=True,
        )
    except Exception as e:
        logger.error(f"Error during tokenization: {e}")
        raise

In [None]:
def train_model(train_dataset, validation_dataset, model_name):
    """
    Fine-tunes the module-level ``model`` with a transformers ``Trainer``.

    Args:
        train_dataset (Dataset): Tokenized dataset used for training.
        validation_dataset (Dataset): Tokenized dataset used for evaluation.
        model_name (str): Currently unused; the model and tokenizer come from
            the module-level ``model`` and ``tokenizer`` globals.

    Returns:
        Trainer: The Trainer instance after ``train()`` has completed.

    Raises:
        Exception: Any error during setup or training is logged and re-raised.
    """
    try:
        # NOTE(review): max_steps=1 together with gradient_accumulation_steps=500
        # looks like a smoke-test configuration - confirm before a real run.
        training_args = TrainingArguments(
            output_dir=config["output_dir"],
            report_to="none",
            learning_rate=2e-5,
            weight_decay=0.01,
            max_steps=1,
            gradient_accumulation_steps=500,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            eval_strategy="epoch",
            save_strategy="epoch",
        )

        fitted_trainer = Trainer(
            model=model,
            args=training_args,
            tokenizer=tokenizer,
            data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
            train_dataset=train_dataset,
            eval_dataset=validation_dataset,
        )
        fitted_trainer.train()

        logger.info("Training complete!")
        return fitted_trainer
    except Exception as train_error:
        logger.error(f"Error during training: {train_error}")
        raise

In [None]:
def save_model(trainer, model_name):
    """
    Writes the trainer's model out via ``trainer.save_model``.

    Args:
        trainer (Trainer): Trainer holding the model to save.
        model_name (str): Passed unchanged to ``trainer.save_model`` as the
            save location.

    Raises:
        Exception: Any error while saving is logged and re-raised.
    """
    destination = model_name
    try:
        trainer.save_model(destination)
        logger.info(f"Model saved to {destination}")
    except Exception as save_error:
        logger.error(f"Error saving model: {save_error}")
        raise

In [None]:
def summarize(dialogue, model):
    """
    Produces a summary of a dialogue with a "summarization" pipeline.

    A new pipeline is constructed on every call from the given ``model``.

    Args:
        dialogue (str): The dialogue to be summarized.
        model: Value forwarded to ``pipeline(..., model=model)``; the caller
            in this file passes the name the fine-tuned model was saved under.

    Returns:
        str: The ``"summary_text"`` field of the first pipeline result.

    Raises:
        Exception: Any error while building or running the pipeline is logged
            and re-raised.
    """
    try:
        summarizer = pipeline("summarization", model=model)
        results = summarizer(dialogue)
        first_result = results[0]
        return first_result["summary_text"]
    except Exception as summarize_error:
        logger.error(f"Error during summarization: {summarize_error}")
        raise

In [None]:
# CSV locations for the three splits passed to load_and_prepare_data.
# NOTE(review): "path" looks like a placeholder - set real file paths before running.
train_csv = "path"
validation_csv = "path"
test_csv = "path"

In [None]:
# Read the three CSVs into a DatasetDict keyed by "train", "test" and "validation".
dataset = load_and_prepare_data(train_csv, validation_csv, test_csv)

In [None]:
# Tokenize every split. With batched=True and batch_size=None each split is
# handed to tokenize_function as a single batch.
dataset_encoded = dataset.map(tokenize_function, batched=True, batch_size=None)

In [None]:
# Fine-tune on the encoded "train" split (the "validation" split is used for
# evaluation), then save the result through save_model under the name "sum".
trainer = train_model(
    dataset_encoded["train"], dataset_encoded["validation"], config["base_model"])
save_model(trainer, "sum")

In [None]:
# Summarize a sample dialogue; "sum" is the same name that was passed to
# save_model, given here as the model argument of summarize.
custom_dialogue = "demo"
summary = summarize(custom_dialogue, "sum")