In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the datasets
train_data = pd.read_csv('samsum-train.csv')
test_data = pd.read_csv('samsum-test.csv')
val_data = pd.read_csv('samsum-validation.csv')

# Reduce dataset size
train_sample = train_data.sample(frac=0.2, random_state=42)  # Keep 30% of training data
test_sample = test_data.sample(frac=0.4, random_state=42)    # Keep 50% of test data
val_sample = val_data.sample(frac=0.4, random_state=42)      # Keep 50% of validation data

# Print new dataset sizes
# print(len(train_sample), len(test_sample), len(val_sample))

In [3]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
def preprocess_data(example):
    # Convert None values to empty strings
    dialogue = example["dialogue"] if example["dialogue"] is not None else ""
    summary = example["summary"] if example["summary"] is not None else ""

    # Tokenize inputs and targets
    inputs = tokenizer("summarize: " + dialogue, padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(summary, padding="max_length", truncation=True, max_length=128)

    inputs["labels"] = targets["input_ids"]
    return inputs

In [5]:
train_dataset = Dataset.from_pandas(train_sample).map(preprocess_data)
val_dataset = Dataset.from_pandas(val_sample).map(preprocess_data)

Map: 100%|██████████| 2946/2946 [00:02<00:00, 1416.32 examples/s]
Map: 100%|██████████| 327/327 [00:00<00:00, 1477.53 examples/s]


In [6]:
import inspect
from transformers import TrainingArguments

print(inspect.signature(TrainingArguments.__init__))




In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    eval_strategy="epoch",            # <-- use eval_strategy, NOT evaluation_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
trainer.train()

In [None]:
model.save_pretrained("./commai_summarizer_latest")
tokenizer.save_pretrained("./commai_summarizer_latest")

('./commai_summarizer_latest\\tokenizer_config.json',
 './commai_summarizer_latest\\special_tokens_map.json',
 './commai_summarizer_latest\\spiece.model',
 './commai_summarizer_latest\\added_tokens.json')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_path = "./commai_summarizer_latest"

model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

In [None]:
def commai_summary(text):
    input_text = "summarize: " + text
    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024).input_ids
    output_ids = model.generate(
    input_ids,
    max_length=150,  # Allows for more detail  
    min_length=80,  # Ensures it's longer  
    num_beams=5,  
    length_penalty=1.0,  # Keeps it balanced (not too short)  
    temperature=0.8,  # Adds slight variation  
    repetition_penalty=2.0,  # Avoids repeating phrases  
    no_repeat_ngram_size=4,  # Prevents redundancy  
    early_stopping=True  
    )
    
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    

In [None]:
conversation = """📝Hey! How are you doing? I was thinking about our last discussion...
Yeah, I get what you mean. It happens sometimes! I just felt a bit unsure about how to phrase my argument properly..."""


summary = commai_summary(conversation)
# print(summary)

I was thinking about our last discussion. I get what you mean. It happens sometimes. I just felt unsure about how to form my argument properly. But it happens sometimes. We are going to have a discussion on this topic and we will discuss it in the next few weeks. The answer is yes, but there's a lot of confusion. Let's talk about your argument.
