### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datasets import load_dataset

### A. Load Dataset

In [2]:
dataset = load_dataset("csv", data_files="data/data_prepared.csv")
train_test_split = dataset["train"].train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

### B. Tokenize Data


In [3]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")


def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["output"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 2916/2916 [00:02<00:00, 1376.12 examples/s]
Map: 100%|██████████| 729/729 [00:00<00:00, 1440.60 examples/s]


### C. Fine-Tune T5

In [5]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./t5-product-description",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,2.65,2.333401
1000,2.4711,2.244147
1500,2.3138,2.206188
2000,2.4979,2.191746


TrainOutput(global_step=2187, training_loss=2.5726236658222774, metrics={'train_runtime': 5226.8001, 'train_samples_per_second': 1.674, 'train_steps_per_second': 0.418, 'total_flos': 295992519819264.0, 'train_loss': 2.5726236658222774, 'epoch': 3.0})

### Step 4: Generate Descriptions

In [9]:
def generate_description(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids, max_length=128)
    return tokenizer.decode(output[0], skip_special_tokens=True)


# Example
input_text = "Style: Industrial | Material: Metal | Color: Gray | Dimensions: 30L x 20W x 15H"
print(generate_description(input_text))

The X-Distances of the X-Distances: The X-Distances of the X-Distances and the X-Distances of the X-Distances of the X-Districts. The X-Distances of the X-Distances of the X-Distances of the X-Distances of the X-Distances of the X-D


# Key Considerations
1. Data Quality:

    - Ensure your input-output pairs are clean and consistent.
    - Augment data if the dataset is small (e.g., paraphrasing).

1. Model Size:

    - t5-small (fast but less accurate) → Good for testing.
    - t5-base or t5-large → Better results (requires more GPU RAM).

1. Hyperparameters:

    - Adjust num_train_epochs (3–10) and batch_size based on GPU.
    - Use learning_rate=3e-5 for smoother training.

1. Deployment:

    - Save the fine-tuned model:

        ```py
        model.save_pretrained("./fine-tuned-t5")
        tokenizer.save_pretrained("./fine-tuned-t5")
        ````

### Step 5: Save model

In [None]:
model.save_pretrained("./fine-tuned-t5")
tokenizer.save_pretrained("./fine-tuned-t5")

### Step 6: Load the model

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained("./fine-tuned-t5")
tokenizer = BartTokenizer.from_pretrained("./fine-tuned-t5")

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load your fine-tuned T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./fine-tuned-t5")
tokenizer = T5Tokenizer.from_pretrained("./fine-tuned-t5")


def generate_product_description(input_features, max_length=128, num_beams=4):
    """
    Generate product description from input features using T5
    
    Args:
        input_features (str): Product features in format "Style:...|Material:...|Color:..."
        max_length (int): Maximum length of generated description
        num_beams (int): Number of beams for beam search
        
    Returns:
        str: Generated product description
    """
    # T5 requires a task prefix
    input_text = f"generate product description: {input_features}"

    # Tokenize inputs
    inputs = tokenizer(
        input_text,
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )

    # Generate description
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
        # outputs = model.generate(
        #     ...
        #     temperature=0.7,          # More creative (0.7-1.0) vs conservative (0.1-0.3)
        #     top_k=50,                 # Sample from top 50 probable tokens
        #     top_p=0.95,               # Nucleus sampling threshold
        #     do_sample=True            # Enable sampling
        # )

    # Decode and clean output
    description = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Post-processing for T5
    description = description.replace("<pad>", "").replace("</s>", "").strip()
    if description.lower().startswith("description:"):
        description = description[len("description:"):].strip()

    return description.capitalize()


# Example usage
input_features = "Style: Industrial | Material: Metal | Color: Gray | Dimensions: 30L x 20W x 15H | Features: Rust-proof"
generated_description = generate_product_description(input_features)

print("Input Features:")
print(input_features)
print("\nGenerated Description:")
print(generated_description)