## Imports

In [1]:
from datasets import load_dataset
from transformers import BartTokenizer
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments, BartTokenizer
import torch

### A. Load Dataset


In [3]:
dataset = load_dataset("csv", data_files="data/data_prepared.csv")
train_test = dataset["train"].train_test_split(test_size=0.2)
train_data = train_test["train"]
test_data = train_test["test"]

Generating train split: 0 examples [00:00, ? examples/s]

### B. Tokenize Data


In [4]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

def tokenize_function(examples):
    inputs = tokenizer(
        examples["input"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    outputs = tokenizer(
        examples["output"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": outputs["input_ids"]
    }

tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_test = test_data.map(tokenize_function, batched=True)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Map:   0%|          | 0/2916 [00:00<?, ? examples/s]

Map:   0%|          | 0/729 [00:00<?, ? examples/s]

### C. Fine-Tune Bart

In [5]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

training_args = TrainingArguments(
    output_dir="./bart-product-description",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Step,Training Loss,Validation Loss
500,2.0347,1.843344
1000,1.8412,1.714631
1500,1.7731,1.646743
2000,1.582,1.609477




TrainOutput(global_step=2187, training_loss=1.9231564940848087, metrics={'train_runtime': 11211.9665, 'train_samples_per_second': 0.78, 'train_steps_per_second': 0.195, 'total_flos': 666746817085440.0, 'train_loss': 1.9231564940848087, 'epoch': 3.0})

### Step 4: Generate Descriptions

In [9]:
def generate_description(input_text):
    inputs = tokenizer(input_text, return_tensors="pt",
                       max_length=128, truncation=True)
    outputs = model.generate(**inputs, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Example
input_text = "Style: Industrial | Material: Metal | Color: Gray | Dimensions: 30L x 20W x 15H"
print(generate_description(input_text))



### Step 5: Save model

In [10]:
# Save model
model.save_pretrained("./fine-tuned-bart")
tokenizer.save_pretrained("./fine-tuned-bart")

# Load later
model = BartForConditionalGeneration.from_pretrained("./fine-tuned-bart")
tokenizer = BartTokenizer.from_pretrained("./fine-tuned-bart")

In [None]:
# Load your fine-tuned model and tokenizer
model = BartForConditionalGeneration.from_pretrained("./fine-tuned-bart")
tokenizer = BartTokenizer.from_pretrained("./fine-tuned-bart")


def generate_product_description(input_features, max_length=128, num_beams=4):
    """
    Generate product description from input features
    
    Args:
        input_features (str): Product features in format "Style:...|Material:...|Color:..."
        max_length (int): Maximum length of generated description
        num_beams (int): Number of beams for beam search (higher=better quality but slower)
    
    Returns:
        str: Generated product description
    """
    # Prepare input text
    input_text = f"Generate product description: {input_features}"

    # Tokenize inputs
    inputs = tokenizer(
        input_text,
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )

    # Generate description
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=2  # Prevent word repetition
        )

    # Decode and clean output
    description = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Post-processing
    description = description.replace("Product Description:", "").strip()
    description = description.capitalize()

    return description


# Example usage
input_features = "Style: Industrial | Material: Metal | Color: Gray | Dimensions: 30L x 20W x 15H | Features: Rust-proof, Wall-mounted"
generated_description = generate_product_description(input_features)

print("Input Features:")
print(input_features)
print("\nGenerated Description:")
print(generated_description)