In [1]:
!pip install -q bitsandbytes
!pip install -q accelerate==1.6.0
!pip install -q transformers==4.51.3
!pip install -q peft==0.15.2
!pip install -q datasets==2.16.1
!pip install -q sentencepiece


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

# Load dataset from Hugging Face
dataset = load_dataset("smangrul/ad-copy-generation", split="train")

# Use the TinyLLaMA Chat model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load tokenizer and model (8-bit for memory efficiency)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto"
)

# Prepare for LoRA
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # works for LLaMA-style models
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# Preprocessing: Tokenize dataset
def tokenize(example):
    return tokenizer(
        example["content"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# Training configuration
training_args = TrainingArguments(
    output_dir="./tinyllama-ads-lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=5,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Save model
trainer.save_model("./tinyllama-ads-lora")
tokenizer.save_pretrained("./tinyllama-ads-lora")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/714 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/166k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/141 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,2.7801
20,2.7279
30,2.6832
40,2.6274
50,2.5827
60,2.5364
70,2.4565
80,2.4046
90,2.3543
100,2.3222


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


('./tinyllama-ads-lora/tokenizer_config.json',
 './tinyllama-ads-lora/special_tokens_map.json',
 './tinyllama-ads-lora/tokenizer.model',
 './tinyllama-ads-lora/added_tokens.json',
 './tinyllama-ads-lora/tokenizer.json')

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model and LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device_map="auto",
    torch_dtype="auto",
)

# Load fine-tuned LoRA weights
model = PeftModel.from_pretrained(base_model, "./tinyllama-ads-lora")

# Merge LoRA weights into base model
merged_model = model.merge_and_unload()

# Save merged model and tokenizer
merged_model.save_pretrained("./tinyllama-ads-merged")
tokenizer = AutoTokenizer.from_pretrained("./tinyllama-ads-lora")
tokenizer.save_pretrained("./tinyllama-ads-merged")


('./tinyllama-ads-merged/tokenizer_config.json',
 './tinyllama-ads-merged/special_tokens_map.json',
 './tinyllama-ads-merged/tokenizer.model',
 './tinyllama-ads-merged/added_tokens.json',
 './tinyllama-ads-merged/tokenizer.json')

In [11]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Copy model to Drive
shutil.copytree(
    "./tinyllama-ads-merged",
    "/content/drive/MyDrive/tinyllama-ads-merged"
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/tinyllama-ads-merged'

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the merged model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "./tinyllama-ads-merged",
    device_map="auto",
    torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained("./tinyllama-ads-merged")

# Set model to eval mode
model.eval()

# Correctly formatted prompt (same as training)
prompt = (
    "<s>[INST] <<SYS>>\n"
    "Create a text ad given the following product and description.\n"
    "<</SYS>>\n\n"
    "Product: BATATIS\n"
    "Description: A very spicy chips that will burn your throat.\n"
    "[/INST] Ad:"
)


# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.6,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode and print
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


[INST] <<SYS>>
Create a text ad given the following product and description.
<</SYS>>

Product: BATATIS
Description: A very spicy chips that will burn your throat.
[/INST] Ad: Spice up your taste buds with this hot, tasty snack! Perfect for those who love to mix things up in their diet. 🔥🌶️ #BatatisChips #HotTaste #Spicyness
[/ENV] ✨




---

