In [1]:
!pip install transformers trl peft bitsandbytes accelerate datasets -U

Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading trl-0.23.0-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, trl
Successfully installed bitsandbytes-0.47.0 trl-0.23.0


In [2]:
import os
from google.colab import userdata
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model

In [3]:
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

In [17]:
print("Loading and filtering the Dolly-15k dataset for email/student tasks...")

# Load the high-quality instruction dataset
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Define keywords to find relevant instructions
keywords = [
    # Email specific
    "email", "letter", "note", "message", "subject:", "draft a reply", "write a response", "respond to",
    # Academic context
    "professor", "teacher", "TA", "university", "academic", "college", "lecture", "course", "exam", "thesis",
    # Professional context
    "professional", "colleague", "manager", "interview", "application", "job", "internship", "recommendation",
    # Common actions
    "assignment", "deadline", "reschedule", "request for", "follow up", "feedback", "inquiry", "apologize", "decline"
]


def is_relevant(example):
    text_to_check = (example['instruction'] + " " + example['category']).lower()
    return any(keyword in text_to_check for keyword in keywords)

filtered_dataset = dataset.filter(is_relevant)

# If we have more than 1000, select a random subset for fast training
if len(filtered_dataset) > 1000:
    filtered_dataset = filtered_dataset.shuffle(seed=42).select(range(1000))

print(f"✅ Created a targeted dataset with {len(filtered_dataset)} relevant examples.")

Loading and filtering the Dolly-15k dataset for email/student tasks...


Filter:   0%|          | 0/15011 [00:00<?, ? examples/s]

✅ Created a targeted dataset with 517 relevant examples.


In [18]:
def format_dolly_prompt(example):
    instruction = example["instruction"]
    context = example["context"]
    response = example["response"]
    if context:
        return f"### Instruction:\n{instruction}\n\n### Context:\n{context}\n\n### Response:\n{response}"
    else:
        return f"### Instruction:\n{instruction}\n\n### Response:\n{response}"

def create_text_column(example):
    return {"text": format_dolly_prompt(example)}

dataset = filtered_dataset.map(create_text_column, remove_columns=list(filtered_dataset.features))

Map:   0%|          | 0/517 [00:00<?, ? examples/s]

In [19]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [20]:
def tokenize_function(examples):
    # The 'text' column already exists, so we just tokenize it.
    return tokenizer(examples["text"], truncation=True, max_length=1024)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/517 [00:00<?, ? examples/s]

In [21]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [22]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)

In [23]:
# 4. Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

model = get_peft_model(base_model, lora_config)

In [24]:
training_args = TrainingArguments(
    output_dir="./tiny-conversational-assistant",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=1,
    save_strategy="epoch",
    logging_steps=10,
    fp16=True,
    report_to="none",
)

In [25]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset, tokenizer=tokenizer, data_collator=data_collator)

  trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset, tokenizer=tokenizer, data_collator=data_collator)


In [26]:
print("\nStarting fine-tuning on the filtered Dolly-15k dataset...")
trainer.train()
print("Fine-tuning finished!")

# Save the final model adapters
final_model_path = "./tiny-conversational-assistant-final"
trainer.save_model(final_model_path)
print(f"Model saved to {final_model_path}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.



Starting fine-tuning on the filtered Dolly-15k dataset...


Step,Training Loss
10,2.0237
20,2.0262
30,1.9701
40,2.0025
50,2.1192
60,1.9628


Fine-tuning finished!
Model saved to ./tiny-conversational-assistant-final


In [27]:
!zip -r /content/tiny-conversational-assistant-final.zip /content/tiny-conversational-assistant-final

updating: content/tiny-conversational-assistant-final/ (stored 0%)
updating: content/tiny-conversational-assistant-final/training_args.bin (deflated 53%)
updating: content/tiny-conversational-assistant-final/README.md (deflated 66%)
updating: content/tiny-conversational-assistant-final/adapter_config.json (deflated 56%)
updating: content/tiny-conversational-assistant-final/adapter_model.safetensors (deflated 8%)
updating: content/tiny-conversational-assistant-final/special_tokens_map.json (deflated 73%)
updating: content/tiny-conversational-assistant-final/tokenizer.model (deflated 55%)
updating: content/tiny-conversational-assistant-final/tokenizer.json (deflated 85%)
updating: content/tiny-conversational-assistant-final/tokenizer_config.json (deflated 69%)
updating: content/tiny-conversational-assistant-final/chat_template.jinja (deflated 60%)
