In [1]:
!pip install transformers trl peft bitsandbytes accelerate datasets -U

Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading trl-0.23.0-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, trl
Successfully installed bitsandbytes-0.47.0 trl-0.23.0


In [2]:
import os
from google.colab import userdata
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model

In [4]:
# Copy the "HF_TOKEN" value held by Colab's `userdata` store into this process's
# environment, so the token is never hardcoded in the notebook itself.
# NOTE(review): presumably picked up by the Hugging Face Hub downloads below — confirm.
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

In [5]:
print("Loading and filtering for student-related conversations...")

# Terms that mark a conversation as relevant (matched as lowercase substrings)
keywords = ["student", "university", "professor", "assignment", "deadline", "lecture", "course", "exam", "thesis", "college", "email", "reschedule"]

# Stream the corpus so records are fetched lazily instead of downloading all of it
dataset_stream = load_dataset("lmsys/lmsys-chat-1m", split="train", streaming=True)

filtered_examples = []
for example in dataset_stream:
    # Collapse every turn into one lowercase string for the keyword scan
    conversation_text = " ".join(message["content"] for message in example["conversation"]).lower()
    if not any(term in conversation_text for term in keywords):
        continue

    filtered_examples.append(example)
    # 1000 matches are enough; stop pulling from the stream
    if len(filtered_examples) >= 1000:
        break

# Materialize the collected records as a regular in-memory Hugging Face Dataset
dataset = Dataset.from_list(filtered_examples)
print(f"✅ Created a targeted dataset with {len(dataset)} student-related conversations.")

Loading and filtering for student-related conversations...


README.md:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

✅ Created a targeted dataset with 1000 student-related conversations.


In [6]:
def format_lmsys_prompt(example):
    """Render one conversation record as a single "### Role:" delimited string.

    Args:
        example: Mapping with a "conversation" list; every turn is a mapping
            with "role" and "content" keys.

    Returns:
        ``{"text": ...}`` where each turn is written as ``### Human:`` (role is
        "user") or ``### Assistant:`` (any other role), followed by the turn's
        content and a blank line.
    """
    blocks = []
    for message in example["conversation"]:
        speaker = "Assistant"
        if message["role"] == "user":
            speaker = "Human"
        # Same simple chat layout for every turn
        blocks.append(f"### {speaker}:\n{message['content']}\n\n")
    return {"text": "".join(blocks)}

# Replace the raw columns with the single formatted "text" column.
# `dataset` is overwritten here, so the guard keeps the cell idempotent: without it a
# second run would call format_lmsys_prompt on rows whose "conversation" column is gone.
if "conversation" in dataset.column_names:
    dataset = dataset.map(format_lmsys_prompt, remove_columns=dataset.column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
# Checkpoint identifier used for the tokenizer here and for the model load further down
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Reuse the end-of-sequence token for padding, and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [8]:
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch, truncating to at most 1024 tokens.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        The result of calling the module-level ``tokenizer`` on that text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=1024)
    return encoded

# Encode the "text" column in batches and drop the raw strings afterwards
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
# 4-bit NF4 quantization settings for loading the base model.
# The compute dtype is float16 so it agrees with the fp16 mixed-precision training
# configured in TrainingArguments (fp16=True); bfloat16 here would mix two half-precision
# formats and is not natively supported on pre-Ampere GPUs.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [10]:
# Load the base causal LM using the quantization settings from `bnb_config`;
# device_map="auto" leaves weight placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [11]:
# LoRA adapter settings: rank 16 (alpha 32, dropout 0.05) on the q/k/v/o projection modules
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded base model with the adapters described by `lora_config`
model = get_peft_model(base_model, lora_config)

In [12]:
training_args = TrainingArguments(
    # Where checkpoints are written; one checkpoint per epoch
    output_dir="./tiny-conversational-assistant",
    save_strategy="epoch",
    # 4 sequences per device x 2 accumulation steps = 8 sequences per optimizer step
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # Optimization schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    # fp16 mixed precision
    fp16=True,
    # Log every 25 steps; no external experiment tracker
    logging_steps=25,
    report_to="none",
)

In [13]:
# Causal-LM collation (mlm=False): batches are padded with the tokenizer and labels
# are derived from the input ids rather than from masked tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `tokenizer=` is deprecated on Trainer (it produces the warning captured in this
    # cell's output) and is removed in newer transformers releases; `processing_class`
    # is its replacement, so an upgraded install keeps working.
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [None]:
# The training data is the keyword-filtered subset of lmsys/lmsys-chat-1m built above,
# so the progress message names that dataset (it previously said "Guanaco").
print("Starting fine-tuning on the filtered LMSYS-Chat-1M conversational dataset...")
trainer.train()
print("Fine-tuning finished!")

# Save the final model adapters
final_model_path = "./tiny-conversational-assistant-final"
trainer.save_model(final_model_path)
print(f"Model saved to {final_model_path}")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Starting fine-tuning on Guanaco conversational dataset...




In [None]:
# Archive the saved model directory into a single zip file.
# NOTE(review): the model is saved to the relative path "./tiny-conversational-assistant-final",
# while this command uses absolute /content paths, so it only finds the directory when the
# notebook's working directory is /content — confirm for non-Colab runs.
!zip -r /content/tiny-conversational-assistant-final.zip /content/tiny-conversational-assistant-final