In [None]:
# Install necessary libraries
!pip install -q transformers datasets peft accelerate scipy torch
!pip install -U bitsandbytes
!pip install -q datasets

In [None]:
# 1. Imports & Setup
import json
import os

import torch
from datasets import load_dataset
from IPython import get_ipython
from IPython.display import display
from peft import (
    LoraConfig,
    PeftModel,  # used when merging the adapter into the base model
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# 2. Hugging Face Configuration
# The access token is read from the HF_TOKEN environment variable instead of being
# written into the notebook, so a real secret never ends up in a saved or shared copy.
# An unset or empty variable becomes None, i.e. no explicit token is sent.
HF_TOKEN = os.environ.get("HF_TOKEN") or None
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 3. Tokenization
# Tokenizer of the base checkpoint; it is reused for preprocessing, training and saving.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)

# 4. Dataset Preparation
# Toy question/answer pairs. Each pair becomes one {"input", "output"} record, the
# records are written to disk as one JSON object per line, and the file is then
# loaded back as the "train" split.
qa_pairs = [
    ("What is AI?", "AI stands for Artificial Intelligence."),
    ("Define machine learning.", "Machine learning is a field of AI focused on learning from data."),
]
sample_data = [{"input": question, "output": answer} for question, answer in qa_pairs]

with open("sample_dataset.json", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in sample_data)

dataset = load_dataset("json", data_files="sample_dataset.json", split="train")

def tokenize(example):
    """Turn one dataset record into tokenizer output for training.

    The record's ``input`` and ``output`` fields are placed into an
    instruction/response prompt template, and the resulting text is passed to the
    module-level ``tokenizer`` with truncation enabled and padding to a fixed
    length of 512.

    Args:
        example: Mapping with ``"input"`` and ``"output"`` entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the prompt text.
    """
    instruction, response = example["input"], example["output"]
    prompt_text = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(prompt_text, **encode_options)

# Tokenize every record and keep only the tokenizer's output columns: the raw
# "input"/"output" strings are not model inputs, so they are dropped here instead of
# being carried along into training.
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# 5. Model Loading & LoRA Configuration
# Load the base model with 4-bit NF4 weights (double quantization enabled) and
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN,
    quantization_config=bnb_config,
    device_map="auto"
)

# PEFT's documented preprocessing step for training on top of a quantized model.
# It has to run on the quantized base model, before the LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA adapters (rank 8, alpha 32, dropout 0.05) on the listed attention and MLP
# projection modules; bias terms are left untouched.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, peft_config)

# 6. Training
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    # The sample dataset yields only a few optimizer steps, so logging every 10 steps
    # never reported a loss; log every step instead.
    logging_steps=1,
    save_steps=50,
    learning_rate=2e-4,
    fp16=True,
    # Trainer cannot infer the label key from a PEFT-wrapped model and warns about it;
    # name it explicitly.
    label_names=["labels"],
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
    # mlm=False selects causal-LM batches rather than masked-LM batches.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()

# 7. Saving the Model
# The PEFT-wrapped model and the tokenizer are written side by side into one
# directory, which the merge step later reads back.
adapter_dir = "tinyllama-lora"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)
print("LoRA-fine-tuned TinyLlama model saved.")

# 8. Merging LoRA with Base Model
# The adapter is merged into a fresh copy of the base checkpoint loaded without a
# quantization config. `model_name` and `HF_TOKEN` are reused so this load targets the
# same checkpoint, with the same credentials, as the training model.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN,
    device_map="auto"
)
# Attach the saved adapter, fold its weights into the base weights, and drop the
# PEFT wrapper so a plain model is saved.
merged_model = PeftModel.from_pretrained(base_model, "tinyllama-lora").merge_and_unload()
merged_model.save_pretrained("merged-tinyllama")

# 9. Saving Tokenizer (for the merged model)
# The tokenizer loaded at the start comes from the same checkpoint, so it is saved
# directly instead of being downloaded a second time.
tokenizer.save_pretrained("merged-tinyllama")

# 10. GGUF Conversion (Ollama Compatibility)
!git clone https://github.com/ggerganov/llama.cpp
%cd llama.cpp
!pip install -r requirements.txt

!python3 convert_hf_to_gguf.py \
  ../merged-tinyllama \
  --outfile ../tinyllama-merged.gguf \
  --outtype q8_0

from google.colab import files
files.download("/content/tinyllama-merged.gguf")

# 11. Ollama Modelfile Creation and Execution
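# TODO: this step is not included in the notebook; add the Modelfile and the Ollama commands that use the downloaded GGUF file here.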

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


LoRA-fine-tuned TinyLlama model saved.
Cloning into 'llama.cpp'...
remote: Enumerating objects: 49735, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 49735 (delta 5), reused 4 (delta 4), pack-reused 49721 (from 2)[K
Receiving objects: 100% (49735/49735), 103.34 MiB | 21.38 MiB/s, done.
Resolving deltas: 100% (35881/35881), done.
/content/llama.cpp
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu
Collecting numpy~=1.26.4 (from -r ./requirements/requirements-convert_legacy_llama.txt (line 1))
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting gguf>=0.1.0 (from -r ./requirements/requireme

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]0it [00:00, ?it/s]
INFO:hf-to-gguf:Loading model: merged-tinyllama
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:output.weight,               torch.float32 --> Q8_0, shape = {2048, 32000}
INFO:hf-to-gguf:token_embd.weight,           torch.float32 --> Q8_0, shape = {2048, 32000}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float32 --> F32, shape = {2048}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float32 --> Q8_0, shape = {5632, 2048}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float32 --> Q8_0, shape = {2048, 5632}
INFO:hf-to-gguf:bl

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>