# Code README Writer — SFT Training on Google Colab

Fine-tune TinyLlama-1.1B with QLoRA to generate README.md files from repository structure and code.

**Requirements:** Free Colab T4 GPU (~2-3 hours training time)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/)

## 1. Setup & Install Dependencies

In [None]:
# Install the training stack. No versions are pinned, so pip resolves whatever is current.
# NOTE(review): later cells import DataCollatorForCompletionOnlyLM from trl and pass
# max_seq_length to SFTTrainer; presumably these depend on the installed trl release.
# Consider pinning versions for reproducibility. TODO confirm.
!pip install -q torch transformers datasets peft trl bitsandbytes accelerate
# Extras used later in the notebook: wandb (optional tracking), rouge-score (evaluation), gradio (demo).
!pip install -q sentencepiece protobuf wandb rouge-score gradio

In [None]:
import torch
# Report the PyTorch build and, when a CUDA device is visible, its name and memory.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties field is `total_memory` (in bytes); `total_mem`
    # does not exist and raised AttributeError on any GPU runtime.
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

## 2. Load & Prepare Dataset

We use pre-collected GitHub repo-README pairs. Each example contains:
- `repo_name`: The repository identifier
- `file_tree`: Directory structure listing
- `code_snippets`: Contents of key source files
- `readme_content`: The ground-truth README.md

In [None]:
import json
from datasets import Dataset, DatasetDict

# Load from HuggingFace Hub or local file
# Option A: From local JSONL (upload readme_dataset.jsonl to Colab)
# from google.colab import files
# uploaded = files.upload()  # upload readme_dataset.jsonl

# Option B: From HuggingFace Hub (after pushing dataset)
# from datasets import load_dataset
# dataset = load_dataset("your-username/code-readme-pairs")

# For this notebook, we'll create a small demo dataset
demo_examples = [
    {
        "repo_name": "fastapi-todo-app",
        "file_tree": "app/\napp/__init__.py\napp/main.py\napp/models.py\napp/database.py\nrequirements.txt\nDockerfile",
        "code_snippets": "--- requirements.txt ---\nfastapi==0.104.1\nuvicorn==0.24.0\nsqlalchemy==2.0.23\n\n--- app/main.py ---\nfrom fastapi import FastAPI\napp = FastAPI(title='Todo API')\n",
        "readme_content": "# FastAPI Todo App\n\nA RESTful todo API built with FastAPI and SQLAlchemy.\n\n## Installation\n\n```bash\npip install -r requirements.txt\n```\n\n## Usage\n\n```bash\nuvicorn app.main:app --reload\n```\n\n## Features\n\n- CRUD operations for todos\n- SQLAlchemy ORM\n- Docker support\n"
    }
]

# In practice, load your full dataset:
# examples = []
# with open('readme_dataset.jsonl') as f:
#     for line in f:
#         examples.append(json.loads(line))

examples = demo_examples  # Replace with full dataset
dataset = Dataset.from_list(examples)

if len(dataset) < 2:
    # train_test_split raises ValueError when the split would leave the train
    # set empty, which is exactly what happens with the single demo example.
    # Reuse the same rows for both splits so the rest of the notebook still
    # runs; this is a smoke test only, not a meaningful evaluation.
    print("Warning: fewer than 2 examples; reusing the same data for train and test.")
    dataset_dict = DatasetDict({"train": dataset, "test": dataset})
else:
    # Hold out 10% of the rows for evaluation; the seed keeps the split reproducible.
    split = dataset.train_test_split(test_size=0.1, seed=42)
    dataset_dict = DatasetDict({"train": split["train"], "test": split["test"]})

print(f"Train: {len(dataset_dict['train'])} examples")
print(f"Test: {len(dataset_dict['test'])} examples")

## 3. Load Base Model with QLoRA (4-bit Quantization)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Quantization settings: 4-bit NF4 weights with double quantization,
# computing in float16.
quant_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Load the quantized base model, turn off the generation cache, then run
# PEFT's k-bit training preparation on it.
model_load_kwargs = {
    "quantization_config": quant_config,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

# Tokenizer: the EOS token doubles as the pad token, and padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

params_in_millions = model.num_parameters() / 1e6
print(f"Model loaded: {MODEL_NAME}")
print(f"Model size: {params_in_millions:.1f}M parameters")

In [None]:
# LoRA adapter setup: rank-16 adapters (alpha 32, dropout 0.05, no bias terms)
# attached to the projection modules listed below.
LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=LORA_TARGET_MODULES,
)

# Wrap the model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 4. Generate BEFORE Example (Base Model)

Let's see what the base model produces before fine-tuning.

In [None]:
def build_prompt(repo_name, file_tree, code_snippets):
    """Assemble the chat-formatted inference prompt for one repository.

    The result is a system turn, a user turn embedding the repository name,
    file tree and key file contents, and an opening assistant tag with no
    response after it.
    """
    system_turn = (
        "<|system|>\n"
        "You are a technical writer that generates comprehensive README.md files "
        "for code repositories. Given the repository structure and code contents, "
        "write a clear, well-structured README.</s>\n"
    )
    # One entry per output line; the trailing empty entry yields the final newline.
    remaining_lines = [
        "<|user|>",
        "Generate a README.md for the following repository:",
        "",
        f"Repository name: {repo_name}",
        "",
        "File structure:",
        f"{file_tree}",
        "",
        "Key files:",
        f"{code_snippets}</s>",
        "<|assistant|>",
        "",
    ]
    return system_turn + "\n".join(remaining_lines)

def generate_readme(model, prompt, max_new_tokens=512):
    """Sample a README continuation for ``prompt`` and return only the new text.

    Uses the notebook-level ``tokenizer`` to encode the prompt and decode the
    result.

    Args:
        model: Model exposing ``generate`` and ``device``. If it also exposes
            ``training``/``eval``/``train``, it is switched to eval mode for
            the duration of the call.
        prompt: Fully formatted prompt string.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation (prompt tokens removed), decoded with
        special tokens skipped and surrounding whitespace stripped.
    """
    # NOTE(review): prompts longer than 1024 tokens are truncated; presumably
    # from the right (the tokenizer default), which would drop the trailing
    # "<|assistant|>" cue. Confirm for long repositories.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate in eval mode so dropout (e.g. the LoRA dropout) does not perturb
    # sampling. Put the model back in train mode afterwards if it was training.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs, max_new_tokens=max_new_tokens,
                temperature=0.7, top_p=0.9, repetition_penalty=1.15,
                do_sample=True, pad_token_id=tokenizer.eos_token_id,
            )
    finally:
        if was_training:
            model.train()

    # Drop the echoed prompt tokens; only the continuation is decoded.
    prompt_length = inputs['input_ids'].shape[1]
    generated = outputs[0][prompt_length:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()

# Build the prompt for the first example and capture the model's output
# before any training has been run.
test_example = examples[0]
prompt = build_prompt(
    repo_name=test_example['repo_name'],
    file_tree=test_example['file_tree'],
    code_snippets=test_example['code_snippets'],
)

_before_rule = "=" * 60
print(_before_rule, "BEFORE FINE-TUNING (Base Model Output)", _before_rule, sep="\n")
base_output = generate_readme(model, prompt)
print(base_output)

## 5. Train with SFT

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Prompt template: system turn, user turn with the repository details, and the
# assistant turn holding the ground-truth README.
TEMPLATE = """<|system|>
You are a technical writer that generates comprehensive README.md files for code repositories. Given the repository structure and code contents, write a clear, well-structured README.</s>
<|user|>
Generate a README.md for the following repository:

Repository name: {repo_name}

File structure:
{file_tree}

Key files:
{code_snippets}</s>
<|assistant|>
{readme_content}</s>"""

# Dataset columns substituted into TEMPLATE.
_TEMPLATE_FIELDS = ("repo_name", "file_tree", "code_snippets", "readme_content")


def formatting_func(examples):
    """Render dataset rows into full training texts using ``TEMPLATE``.

    Args:
        examples: Either a batch mapping each column name to a list of values,
            or a single row mapping each column name to a string.

    Returns:
        A list of formatted strings for a batch (one per row), or a single
        formatted string when given a single row.
    """
    # A single un-batched row: format it directly instead of iterating over
    # the characters of its string values.
    if isinstance(examples["repo_name"], str):
        return TEMPLATE.format(**{field: examples[field] for field in _TEMPLATE_FIELDS})

    # Batched input: walk the columns in lockstep, one tuple per row.
    columns = [examples[field] for field in _TEMPLATE_FIELDS]
    return [
        TEMPLATE.format(**dict(zip(_TEMPLATE_FIELDS, row)))
        for row in zip(*columns)
    ]

# Only compute loss on assistant response
response_template = "<|assistant|>\n"

# Llama-style SentencePiece tokenizers can tokenize the template differently
# when it stands alone than when it follows a newline inside a full example.
# If the ids do not match, the collator cannot find the response and masks
# every label. Encode the template in context (after "\n") and strip the ids
# belonging to that leading newline, then hand the collator token ids instead
# of the raw string.
_newline_ids = tokenizer.encode("\n", add_special_tokens=False)
response_template_ids = tokenizer.encode(
    "\n" + response_template, add_special_tokens=False
)[len(_newline_ids):]

collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template_ids,
    tokenizer=tokenizer,
)

# Training arguments (tuned for Colab T4)
# Effective batch size per optimizer step is 2 * 8 = 16 sequences.
training_args = TrainingArguments(
    output_dir="./readme-writer-checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    fp16=True,
    optim="paged_adamw_32bit",
    max_grad_norm=0.3,
    group_by_length=True,
    report_to="none",  # Set to "wandb" for experiment tracking
    seed=42,
)

# packing=False keeps one example per sequence, so the completion-only
# collator is applied to each example separately.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    formatting_func=formatting_func,
    data_collator=collator,
    max_seq_length=2048,
    packing=False,
)

print("Starting training...")
train_result = trainer.train()
print(f"Training loss: {train_result.metrics['train_loss']:.4f}")

In [None]:
# Write the trainer's model and the tokenizer into the same output directory.
ADAPTER_PATH = "./readme-writer-adapter"
for _persist in (trainer.save_model, tokenizer.save_pretrained):
    _persist(ADAPTER_PATH)
print(f"Adapter saved to {ADAPTER_PATH}")

## 6. Generate AFTER Example (Fine-tuned Model)

Now let's generate with the same prompt and compare!

In [None]:
# Run the same prompt through the model again, now that training has finished.
_after_rule = "=" * 60
print(_after_rule, "AFTER FINE-TUNING (SFT + QLoRA Output)", _after_rule, sep="\n")
finetuned_output = generate_readme(model, prompt)
print(finetuned_output)

In [None]:
# Side-by-side comparison
import re


def _print_output_summary(label, text):
    """Print simple structure stats for ``text`` followed by its first 500 characters."""
    # Count Markdown heading lines (1-6 '#' then whitespace at line start)
    # rather than every '#' character, which counted "##" twice and also
    # picked up '#' in the middle of lines. '#' comment lines inside fenced
    # code blocks are still counted.
    heading_count = len(re.findall(r"^#{1,6}\s", text, flags=re.MULTILINE))
    # Each fenced block has an opening and a closing fence.
    code_block_count = text.count('```') // 2

    print(f"\n--- {label} ---")
    print(f"Length: {len(text)} chars")
    print(f"Headings: {heading_count}")
    print(f"Code blocks: {code_block_count}")
    print()
    print(text[:500])


print("\n" + "=" * 80)
print("SIDE-BY-SIDE COMPARISON")
print("=" * 80)

_print_output_summary("BEFORE (Base Model)", base_output)
_print_output_summary("AFTER (Fine-tuned)", finetuned_output)

## 7. Evaluate with ROUGE Scores

In [None]:
from rouge_score import rouge_scorer

# Score both generations against the example's ground-truth README and print
# the F-measure per ROUGE variant with the relative change.
ROUGE_METRICS = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(ROUGE_METRICS, use_stemmer=True)

reference = test_example['readme_content']
base_scores = scorer.score(reference, base_output)
ft_scores = scorer.score(reference, finetuned_output)

_table_header = f"{'Metric':<10} {'Base':>10} {'Fine-tuned':>12} {'Improvement':>14}"
print(_table_header)
print("-" * 50)
for metric in ROUGE_METRICS:
    base_f, ft_f = (scores[metric].fmeasure for scores in (base_scores, ft_scores))
    # Floor the denominator so a zero baseline does not divide by zero.
    improvement = ((ft_f - base_f) / max(base_f, 0.001)) * 100
    print(f"{metric:<10} {base_f:>10.4f} {ft_f:>12.4f} {improvement:>+13.1f}%")

## 8. Push to HuggingFace Hub (Optional)

In [None]:
# Uncomment and fill in to push your adapter to HuggingFace
# from huggingface_hub import login
# login(token="your-hf-token")
#
# model.push_to_hub("your-username/code-readme-writer-tinyllama-lora")
# tokenizer.push_to_hub("your-username/code-readme-writer-tinyllama-lora")
# print("Adapter pushed to HuggingFace Hub!")

## 9. Interactive Demo

Run the cell below to launch an interactive Gradio demo right in Colab.

In [None]:
import gradio as gr

def generate_for_demo(repo_name, file_tree, code_snippets):
    """Gradio callback: build the prompt from the three text fields and generate a README."""
    demo_prompt = build_prompt(repo_name, file_tree, code_snippets)
    return generate_readme(model, demo_prompt, max_new_tokens=1024)


# Input widgets, pre-filled with a tiny sample repository.
demo_inputs = [
    gr.Textbox(label="Repository Name", value="my-project"),
    gr.Textbox(label="File Structure", lines=8, value="src/\nsrc/main.py\ntests/\nrequirements.txt"),
    gr.Textbox(label="Key Code Files", lines=10, value="--- src/main.py ---\nprint('hello')"),
]
demo_output = gr.Markdown(label="Generated README")

demo = gr.Interface(
    fn=generate_for_demo,
    inputs=demo_inputs,
    outputs=demo_output,
    title="Code README Writer",
    description="Generate README.md files from repository structure using a fine-tuned TinyLlama model.",
)

# share=True is passed so Gradio creates a shareable link for the demo.
demo.launch(share=True)