# Phase 1b: Qwen Distillation + Norwegian Polish

This notebook creates a high-quality Norwegian instruction dataset by combining:

- **Qwen3.5-35B-A3B** — generates factually rich, well-structured Norwegian answers
- **Borealis 4B** — polishes the Norwegian into natural, fluent Bokmål

The original Alpaca outputs were generated by gpt-3.5-turbo. Qwen is a much more capable model, so regenerating the outputs gives us better content. But Qwen's Norwegian may still have English-influenced phrasing — Borealis fixes that.

## Two-Pass Pipeline

The two models can't coexist in memory, so we run them sequentially:

1. **Pass 1 (Qwen):** Load Qwen → generate Norwegian outputs for all instructions → save intermediate → unload
2. **Pass 2 (Borealis):** Load Borealis → polish Qwen's Norwegian → save final dataset

Both passes support checkpointing for safe resumption.

**Input:** `norwegian_alpaca_improved.parquet` (from Phase 1 — uses the improved instructions/inputs)

**Output:** `norwegian_alpaca_qwen_polished.parquet`

In [None]:
# Install the library
# On Colab:
#   !git clone https://github.com/your-username/NORAI-Tools.git /content/NORAI-Tools
#   %pip install -e /content/NORAI-Tools

%pip install -e ..

from norai_tools import AlpacaImprover, OUTPUT_FILE
from datasets import load_dataset, Dataset
import pandas as pd
import json
import os
import gc
import torch
from tqdm.auto import tqdm

In [None]:
# ============================================================
# Configuration
# ============================================================
# Every tunable for both passes lives in this cell; later cells only read
# these names.

# --- Input -----------------------------------------------------------
# Path exported by norai_tools (expected: "norwegian_alpaca_improved.parquet")
IMPROVED_DATASET_PATH = OUTPUT_FILE

# --- Pass 1: Qwen generation -----------------------------------------
QWEN_MODEL = "Qwen/Qwen3.5-35B-A3B"
QWEN_BATCH_SIZE = 4  # kept small because the model is large
QWEN_MAX_NEW_TOKENS = 512
QWEN_CHECKPOINT = "qwen_generation_checkpoint.jsonl"  # one JSON line per finished row
QWEN_INTERMEDIATE = "norwegian_alpaca_qwen_raw.parquet"  # dataset + raw Qwen column
SYSTEM_PROMPT = "Du er en hjelpsom assistent. Svar alltid på norsk bokmål."

# --- Pass 2: Borealis polishing --------------------------------------
BOREALIS_BATCH_SIZE = 8
BOREALIS_CHECKPOINT = "borealis_polish_checkpoint.jsonl"  # one JSON line per finished row

# --- Output ----------------------------------------------------------
OUTPUT_PATH = "norwegian_alpaca_qwen_polished.parquet"
PUSH_TO_HUB = False
HUB_REPO_ID = "your-username/norwegian-alpaca-qwen-polished"

# Echo the key settings so the run log records what was used.
print(
    f"Qwen model:  {QWEN_MODEL}",
    f"Input:       {IMPROVED_DATASET_PATH}",
    f"Intermediate:{QWEN_INTERMEDIATE}",
    f"Output:      {OUTPUT_PATH}",
    sep="\n",
)

In [None]:
# ============================================================
# Load improved dataset from Phase 1
# ============================================================

# Read the Phase 1 parquet file as a single "train" split.
dataset = load_dataset("parquet", data_files=IMPROVED_DATASET_PATH, split="train")

print(f"Loaded: {len(dataset)} rows")
print(f"Columns: {dataset.column_names}")

# Fail fast when a column that the later cells read is absent.
required = ["instruction_improved", "input_improved", "instruction_en"]
missing = [c for c in required if c not in dataset.column_names]
if missing:
    raise ValueError(f"Missing columns: {missing}. Run Phase 1 first.")

# Preview at most three rows. The count is clamped to the dataset size so a
# dataset with fewer than three rows does not make select() raise IndexError
# (the later inspection cells clamp the same way).
pd.set_option("display.max_colwidth", 100)
preview_count = min(3, len(dataset))
display(
    dataset.select(range(preview_count))
    .to_pandas()[["instruction_improved", "input_improved"]]
)

In [None]:
# ============================================================
# Pass 1: Generate Norwegian outputs with Qwen
# ============================================================
# Generates one answer per dataset row, appending each finished batch to a
# JSONL checkpoint so an interrupted run can continue where it stopped.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load checkpoint (resume support): one {"output": ...} object per line,
# written in dataset row order, so the line count is the next row index.
qwen_outputs = []
if os.path.exists(QWEN_CHECKPOINT):
    with open(QWEN_CHECKPOINT, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                qwen_outputs.append(json.loads(line.strip())["output"])
    print(f"Resumed from checkpoint: {len(qwen_outputs)} rows done.")

resume_idx = len(qwen_outputs)
total = len(dataset)

if resume_idx >= total:
    print("Pass 1 already complete!")
else:
    print(f"Generating rows {resume_idx}–{total-1} ({total - resume_idx} remaining)")
    print(f"Loading {QWEN_MODEL}...")

    # Bind the names before the try block so the cleanup in `finally` can
    # always delete them, even when loading itself fails.
    model = None
    tokenizer = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Left padding makes every prompt end at the same position, which is
        # what lets the decode step below slice at a single offset.
        tokenizer.padding_side = "left"

        model = AutoModelForCausalLM.from_pretrained(
            QWEN_MODEL,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        model.eval()
        print("Model loaded.")

        for start in tqdm(range(resume_idx, total, QWEN_BATCH_SIZE), desc="Generating"):
            end = min(start + QWEN_BATCH_SIZE, total)
            batch = dataset.select(range(start, end))

            # Build chat prompts: instruction, plus the optional input after a blank line
            formatted_prompts = []
            for row in batch:
                user_content = row["instruction_improved"]
                input_text = row.get("input_improved", "") or ""
                if input_text.strip():
                    user_content += f"\n\n{input_text}"
                messages = [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_content},
                ]
                # Ask for the answer only. Qwen3-family chat templates default
                # to a reasoning ("thinking") mode whose trace would end up in
                # the dataset and eat into QWEN_MAX_NEW_TOKENS. Extra keyword
                # arguments are forwarded to the template, and templates that
                # do not use this variable ignore it.
                # TODO confirm the flag name against this checkpoint's template.
                formatted = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True,
                    enable_thinking=False,
                )
                formatted_prompts.append(formatted)

            # NOTE(review): truncation cuts on the tokenizer's truncation_side.
            # If that is "right", a prompt longer than 2048 tokens loses the
            # trailing generation prompt — confirm whether such rows exist.
            inputs = tokenizer(
                formatted_prompts, return_tensors="pt",
                padding=True, truncation=True, max_length=2048,
            ).to(model.device)

            with torch.inference_mode():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=QWEN_MAX_NEW_TOKENS,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                )

            # Decode generated tokens (everything after the padded input)
            padded_len = inputs["input_ids"].shape[1]
            batch_texts = []
            for i in range(len(formatted_prompts)):
                text = tokenizer.decode(outputs[i][padded_len:], skip_special_tokens=True)
                batch_texts.append(text)

            qwen_outputs.extend(batch_texts)

            # Checkpoint: append this batch so a restart skips these rows
            with open(QWEN_CHECKPOINT, "a", encoding="utf-8") as f:
                for text in batch_texts:
                    f.write(json.dumps({"output": text}, ensure_ascii=False) + "\n")

        print(f"Generation complete: {len(qwen_outputs)} rows")
    finally:
        # Unload Qwen on failure or interruption as well as on success.
        # Otherwise re-running this cell would load a second copy while the
        # first one is still referenced.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print("Qwen model unloaded.")

In [None]:
# ============================================================
# Save intermediate + inspect Qwen raw outputs
# ============================================================

# Each row needs exactly one generated output. Check this up front so an
# unfinished Pass 1 or a stale checkpoint is reported with a clear message.
if len(qwen_outputs) != len(dataset):
    raise ValueError(
        f"Have {len(qwen_outputs)} Qwen outputs for {len(dataset)} rows. "
        f"Finish Pass 1, or delete {QWEN_CHECKPOINT} if it belongs to another input."
    )

# Merge Qwen outputs into dataset. A column left over from an earlier run of
# this cell is dropped first, because add_column refuses an existing name.
if "output_qwen_raw" in dataset.column_names:
    dataset = dataset.remove_columns("output_qwen_raw")
dataset = dataset.add_column("output_qwen_raw", qwen_outputs)
dataset.to_parquet(QWEN_INTERMEDIATE)
print(f"Intermediate saved: {QWEN_INTERMEDIATE}")

# Inspect samples
pd.set_option("display.max_colwidth", 120)
print("\nSample Qwen outputs:")
for i in range(min(3, len(dataset))):
    row = dataset[i]
    print(f"\n--- Row {i} ---")
    print(f"  Instruction: {row['instruction_improved'][:120]}")
    # `or ''` covers both a missing column and a stored None value.
    print(f"  Original:    {(row.get('output_improved') or '')[:200]}")
    print(f"  Qwen raw:    {row['output_qwen_raw'][:200]}")

In [None]:
# ============================================================
# Pass 2: Polish Qwen outputs with Borealis
# ============================================================
# Runs every raw Qwen answer through AlpacaImprover, appending each finished
# batch to a JSONL checkpoint so an interrupted run can be resumed.

# If restarting from here, load the intermediate parquet
if "output_qwen_raw" not in dataset.column_names:
    dataset = load_dataset("parquet", data_files=QWEN_INTERMEDIATE, split="train")
    print(f"Loaded intermediate: {len(dataset)} rows")

# Load checkpoint (resume support): one {"output": ...} object per line,
# written in dataset row order, so the line count is the next row index.
polished_outputs = []
if os.path.exists(BOREALIS_CHECKPOINT):
    with open(BOREALIS_CHECKPOINT, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                polished_outputs.append(json.loads(line.strip())["output"])
    print(f"Resumed polish: {len(polished_outputs)} rows done.")

resume_idx = len(polished_outputs)

if resume_idx >= len(dataset):
    print("Pass 2 already complete!")
else:
    print(f"Polishing rows {resume_idx}–{len(dataset)-1} ({len(dataset) - resume_idx} remaining)")

    improver = AlpacaImprover(batch_size=BOREALIS_BATCH_SIZE)
    improver.load_model()
    print(f"Borealis loaded: {improver.model_name}")

    for start in tqdm(range(resume_idx, len(dataset), BOREALIS_BATCH_SIZE), desc="Polishing"):
        end = min(start + BOREALIS_BATCH_SIZE, len(dataset))

        # (qwen_output, english_instruction) — instruction_en gives
        # Borealis enough semantic context for language polishing.
        # One slice fetches both columns for the whole batch instead of
        # looking every row up twice.
        batch = dataset[start:end]
        batch_pairs = list(zip(batch["output_qwen_raw"], batch["instruction_en"]))
        polished = list(improver.improve_batch(batch_pairs))

        # The checkpoint is positional: a batch that comes back with a
        # different length would shift every later row, so stop immediately.
        if len(polished) != len(batch_pairs):
            raise ValueError(
                f"improve_batch returned {len(polished)} results for "
                f"{len(batch_pairs)} inputs (rows {start}–{end - 1})."
            )
        polished_outputs.extend(polished)

        # Checkpoint: append this batch so a restart skips these rows
        with open(BOREALIS_CHECKPOINT, "a", encoding="utf-8") as f:
            for text in polished:
                f.write(json.dumps({"output": text}, ensure_ascii=False) + "\n")

    print(f"Polishing complete: {len(polished_outputs)} rows")

# A checkpoint from a different input would not line up with this dataset.
if len(polished_outputs) != len(dataset):
    raise ValueError(
        f"Have {len(polished_outputs)} polished outputs for {len(dataset)} rows. "
        f"Delete {BOREALIS_CHECKPOINT} if it belongs to another input."
    )

# Add polished column. A column left over from an earlier run of this cell is
# dropped first, because add_column refuses an existing name.
if "output_qwen_polished" in dataset.column_names:
    dataset = dataset.remove_columns("output_qwen_polished")
dataset = dataset.add_column("output_qwen_polished", polished_outputs)

In [None]:
# ============================================================
# Compare: Original Alpaca vs Qwen Raw vs Qwen Polished
# ============================================================

pd.set_option("display.max_colwidth", 120)

print("=" * 80)
print("Side-by-side comparison")
print("=" * 80)
for i in range(min(5, len(dataset))):
    row = dataset[i]
    print(f"\n--- Row {i} ---")
    print(f"  Instruction:  {row['instruction_improved'][:120]}")
    # `or ''` covers both a missing column and a stored None value.
    print(f"  Original:     {(row.get('output_improved') or '')[:200]}")
    print(f"  Qwen raw:     {row['output_qwen_raw'][:200]}")
    print(f"  Qwen polished:{row['output_qwen_polished'][:200]}")

# Stats: how many outputs changed during polishing?
# Compare the two columns directly instead of fetching every full row twice.
changed = sum(
    raw != polished
    for raw, polished in zip(dataset["output_qwen_raw"], dataset["output_qwen_polished"])
)
# An empty dataset reports 0.0% instead of dividing by zero.
changed_pct = 100 * changed / len(dataset) if len(dataset) else 0.0
print(f"\nBorealis changed {changed}/{len(dataset)} outputs ({changed_pct:.1f}%)")

In [None]:
# ============================================================
# Save final dataset + optional Hub push
# ============================================================

# Write the finished dataset and summarise what was written.
dataset.to_parquet(OUTPUT_PATH)
print(f"Saved to: {OUTPUT_PATH}")
print(f"  Rows: {len(dataset)}")
print(f"  Columns: {dataset.column_names}")

# Uploading is opt-in through the PUSH_TO_HUB flag.
if not PUSH_TO_HUB:
    print("\nSet PUSH_TO_HUB = True and update HUB_REPO_ID to push to the Hub.")
else:
    dataset.push_to_hub(HUB_REPO_ID, private=True)
    print(f"Pushed to Hub: {HUB_REPO_ID}")

# Cleanup notes: list only the working files that are actually on disk.
print("\nCheckpoint files (safe to delete after verifying output):")
for leftover in (QWEN_CHECKPOINT, BOREALIS_CHECKPOINT, QWEN_INTERMEDIATE):
    if not os.path.exists(leftover):
        continue
    print(f"  {leftover}")