# Phase 1: Improve Norwegian Alpaca Dataset with Borealis

This notebook uses **NbAiLab/borealis-4b-instruct-preview** (a Norwegian-focused Gemma 3 4B fine-tune) to improve the Norwegian text columns in the **NbAiLab/norwegian-alpaca** dataset.

The original dataset was machine-translated from Stanford Alpaca via gpt-3.5-turbo and contains translation artifacts, unnatural phrasing, and inconsistent quality. Borealis rewrites the three Norwegian columns (`instruction`, `input`, `output`) into natural, fluent Norwegian Bokmål while preserving the original meaning.

**Dataset:** 51,942 rows, 6 columns (3 Norwegian + 3 English)

**Pipeline:**
1. Load the dataset from HuggingFace
2. For each Norwegian text field, prompt Borealis with a few-shot Norwegian refinement prompt
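3. Validate responses (strip preambles, check for hallucinations, sanity-check length)
4. Save the improved dataset to Parquet, keeping each original Norwegian column alongside its `_improved` counterpart (optionally pushing it to the Hugging Face Hub)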

In [None]:
# Install the library (editable mode)
# On Colab, clone the repo first:
#   !git clone https://github.com/your-username/NORAI-Tools.git /content/NORAI-Tools
#   %pip install -e /content/NORAI-Tools

%pip install -e ..

from norai_tools import (
    AlpacaImprover,
    load_alpaca,
    save_improved,
    DEFAULT_MODEL,
    DEFAULT_BATCH_SIZE,
    DEFAULT_CHECKPOINT_EVERY,
    CHECKPOINT_FILE,
    OUTPUT_FILE,
)
import pandas as pd

In [None]:
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
# Every value a reader might want to tune lives in this cell. The DEFAULT_*
# constants and OUTPUT_FILE come from the norai_tools import above.

# Inference settings
MODEL_NAME = DEFAULT_MODEL
DEVICE = "auto"  # "auto" lets accelerate pick GPU/MPS/CPU
BATCH_SIZE = DEFAULT_BATCH_SIZE
CHECKPOINT_EVERY = DEFAULT_CHECKPOINT_EVERY

# Optional: HuggingFace Hub push (HUB_REPO_ID is only used when PUSH_TO_HUB is True)
PUSH_TO_HUB = False
HUB_REPO_ID = "your-username/norwegian-alpaca-improved"  # Change this

# Echo the effective settings, one per line, so they are recorded in the output.
print(
    f"Model: {MODEL_NAME}",
    f"Device: {DEVICE}",
    f"Batch size: {BATCH_SIZE}",
    f"Checkpoint every: {CHECKPOINT_EVERY} rows",
    f"Output: {OUTPUT_FILE}",
    sep="\n",
)

In [None]:
# ============================================================
# Load dataset from HuggingFace
# ============================================================

# load_alpaca() comes from norai_tools; the result is used below through
# len(), .column_names, .select() and .to_pandas().
dataset = load_alpaca()

print(f"Dataset loaded: {len(dataset)} rows")
print(f"Columns: {dataset.column_names}")

# Preview at most five rows. Clamping to len(dataset) keeps this cell working
# on a dataset shorter than five rows, where select(range(5)) would request
# indices that do not exist; it also matches the min(5, len(...)) clamp used
# by the inspection cell further down.
preview_rows = min(5, len(dataset))

pd.set_option("display.max_colwidth", 100)
display(
    dataset.select(range(preview_rows))
    .to_pandas()[["instruction", "instruction_en", "input", "output"]]
)

In [None]:
# ------------------------------------------------------------
# Load Borealis model
# ------------------------------------------------------------

# Constructor arguments all come from the configuration cell.
improver_kwargs = {
    "model_name": MODEL_NAME,
    "device": DEVICE,
    "batch_size": BATCH_SIZE,
}
improver = AlpacaImprover(**improver_kwargs)
improver.load_model()

# Report what was loaded; the dtype is read from the first model parameter.
print(f"Model loaded: {improver.model_name}")
print(f"Dtype: {next(improver.model.parameters()).dtype}")

In [None]:
# ------------------------------------------------------------
# Run improvement loop (with checkpoint support)
# ------------------------------------------------------------

# For a quick trial on a small subset, uncomment the next line:
# dataset = dataset.select(range(100))

# Checkpoint location and frequency are passed straight through to
# improve_dataset(); both values come from the configuration/import cells.
checkpoint_options = {
    "checkpoint_path": CHECKPOINT_FILE,
    "checkpoint_every": CHECKPOINT_EVERY,
}
improved_dataset = improver.improve_dataset(dataset, **checkpoint_options)

print(f"\nProcessed {len(improved_dataset)} rows.")
print(f"Columns: {improved_dataset.column_names}")

In [None]:
# ============================================================
# Inspect results — side-by-side comparison + change stats
# ============================================================

results_df = improved_dataset.to_pandas()
pd.set_option("display.max_colwidth", 120)

BANNER = "=" * 80


def _clip(value, max_chars=None):
    """Return ``value`` truncated to ``max_chars`` characters when it is a string.

    Non-string values (for example None/NaN in a missing cell) are returned
    unchanged, so formatting them prints their repr instead of raising the
    TypeError that slicing a non-string would.
    """
    if max_chars is not None and isinstance(value, str):
        return value[:max_chars]
    return value


def _print_comparison(frame, column, n_rows=5, max_chars=None):
    """Print original, improved and English text for the first rows of ``column``.

    Args:
        frame: DataFrame holding ``column``, ``{column}_improved`` and ``{column}_en``.
        column: Base name of the Norwegian column to compare.
        n_rows: Maximum number of rows to print (fewer if the frame is shorter).
        max_chars: If given, each printed text is cut to this many characters.
    """
    print(BANNER)
    print(f"{column.upper()}: Original vs Improved")
    print(BANNER)
    for i in range(min(n_rows, len(frame))):
        row = frame.iloc[i]
        print(f"\n--- Row {i} ---")
        print(f"  ORIGINAL:  {_clip(row[column], max_chars)}")
        print(f"  IMPROVED:  {_clip(row[column + '_improved'], max_chars)}")
        print(f"  ENGLISH:   {_clip(row[column + '_en'], max_chars)}")


def _count_changed(original, improved):
    """Count rows whose text differs between two Series.

    A plain ``!=`` reports a row as different when both sides are missing
    (NaN never compares equal to NaN), which would inflate the count; rows
    that are missing on both sides are therefore treated as unchanged.
    """
    both_missing = original.isna() & improved.isna()
    unchanged = original.eq(improved) | both_missing
    return int((~unchanged).sum())


# Instructions are short, so they are shown in full; outputs are clipped.
_print_comparison(results_df, "instruction")
print()
_print_comparison(results_df, "output", max_chars=200)

# Change statistics
print("\n" + BANNER)
print("Change Statistics")
print(BANNER)
total = len(results_df)
for col in ["instruction", "input", "output"]:
    changed = _count_changed(results_df[col], results_df[f"{col}_improved"])
    # Guard the percentage so an empty frame reports 0.0% instead of dividing by zero.
    pct = 100 * changed / total if total else 0.0
    print(f"  {col}: {changed}/{total} changed ({pct:.1f}%)")

In [None]:
# ------------------------------------------------------------
# Save output to parquet + optional Hub push
# ------------------------------------------------------------

# A repo id is only handed over when a push was requested; otherwise None.
hub_target = HUB_REPO_ID if PUSH_TO_HUB else None

save_improved(
    improved_dataset,
    path=OUTPUT_FILE,
    push_to_hub=PUSH_TO_HUB,
    hub_repo=hub_target,
)

# Summarise what was written.
print(f"Saved to: {OUTPUT_FILE}")
print(f"  Rows: {len(improved_dataset)}")
print(f"  Columns: {improved_dataset.column_names}")

# Remind the reader how to enable the upload when it was skipped.
if not PUSH_TO_HUB:
    print("\nSet PUSH_TO_HUB = True and update HUB_REPO_ID to push to the Hub.")