## Setup - Don't read - just run

In [None]:
%%capture
!pip install --upgrade datasets

In [None]:
import datasets
from datasets import load_dataset, concatenate_datasets
from openai import OpenAI
import random
from typing import List, Tuple
from tqdm import tqdm
import pandas as pd
import time
import re
from getpass import getpass

In [None]:

# Matches `"label": "<value>"` or `'label': '<value>'` (key is case-insensitive).
# Group 1 holds a double-quoted value, group 2 a single-quoted one.
LABEL_RE = re.compile(
    r"""["']label["']\s*:\s*(?:"([^"]+)"|'([^']+)')""",
    re.I,
)

def extract_label(raw: str) -> str:
    """
    Return the first label found in a model reply, normalised to lower case.

    The reply is scanned for the first `"label": "…"` pair rather than parsed
    as JSON, so it works even if the JSON block has minor syntax issues
    (e.g. trailing commas, single quotes, extra keys) or is preceded by
    free-form reasoning text.

    Args:
        raw: Raw text returned by the model. ``None`` and empty strings are
            tolerated and treated as "no label found".

    Returns:
        The label value, stripped and lower-cased, or ``'oos'`` when no label
        pair is present.
    """
    if not raw:
        # A missing/empty reply carries no label; use the out-of-scope fallback
        # instead of letting re.search raise on None.
        return 'oos'
    m = LABEL_RE.search(raw)
    if not m:
        return 'oos'
    # Exactly one of the two groups is set, depending on the quote style used.
    value = m.group(1) if m.group(1) is not None else m.group(2)
    return value.strip().lower()

def get_clinc_test_subset(k: int,
                          n_per_class: int,
                          seed: int = 42):
    """
    Build a small, class-balanced sample of the CLINC-OOS "plus" test split.

    Args:
        k: Total number of intents to keep. "oos" is always one of them; the
            other ``k - 1`` are drawn at random from the remaining intents.
        n_per_class: Maximum number of examples to keep per intent (fewer are
            kept if the class is smaller).
        seed: Seed used for the intent draw and for every shuffle.

    Returns:
        A dict with ``"intents"`` (the chosen intent names, "oos" first) and
        ``"test"`` (the shuffled, concatenated Dataset).
    """
    test_split = load_dataset("clinc_oos", "plus", split="test")
    label_names = test_split.features["intent"].names

    # Draw the in-scope intents with a private, seeded RNG so the choice is
    # reproducible and does not disturb the global random state.
    in_scope = [name for name in label_names if name != "oos"]
    chosen = ["oos"] + random.Random(seed).sample(in_scope, k - 1)
    name_to_id = {name: idx for idx, name in enumerate(label_names)}

    def _sample_class(name):
        # Keep only rows of this intent, then take a shuffled prefix.
        target = name_to_id[name]
        rows = test_split.filter(lambda ex: ex["intent"] == target)
        n_take = min(n_per_class, len(rows))
        return rows.shuffle(seed=seed).select(range(n_take))

    parts = [_sample_class(name) for name in chosen]
    # Shuffle once more so classes are interleaved rather than grouped.
    balanced = concatenate_datasets(parts).shuffle(seed=seed)

    return {"intents": chosen, "test": balanced}

## LLM Clients
Please set up your LLM API key.

In [None]:
# Connection settings for the LLM provider. The key is read interactively
# with getpass; LLM_URL and MODEL_ID are passed to LLMClient below.
LLM_API_KEY = getpass("Please go to aibrary.dev - sign up, get your key and paste it here ")
LLM_URL = "https://api.aibrary.dev/v0"
MODEL_ID = "Meta-Llama-3.1-8B-Instruct-Turbo"

# OR

# Alternative provider: uncomment these three lines (and comment out the ones
# above) to use together.ai instead.
# LLM_API_KEY = getpass("Please go to together.ai - sign up, get your key and paste it here ")
# LLM_URL = "https://api.together.xyz/v1"
# MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"

In [None]:
class LLMClient:
    """
    Minimal wrapper around an OpenAI-compatible chat-completions endpoint.

    Args:
        api_key: API key passed to the OpenAI client.
        model_name: Model identifier sent with every request.
        base_url: Base URL of the OpenAI-compatible API.
        temperature: Sampling temperature used for every call.
        max_delay: Upper bound, in seconds, of the random pause taken after
            each request (presumably to stay under provider rate limits).
            Set to 0 to disable the pause.
    """

    def __init__(self, api_key: str, model_name: str, base_url: str,
                 temperature: float = 0.0, max_delay: float = 1.0):
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model_name = model_name
        self.temperature = temperature
        self.max_delay = max_delay

    def call(self, user_prompt: str, system_prompt: str = '') -> str:
        """
        Send one system + user message pair and return the reply text.

        Args:
            user_prompt: Content of the user message.
            system_prompt: Content of the system message (empty by default).

        Returns:
            The text of the first choice, or an empty string when the API
            returns no text content.
        """
        resp = self.client.chat.completions.create(
            model=self.model_name,
            temperature=self.temperature,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        if self.max_delay > 0:
            # Random pause in [0, max_delay) seconds between requests.
            time.sleep(random.uniform(0, self.max_delay))
        content = resp.choices[0].message.content
        # `content` is nullable in the API; callers rely on getting a str
        # (they call .strip() / regex-search on it).
        return content if content is not None else ""

In [None]:
# Default client (temperature left at 0.0) used for the single-call experiments.
llm_client = LLMClient(api_key=LLM_API_KEY, model_name=MODEL_ID, base_url=LLM_URL)

In [None]:
# Smoke test: one request with an empty system prompt to check the key/endpoint.
llm_client.call('are you available?')

## Dataset

In [None]:
%%capture
intents_dataset = get_clinc_test_subset(10, 10) # gets 10 classes, 10 samples per class
# Unpack the helper's result: the sampled examples and the chosen intent names.
intent_ds = intents_dataset['test']
intents = intents_dataset['intents']


In [None]:
# Peek at a few samples (indices 0, 5, 10, 15) with their human-readable label.
for i in range(0,16, 5):
    example = intent_ds[i]
    print(f"Text: {example['text']}")
    # 'intent' is stored as an integer id; map it back to its name.
    print(f"Label: {intent_ds.features['intent'].names[example['intent']]}")
    print("---")

In [None]:
# Display the selected intent names ('oos' first, then the randomly drawn ones).
intents

## LLM for Intent Classification

In [None]:
def build_system_prompt(intents: List[str]) -> str:
    """
    Build the plain (no reasoning) classifier system prompt.

    The prompt tells the model to answer with exactly one intent label, or
    'oos' when nothing matches, and ends with the comma-separated intents.
    """
    preamble = "".join((
        "You are an intent classifier for a voice assistant. ",
        "Return exactly one intent label from the list below. ",
        "If the user request does not match any intent, return 'oos'.\n\n",
    ))
    intent_list = ", ".join(intents)
    return f"{preamble}{intent_list}"
# Build the prompt once for the sampled intents and show it for inspection.
system_prompt = build_system_prompt(intents)
print(system_prompt)

In [None]:
# Hand-written user message to try the classifier prompt on a single utterance.
user_prompt = """
User: i think i've misplaced my phone
"""
llm_client.call(user_prompt, system_prompt)

In [None]:
def build_user_prompt(utterance) -> str:
    """Wrap an utterance as a single 'User: …' line ending in a newline."""
    return "User: {}\n".format(utterance)

# Same idea as the hand-written prompt, now produced by the helper.
user_prompt = build_user_prompt("where did i place my phone")
print(user_prompt)

In [None]:
# Classify the single utterance built above.
llm_client.call(user_prompt, system_prompt)

In [None]:
class InferenceRunner:
    """
    Runs an LLM over a dataset split, one request per example.

    The prompt builders are injected so the same runner can be reused with
    different system/user prompt strategies.
    """

    def __init__(self, llm_client, build_system_prompt, build_user_prompt):
        # Object exposing `call(user_prompt, system_prompt) -> str`.
        self.llm_client = llm_client
        # Callable: intents -> system prompt string.
        self.build_system_prompt = build_system_prompt
        # Callable: utterance text -> user prompt string.
        self.build_user_prompt = build_user_prompt

    def run(self, dataset_split, intents):
        """
        Classify every example in `dataset_split`.

        Returns a list of dicts with the example's "text" and the model's
        reply as "pred" (whitespace-stripped and lower-cased).
        """
        # The system prompt depends only on the intent list, so build it once.
        system_message = self.build_system_prompt(intents)

        def _predict(example):
            text = example["text"]
            reply = self.llm_client.call(self.build_user_prompt(text), system_message)
            return {"text": text, "pred": reply.strip().lower()}

        return [_predict(ex) for ex in tqdm(dataset_split, desc="inference")]

In [None]:
# Baseline run: plain classifier prompt, one request per example.
inf_runner = InferenceRunner(llm_client, build_system_prompt, build_user_prompt)
results = inf_runner.run(intent_ds, intents)

## Evaluation

In [None]:
def to_dataframe(predictions, dataset_split, intents):
    """
    Pair each prediction with its ground-truth label in a DataFrame.

    Predictions and dataset rows are matched by position. The integer
    "intent" id of each row is translated to its name through the split's
    feature metadata. `intents` is accepted but not used.

    Returns a DataFrame with columns "text", "pred" and "label".
    """
    rows = [
        {
            "text": prediction["text"],
            "pred": prediction["pred"],
            "label": dataset_split.features['intent'].names[example["intent"]],
        }
        for prediction, example in zip(predictions, dataset_split)
    ]
    return pd.DataFrame(rows)

# Accuracy of the baseline run: the raw (stripped, lower-cased) reply must
# equal the label name exactly to count as correct.
df = to_dataframe(results, intent_ds, intents)
accuracy = (df["pred"] == df["label"]).mean()
print("Accuracy:", 100*accuracy, "%")

## Error Analysis

In [None]:
# Rows where the baseline prediction differs from the ground truth.
error_df = df[df.label != df.pred]

In [None]:
# Print every misclassified query with its expected and predicted label.
for _, row in error_df.iterrows():
  print('Query:', row['text'])
  print('Expected:', row['label'], 'Predicted:', row['pred'])
  print('----')

In [None]:
def build_system_prompt_cot(intents):
    """
    Build the chain-of-thought variant of the classifier system prompt.

    The model is told to reason step by step and then emit a one-line JSON
    object holding the chosen label; the valid intents are appended as a
    comma-separated list.
    """
    pieces = (
        "You are an intent classifier for a voice assistant.\n",
        "Think step-by-step first.\n",
        "Then, on a new line, output **exactly** this one-line JSON:\n",
        '{"label": "<one of these intents or \'oos\'>"}\n',
        "No markdown, no extra keys, no trailing commas.\n\n",
        "Valid intents: ",
        ", ".join(intents),
    )
    return "".join(pieces)

# Re-run inference with the chain-of-thought prompt; `results` now holds the
# full replies (reasoning plus the JSON line), replacing the baseline results.
inf_runner = InferenceRunner(llm_client, build_system_prompt_cot, build_user_prompt)
results = inf_runner.run(intent_ds, intents)

In [None]:
# Score the chain-of-thought run: pull the label out of each reply first,
# then compare it with the ground truth.
df = to_dataframe(results, intent_ds, intents)
df['pred_label'] = df['pred'].apply(extract_label)
accuracy = (df["pred_label"] == df["label"]).mean()
print("Accuracy:", 100*accuracy, "%")

In [None]:
# Rows the chain-of-thought run still gets wrong.
error_df = df[df.label != df.pred_label]
for _, row in error_df.iterrows():
  print('Query:', row['text'])
  print('Expected:', row['label'])
  # Prints the full reply ('pred'), not just the extracted label, so the
  # model's reasoning is visible.
  print('Predicted:', row['pred'])
  print('----')

## Self-consistency

In [None]:
def majority_vote_label(client, utter: str, sys: str, m: int = 3) -> str:
    """
    Query the model `m` times for one utterance and return the most common label.

    Args:
        client: Object exposing `call(user_prompt, system_prompt)`.
        utter: The utterance to classify.
        sys: System prompt sent with every request.
        m: Number of independent requests to make.

    Returns:
        The label extracted most often; on a tie, the one seen first.
    """
    from collections import Counter

    votes = Counter()
    for _ in range(m):
        reply = client.call(build_user_prompt(utter), sys)
        votes[extract_label(reply)] += 1
    winner, _count = votes.most_common(1)[0]
    return winner


In [None]:
# Separate client with a non-zero temperature so repeated calls on the same
# utterance can produce different answers.
llm_temp = LLMClient(
    api_key=LLM_API_KEY,
    model_name=MODEL_ID,
    base_url=LLM_URL,
    temperature=0.8          # diversity is essential for voting
)
# majority_vote_label parses each reply with extract_label, which only
# recognises the {"label": "..."} JSON line requested by the chain-of-thought
# prompt. With the plain build_system_prompt the model answers with a bare
# label, which extract_label cannot find and reports as 'oos' every time.
sys_prompt = build_system_prompt_cot(intents)

In [None]:
# Work on a copy so adding a column does not write into a slice of `df`.
error_df = error_df.copy()
# Re-classify only the previously misclassified rows, three sampled calls each.
error_df["pred_mv"] = error_df["text"].apply(
    lambda t: majority_vote_label(llm_temp, t, sys_prompt, m=3)
)

In [None]:
# Count how many of the earlier errors the majority vote now gets right.
fixed = (error_df.pred_mv == error_df.label).sum()
n_errors = len(error_df)
if n_errors:
    print(f"Self-consistency corrected {fixed}/{n_errors} errors "
          f"({fixed/n_errors*100:.1f} %).")
else:
    # Nothing was misclassified, so there is no ratio to report (avoids 0/0).
    print("Self-consistency corrected 0/0 errors (no errors to correct).")

## Activity - Trim the Thoughts, Keep the Smarts

### Chain of Thought is taking x times longer
Your task is to reduce latency and token usage by guiding the model to reason only briefly before it outputs its decision.

In [None]:
# todo