In [None]:
import json
import requests

# Source of the CLINC OOS-eval "small" dataset (JSON keyed by split name).
url = "https://raw.githubusercontent.com/clinc/oos-eval/master/data/data_small.json"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (404, 5xx, ...) into an explicit
# exception instead of a confusing JSON decode error or KeyError further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

# Quick sanity check of the splits and their sizes.
print("Available splits:", data.keys())
print("Number of training examples:", len(data["train"]))
print("Number of validation examples:", len(data["val"]))
print("Number of test examples:", len(data["test"]))

# Each example is a two-element list: [utterance, intent].
print("\nSample training examples (utterance, intent):")
for example in data["train"][:3]:
    print(example)

Available splits: dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])
Number of training examples: 7500
Number of validation examples: 3000
Number of test examples: 4500

Sample training examples (utterance, intent):
['can you walk me through setting up direct deposits to my bank of internet savings account', 'direct_deposit']
['i want to switch to direct deposit', 'direct_deposit']
['set up direct deposit for me', 'direct_deposit']


In [None]:
# Append the out-of-scope ("oos_*") examples to each in-domain split so that
# every split contains both kinds of utterance.
train_data = [*data["train"], *data["oos_train"]]
val_data = [*data["val"], *data["oos_val"]]
test_data = [*data["test"], *data["oos_test"]]

print(f"Train size: {len(train_data)}")
print(f"Val size: {len(val_data)}")
print(f"Test size: {len(test_data)}")


Train size: 7600
Val size: 3100
Test size: 5500


In [None]:

import pandas as pd
import numpy as np


# Turn each list of [utterance, intent] pairs into a two-column DataFrame.
train_df, val_df, test_df = (
    pd.DataFrame(split_rows, columns=["text", "intent"])
    for split_rows in (train_data, val_data, test_data)
)

# The tail of the training frame holds the appended out-of-scope examples.
print("\nSample training data:", train_df.tail(), sep="\n")


Sample training data:
                                                   text intent
7595  what percentage of species display cold bloode...    oos
7596              what does it mean to be an alpha male    oos
7597                      what animals have alpha males    oos
7598                      why do males want to be alpha    oos
7599  what's the average battery life of an android ...    oos


In [None]:
# Count the distinct intent labels present in the training split.
num_intents = train_df["intent"].nunique()
print(f"Number of unique intents: {num_intents}")
print(f"\nSample intents: {train_df['intent'].unique()[:10]}")


Number of unique intents: 151

Sample intents: ['direct_deposit' 'carry_on' 'whisper_mode' 'text' 'recipe' 'smart_home'
 'who_do_you_work_for' 'rewards_balance' 'restaurant_reservation'
 'travel_notification']


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the intent -> integer mapping on the training labels only, then reuse
# the same mapping for the validation and test labels.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df["intent"])
val_labels, test_labels = (
    label_encoder.transform(frame["intent"]) for frame in (val_df, test_df)
)

print(f"Number of unique classes: {len(label_encoder.classes_)}")
print(f"Sample classes: {label_encoder.classes_[:10]}")
print(f"Encoded sample: {train_labels[:10]}")


Number of unique classes: 151
Sample classes: ['accept_reservations' 'account_blocked' 'alarm' 'application_status'
 'apr' 'are_you_a_bot' 'balance' 'bill_balance' 'bill_due' 'book_flight']
Encoded sample: [35 35 35 35 35 35 35 35 35 35]


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

# Load the pretrained sentence encoder and embed the text column of every
# split as a NumPy array.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
train_vecs, val_vecs, test_vecs = (
    sbert_model.encode(frame["text"].tolist(), convert_to_numpy=True)
    for frame in (train_df, val_df, test_df)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

# Logistic-regression intent classifier trained on the sentence embeddings.
# max_iter=2000 gives the solver a generous iteration budget.
clf = LogisticRegression(max_iter=2000)
# train_labels include the "oos" class, so out-of-scope is learned as one of
# the classifier's output classes. fit() is left as the cell's last expression
# so the notebook displays the fitted estimator.
clf.fit(train_vecs, train_labels)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

def _report_split(split_name, y_true, y_pred):
    """Print accuracy and the per-class report for one split.

    Args:
        split_name: Label used in the printed headers (e.g. "Validation").
        y_true: Encoded ground-truth labels for the split.
        y_pred: Encoded labels predicted by the classifier.

    Returns:
        The accuracy score for the split.
    """
    acc = accuracy_score(y_true, y_pred)
    print(f"{split_name} Accuracy: {acc:.3f}\n")

    print(f"{split_name} Classification Report:")
    # target_names maps the integer labels back to intent names in the report.
    print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))
    return acc


# The header is printed once, before each report; the stray duplicate header
# that used to follow every report has been removed.
val_preds = clf.predict(val_vecs)
val_acc = _report_split("Validation", val_labels, val_preds)

test_preds = clf.predict(test_vecs)
test_acc = _report_split("Test", test_labels, test_preds)


Validation Accuracy: 0.924

Validation Classification Report:
                           precision    recall  f1-score   support

      accept_reservations       0.80      1.00      0.89        20
          account_blocked       1.00      0.95      0.97        20
                    alarm       1.00      1.00      1.00        20
       application_status       0.95      1.00      0.98        20
                      apr       1.00      1.00      1.00        20
            are_you_a_bot       0.95      0.95      0.95        20
                  balance       0.89      0.85      0.87        20
             bill_balance       1.00      0.85      0.92        20
                 bill_due       1.00      1.00      1.00        20
              book_flight       1.00      1.00      1.00        20
               book_hotel       0.95      0.90      0.92        20
               calculator       0.87      1.00      0.93        20
                 calendar       0.90      0.95      0.93        20

# OOS Handling

1. Find the index of the OOS class and retrieve the probability distribution that the logistic regression produces across all of our classes:

In [None]:
# Position of the out-of-scope label within the encoder's class list.
oos_name = "oos"
class_names = label_encoder.classes_
oos_idx = class_names.tolist().index(oos_name)

# Per-class probabilities for every validation and test embedding.
val_probs, test_probs = (
    clf.predict_proba(split_vecs) for split_vecs in (val_vecs, test_vecs)
)

2. We build a binary ground truth from the OOS label: 1 = OOS, 0 = in-domain.


In [None]:
# Binary targets for OOS detection: 1 where the true label is the OOS class,
# 0 for every in-domain intent.
y_true_val_binary = np.equal(val_labels, oos_idx).astype(int)
y_true_test_binary = np.equal(test_labels, oos_idx).astype(int)

3. This code finds the best OOS rejection threshold by testing values from 0.1 to 0.9. For each threshold, if the model's max probability is below it, the sample is marked OOS; otherwise it's in-domain. Precision, recall, and F1 are computed, and the threshold with the highest F1 on validation is selected as the best trade-off, then used on the test set.

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Candidate rejection thresholds: nine evenly spaced values from 0.1 to 0.9.
thresholds = np.linspace(0.1, 0.9, 9)
best_f1, best_th = -1, None

# Highest class probability per validation sample. It does not depend on the
# threshold, so it is computed once rather than on every loop iteration.
max_probs = val_probs.max(axis=1)

# NOTE(review): this rule looks only at the top probability. A sample that the
# classifier assigns to the trained "oos" class with confidence >= th is
# counted as in-domain here -- confirm that is the intended decision rule.
for th in thresholds:
    # Flag a sample as OOS (1) when its top probability falls below `th`.
    y_pred_val_binary = (max_probs < th).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true_val_binary, y_pred_val_binary, average="binary", zero_division=0
    )
    print(f"Threshold {th:.2f} -> Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}")
    # Strict ">" keeps the lowest threshold when several tie on F1.
    if f1 > best_f1:
        best_f1, best_th = f1, th

print(f"\nBest threshold on val: {best_th:.2f} with F1: {best_f1:.3f}")


Threshold 0.10 -> Precision: 0.327, Recall: 0.330, F1: 0.328
Threshold 0.20 -> Precision: 0.218, Recall: 0.770, F1: 0.340
Threshold 0.30 -> Precision: 0.143, Recall: 0.890, F1: 0.246
Threshold 0.40 -> Precision: 0.096, Recall: 0.940, F1: 0.175
Threshold 0.50 -> Precision: 0.070, Recall: 0.990, F1: 0.131
Threshold 0.60 -> Precision: 0.053, Recall: 1.000, F1: 0.100
Threshold 0.70 -> Precision: 0.042, Recall: 1.000, F1: 0.081
Threshold 0.80 -> Precision: 0.035, Recall: 1.000, F1: 0.068
Threshold 0.90 -> Precision: 0.032, Recall: 1.000, F1: 0.063

Best threshold on val: 0.20 with F1: 0.340


Next we apply the best threshold to flag OOS samples on validation and test sets.
It classifies each sample as OOS if its max prediction probability is below the threshold.
classification_report then shows precision, recall, and F1 for OOS detection.

In [None]:
from sklearn.metrics import classification_report

# Validation: flag a sample as OOS (1) when its highest class probability is
# below the selected threshold, then report in-domain vs OOS metrics.
max_probs_val = np.max(val_probs, axis=1)
y_pred_val_binary = np.less(max_probs_val, best_th).astype(int)
print("\nBinary OOS detection on val:")
print(
    classification_report(
        y_true_val_binary,
        y_pred_val_binary,
        target_names=["In-domain", "OOS"],
        zero_division=0,
    )
)

# Test: same rule, same threshold (chosen on validation only).
max_probs_test = np.max(test_probs, axis=1)
y_pred_test_binary = np.less(max_probs_test, best_th).astype(int)
print("\nBinary OOS detection on test:")
print(
    classification_report(
        y_true_test_binary,
        y_pred_test_binary,
        target_names=["In-domain", "OOS"],
        zero_division=0,
    )
)



Binary OOS detection on val:
              precision    recall  f1-score   support

   In-domain       0.99      0.91      0.95      3000
         OOS       0.22      0.77      0.34       100

    accuracy                           0.90      3100
   macro avg       0.60      0.84      0.64      3100
weighted avg       0.97      0.90      0.93      3100


Binary OOS detection on test:
              precision    recall  f1-score   support

   In-domain       0.95      0.91      0.93      4500
         OOS       0.67      0.77      0.72      1000

    accuracy                           0.89      5500
   macro avg       0.81      0.84      0.82      5500
weighted avg       0.90      0.89      0.89      5500



In [None]:
# Predicted class ids for the test embeddings.
test_preds_labels = clf.predict(test_vecs)

# Map ids to intent names, replacing the prediction with the OOS label
# wherever the threshold rule flagged the sample as out-of-scope.
test_preds_names = np.where(
    y_pred_test_binary == 1,
    oos_name,
    np.array([class_names[label_id] for label_id in test_preds_labels]),
)

# Ground-truth intent names for the same samples.
test_true_names = np.array([class_names[label_id] for label_id in test_labels])

print("\nFull classification report on test (with OOS threshold):")
print(classification_report(test_true_names, test_preds_names, zero_division=0))



Full classification report on test (with OOS threshold):
                           precision    recall  f1-score   support

      accept_reservations       0.90      0.90      0.90        30
          account_blocked       0.96      0.83      0.89        30
                    alarm       1.00      0.93      0.97        30
       application_status       1.00      1.00      1.00        30
                      apr       0.93      0.93      0.93        30
            are_you_a_bot       1.00      0.93      0.97        30
                  balance       0.88      0.77      0.82        30
             bill_balance       0.96      0.77      0.85        30
                 bill_due       0.96      0.90      0.93        30
              book_flight       1.00      0.87      0.93        30
               book_hotel       1.00      0.87      0.93        30
               calculator       0.87      0.87      0.87        30
                 calendar       0.95      0.70      0.81        30
   

In [None]:
import joblib, json
from pathlib import Path

# Folder that collects everything needed to reload the pipeline later.
export_dir = Path("intent_oos_model")
export_dir.mkdir(exist_ok=True)

# Serialize the classifier and the label encoder, then the sentence encoder.
for artifact, filename in (
    (clf, "intent_oos_model.pkl"),
    (label_encoder, "label_encoder.pkl"),
):
    joblib.dump(artifact, export_dir / filename)
sbert_model.save(str(export_dir / "sbert_model"))

# Settings needed at inference time: OOS label, rejection threshold, classes.
config = dict(
    oos_name=oos_name,
    best_threshold=float(best_th),
    num_classes=len(label_encoder.classes_),
    class_names=label_encoder.classes_.tolist(),
)

(export_dir / "oos_config.json").write_text(json.dumps(config, indent=4))

print(" All components saved successfully in folder:", export_dir.resolve())

print("\nSaved files:")
for entry in export_dir.iterdir():
    print(f" - {entry.name}")


âœ… All components saved successfully in folder: /content/intent_oos_model

Saved files:
 - label_encoder.pkl
 - intent_oos_model.pkl
 - sbert_model
 - oos_config.json


In [None]:
import shutil
from google.colab import files
# Zip the export folder next to it and hand the archive to the Colab
# browser-download helper.
model_dir = "intent_oos_model"

zip_path = shutil.make_archive(base_name=model_dir, format="zip", root_dir=model_dir)
print(f" Model zipped successfully at: {zip_path}")
files.download(zip_path)


ðŸ“¦ Model zipped successfully at: /content/intent_oos_model.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>