In [1]:
import pandas as pd
from datasets import load_dataset
import os
from pathlib import Path

## Load the Dataset

In [2]:
# Load the CLINIC oos dataset from Hugging Face
dataset = load_dataset("clinc/clinc_oos", "plus")
print("Available splits:", list(dataset.keys()))

Available splits: ['train', 'validation', 'test']


In [3]:
# Display basic info about the dataset
print(dataset["train"])
print("Features:", dataset["train"].features)
print("First example:", dataset["train"][0])

Dataset({
    features: ['text', 'intent'],
    num_rows: 15250
})
Features: {'text': Value('string'), 'intent': ClassLabel(names=['restaurant_reviews', 'nutrition_info', 'account_blocked', 'oil_change_how', 'time', 'weather', 'redeem_rewards', 'interest_rate', 'gas_type', 'accept_reservations', 'smart_home', 'user_name', 'report_lost_card', 'repeat', 'whisper_mode', 'what_are_your_hobbies', 'order', 'jump_start', 'schedule_meeting', 'meeting_schedule', 'freeze_account', 'what_song', 'meaning_of_life', 'restaurant_reservation', 'traffic', 'make_call', 'text', 'bill_balance', 'improve_credit_score', 'change_language', 'no', 'measurement_conversion', 'timer', 'flip_coin', 'do_you_have_pets', 'balance', 'tell_joke', 'last_maintenance', 'exchange_rate', 'uber', 'car_rental', 'credit_limit', 'oos', 'shopping_list', 'expiration_date', 'routing', 'meal_suggestion', 'tire_change', 'todo_list', 'card_declined', 'rewards_balance', 'change_accent', 'vaccines', 'reminder_update', 'food_last', 'cha

## Define Intent Mapping

In [4]:
# We define our mapping from the source dataset's integer labels to our new intent names.

# Our intent : [list of clinic_oos integer labels]
INTENT_MAPPING = {
    "greeting": [82],			  # greeting
    "positive_reply": [107, 124], # yes, thank_you
    "negative_reply": [30],       # no
    "bye": [114],                 # goodbye
    "oos": [42]                   # oos
}

# To make lookup faster, we'll reverse the mapping
# {source_label: our_intent}
LABEL_TO_INTENT = {label: intent for intent, labels in INTENT_MAPPING.items() for label in labels}

## Extract and Remap Data from All Splits

In [5]:
# We'll iterate through every split (train, validation, test) and extract the
# entries that match our required intents.

all_entries = []

for split_name in dataset.keys():
    print(f"Processing split: '{split_name}'...")
    current_split = dataset[split_name]
    
    for entry in current_split:
        source_label = entry['intent']
        
        # Check if the entry's intent is one we need to map
        if source_label in LABEL_TO_INTENT:
            remapped_intent = LABEL_TO_INTENT[source_label]
            all_entries.append({
                "text": entry['text'],
                "intent": remapped_intent
            })

print(f"\nExtraction complete. Total remapped entries found: {len(all_entries)}")

Processing split: 'train'...
Processing split: 'validation'...
Processing split: 'test'...

Extraction complete. Total remapped entries found: 2100


## Visualize Entry Counts

In [6]:
# Before saving, let's create a DataFrame and visualize the number of
# entries we've gathered for each of our new intents.

df = pd.DataFrame(all_entries)
intent_counts = df['intent'].value_counts()

print("--- Number of Entries Gathered per Intent ---")
print(intent_counts)

--- Number of Entries Gathered per Intent ---
intent
oos               1350
positive_reply     300
negative_reply     150
bye                150
greeting           150
Name: count, dtype: int64


## Sampling for class balance

In [7]:
# Shuffle the DataFrame
df = df.sample(frac=1).reset_index(drop=True)

# --- Sampling Logic ---
# We want to sample 150 entries per intent.
N_SAMPLES = 150

# We group by 'intent' and then apply a sampling function to each group.
# The lambda function is key to handling intents with fewer than N_SAMPLES entries.
# It takes the minimum of the group size and our desired sample size.
# `random_state` ensures the sampling is reproducible.
sampled_df = (
    df.groupby('intent', group_keys=False)
    .apply(lambda x: x.sample(n=min(len(x), N_SAMPLES), random_state=42))
)

print("\n--- Sampled Dataset Info ---")
print(f"Total entries in sampled data: {len(sampled_df)}")
print("Sampled counts per intent:")
print(sampled_df['intent'].value_counts())




--- Sampled Dataset Info ---
Total entries in sampled data: 750
Sampled counts per intent:
intent
bye               150
greeting          150
negative_reply    150
oos               150
positive_reply    150
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=min(len(x), N_SAMPLES), random_state=42))


## Save Remapped Intents to CSV files

In [8]:
# Finally, we'll save each intent group to its own CSV file in the specified directory.
project_root = Path().cwd().parent
output_dir = str(project_root / "data" / "raw" / "intents")
os.makedirs(output_dir, exist_ok=True)

print(f"\nSaving files to '{output_dir}' directory...")

# Group the DataFrame by our new intent names
for intent_name, group_df in sampled_df.groupby('intent'):
    file_path = os.path.join(output_dir, f"{intent_name}.csv")
    
    # Select only the 'text' and 'intent' columns for saving
    save_df = group_df[['text', 'intent']]
    
    save_df.to_csv(file_path, index=False)
    print(f" -> Saved '{intent_name}.csv' with {len(save_df)} rows.")

print("\n✅ All files have been saved successfully.")


Saving files to 'c:\Users\aceto\Documents\GitHub\schedulebot-plus\data\raw\intents' directory...
 -> Saved 'bye.csv' with 150 rows.
 -> Saved 'greeting.csv' with 150 rows.
 -> Saved 'negative_reply.csv' with 150 rows.
 -> Saved 'oos.csv' with 150 rows.
 -> Saved 'positive_reply.csv' with 150 rows.

✅ All files have been saved successfully.
