In [8]:
import pandas as pd
from datasets import load_dataset, ClassLabel
from pathlib import Path

In [9]:
# Load the CLINIC150-sur dataset from Hugging Face
full_dataset = load_dataset("ibm-research/clinic150-sur", split="train")  # 'train' is the only split and contains all data

# Display basic info about the dataset
print(full_dataset)
print("Features:", full_dataset.features)
print("First example:", full_dataset[0])


Dataset({
    features: ['intent', 'user_utterance', 'origin'],
    num_rows: 600000
})
Features: {'intent': Value('string'), 'user_utterance': Value('string'), 'origin': Value('string')}
First example: {'intent': 'accept_reservations', 'user_utterance': 'am i able to make reservations at spago in beverly hills', 'origin': 'original'}


In [10]:
# Define intent mapping
# We map a subset of the original intents to our custom intents
INTENT_MAP = {
    # "avail" -> check availability
    "calendar": "avail",
    "how_busy": "avail",
    "date": "avail",
    "meeting_schedule": "avail",
    
	# "book" -> book an appointment
    "accept_reservation": "book",
    "schedule_meeting": "book",
    "yes": "book",
    
	# "bye" -> end conversation
    "goodbye": "bye",
    "thank_you": "bye",
    
	# "cancel" -> cancel an appointment
    "cancel_reservation": "cancel",
    "no": "cancel",
    
	# "greet" -> start conversation
    "greeting": "greet",
    
	# "resched" -> reschedule an appointment
    "calendar_update": "resched",
    "reminder_update": "resched"
}

In [11]:
# Filter the dataset to keep only examples whose intent is in INTENT_MAP
filtered_dataset = full_dataset.filter(
	lambda example: example['intent'] in INTENT_MAP
)
print(f"Filtered dataset size: {len(filtered_dataset)}")

Filtered dataset size: 97891


In [12]:
# Apply the INTENT_MAP to create a new column 'new_intent' in filtered_dataset
def map_intent(example):
	return {'new_intent': INTENT_MAP[example['intent']]}

processed_dataset = filtered_dataset.map(map_intent)
print("Features:", processed_dataset.features)

Features: {'intent': Value('string'), 'user_utterance': Value('string'), 'origin': Value('string'), 'new_intent': Value('string')}


In [14]:
# Get unique new_intent values and sort them for consistent label assignment
unique_intents = sorted(set(processed_dataset['new_intent']))
new_intent_to_id = {intent: idx for idx, intent in enumerate(unique_intents)}

# Create a ClassLabel feature
class_label = ClassLabel(names=unique_intents)

# Add 'label' column by mapping 'new_intent' to its integer id
def encode_label(example):
	return {'label': new_intent_to_id[example['new_intent']]}

final_dataset = processed_dataset.map(encode_label)
final_dataset = final_dataset.cast_column('label', class_label)
print("Features:", final_dataset.features)
print("First example:", final_dataset[0])
print("New intents and their IDs:", new_intent_to_id)

Features: {'intent': Value('string'), 'user_utterance': Value('string'), 'origin': Value('string'), 'new_intent': Value('string'), 'label': ClassLabel(names=['avail', 'book', 'bye', 'cancel', 'greet', 'resched'])}
First example: {'intent': 'calendar', 'user_utterance': 'anything on the schedule for october 14th', 'origin': 'original', 'new_intent': 'avail', 'label': 0}
New intents and their IDs: {'avail': 0, 'book': 1, 'bye': 2, 'cancel': 3, 'greet': 4, 'resched': 5}


In [15]:
# We will split the dataset into training and testing sets (80/20 split)
# First split: 15% test, 85% train+val
split_1 = final_dataset.train_test_split(test_size=0.15, stratify_by_column='label', seed=42)

# Second split: from train+val, 15/85 ≈ 0.176 for validation (so val is 15% of total)
split_2 = split_1['train'].train_test_split(test_size=0.176, stratify_by_column='label', seed=42)

# Assemble splits
data_split = {
    'train': split_2['train'],
    'val': split_2['test'],
    'test': split_1['test']
}

# Convert to pandas DataFrame for easy saving to CSV
train_df = data_split['train'].to_pandas()
val_df = data_split['val'].to_pandas()
test_df = data_split['test'].to_pandas()


In [16]:
def print_class_distribution(df, label_col):
    """
    Print the class distribution for a given label column in a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        label_col (str): The name of the column containing class labels.

    Prints:
        A DataFrame showing the count and percentage of each class.
    """
    # Count the occurrences of each class
    counts = df[label_col].value_counts(dropna=False)
    # Calculate the percentage of each class
    percentages = df[label_col].value_counts(normalize=True, dropna=False) * 100
    # Display the results as a DataFrame
    print(pd.DataFrame({'count': counts, 'percent': percentages.round(2)}))

In [20]:
# Disolay the sizes of the splits
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Display class distributions for each split
print(f"\nTrain set class distribution:")
print_class_distribution(train_df, 'new_intent')
print(f"\nValidation set class distribution:")
print_class_distribution(val_df, 'new_intent')
print(f"\nTest set class distribution:")
print_class_distribution(test_df, 'new_intent')

Train: 68562, Val: 14645, Test: 14684

Train set class distribution:
            count  percent
new_intent                
bye         18278    26.66
book        13938    20.33
cancel      11804    17.22
avail       11514    16.79
greet       10180    14.85
resched      2848     4.15

Validation set class distribution:
            count  percent
new_intent                
bye          3904    26.66
book         2977    20.33
cancel       2521    17.21
avail        2459    16.79
greet        2175    14.85
resched       609     4.16

Test set class distribution:
            count  percent
new_intent                
bye          3915    26.66
book         2985    20.33
cancel       2528    17.22
avail        2466    16.79
greet        2180    14.85
resched       610     4.15


In [18]:
# Get the project root (parent of the current notebook)
project_root = Path().cwd().resolve().parent
output_dir = project_root / "data" / "processed"

# Save the files
train_df.to_csv(output_dir / "train.csv", index=False)
val_df.to_csv(output_dir / "val.csv", index=False)
test_df.to_csv(output_dir / "test.csv", index=False)

print(f"Processed data saved to {output_dir}")

Processed data saved to C:\Users\aceto\Documents\GitHub\schedulebot-plus\data\processed
