In [1]:
from datasets import load_dataset, Features, ClassLabel, Value, Sequence
from transformers import AutoTokenizer
import os
from pathlib import Path

## Tokenizer

In [2]:
# The tokenizer must match the model that will be fine-tuned.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The entity/label alignment in `preprocess_function` relies on `word_ids()` and
# `return_offsets_mapping=True`, which are only available on fast tokenizers.
# Fail here with a clear message instead of deep inside `Dataset.map`.
if not tokenizer.is_fast:
    raise RuntimeError(
        f"A fast tokenizer is required for offset-based label alignment, "
        f"but '{model_name}' loaded a slow tokenizer."
    )

## Load the datasets
This cell loads the generated `train.jsonl`, `validation.jsonl`, and `test.jsonl` files.

In [3]:
# The project root is taken to be the parent of the current working directory;
# the raw JSONL splits are read from <project_root>/data/raw.
project_root = str(Path.cwd().resolve().parent)
dataset_dir = os.path.join(project_root, "data", "raw")

# One "<split>.jsonl" file per split, keyed by split name.
data_files = {
    split: os.path.join(dataset_dir, f"{split}.jsonl")
    for split in ("train", "validation", "test")
}
raw_datasets = load_dataset("json", data_files=data_files)

# Quick sanity check: split sizes/columns and one raw training record.
print(raw_datasets)
print("\nExample from training set:")
print(raw_datasets['train'][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'intent', 'entities'],
        num_rows: 2045
    })
    validation: Dataset({
        features: ['text', 'intent', 'entities'],
        num_rows: 438
    })
    test: Dataset({
        features: ['text', 'intent', 'entities'],
        num_rows: 439
    })
})

Example from training set:
{'text': 'what is albert einstein famous for', 'intent': 'oos', 'entities': []}


## Create Label Mappings
Now, we create the mappings from string labels (e.g., "schedule", "practitioner_name") to integer IDs. This is essential for training. We also need to create tags for the BIO (Beginning, Inside, Outside) entity scheme.

In [4]:
# --- Intent labels <-> integer ids ---
# The label set is taken from the training split only and sorted so that the
# ids are assigned in a stable, alphabetical order.
intent_labels = sorted(raw_datasets['train'].unique('intent'))
id2intent = dict(enumerate(intent_labels))
intent2id = {label: idx for idx, label in id2intent.items()}
print(f"Intent mapping (intent2id): {intent2id}\n")


# --- Entity (NER) tags <-> integer ids, BIO scheme ---
# Fixed list of entity types (not derived from the data). Any `label` used in
# the `entities` column must appear here, since tags are looked up in `ner2id`
# during preprocessing.
entity_labels = ["appointment_id", "appointment_type", "practitioner_name"]
# "O" (outside any entity) comes first, then a B-/I- pair per entity type.
ner_tags = ["O"] + [
    f"{prefix}-{entity_label}"
    for entity_label in entity_labels
    for prefix in ("B", "I")
]

id2ner = dict(enumerate(ner_tags))
ner2id = {tag: idx for idx, tag in id2ner.items()}
print(f"NER mapping (ner2id): {ner2id}")

Intent mapping (intent2id): {'bye': 0, 'cancel': 1, 'greeting': 2, 'negative_reply': 3, 'oos': 4, 'positive_reply': 5, 'query_avail': 6, 'reschedule': 7, 'schedule': 8}

NER mapping (ner2id): {'O': 0, 'B-appointment_id': 1, 'I-appointment_id': 2, 'B-appointment_type': 3, 'I-appointment_type': 4, 'B-practitioner_name': 5, 'I-practitioner_name': 6}


## Preprocessing function
This is the core function. It takes a batch of data examples (it is applied with `batched=True`), maps each intent to its integer ID, and does two things:
1. Tokenizes the text.
2. Aligns character-based entity spans (`start`, `end`) with the new wordpiece tokens, assigning the correct BIO tag ID to each token.

In [5]:
def _align_entity_labels(word_ids, offsets, entities):
    """Build the per-token BIO label ids for a single example.

    Args:
        word_ids: Per-token word indices as returned by ``word_ids()``;
            ``None`` marks tokens that are skipped (they keep the "O" id).
        offsets: Per-token ``(start, end)`` character offsets, aligned with
            ``word_ids``.
        entities: Iterable of dicts with ``'start'``, ``'end'`` and ``'label'``
            keys, or ``None``/empty when the example has no entities.

    Returns:
        A list of ``ner2id`` values, one per token.

    Raises:
        KeyError: If a matched entity's ``B-``/``I-`` tag is missing from ``ner2id``.
    """
    # NOTE(review): tokens with word_id None (special tokens) get the "O" id, not an
    # ignore index such as -100 -- confirm the training loss masks them as intended.
    label_ids = [ner2id["O"]] * len(word_ids)

    # `or ()` treats a missing (None) entity list the same as an empty one.
    for entity in entities or ():
        start_char, end_char, label = entity['start'], entity['end'], entity['label']

        # Tracks whether the first token of the current entity was already tagged.
        first_token_of_entity_found = False

        for j, (word_id, (token_start, token_end)) in enumerate(zip(word_ids, offsets)):
            if word_id is None:
                continue

            # A token belongs to the entity when the two character spans overlap.
            if start_char < token_end and end_char > token_start:
                # First overlapping token opens the entity (B-), the rest continue it (I-).
                prefix = "I" if first_token_of_entity_found else "B"
                label_ids[j] = ner2id[f"{prefix}-{label}"]
                first_token_of_entity_found = True

    return label_ids


def preprocess_function(examples):
    """Tokenize a batch and attach intent ids and token-level BIO label ids.

    Args:
        examples: Batch dict with ``'text'``, ``'intent'`` and ``'entities'``
            columns (lists of equal length). Each ``'entities'`` item is a list
            of dicts with character-based ``'start'``/``'end'`` and a ``'label'``;
            ``None`` is treated as "no entities".

    Returns:
        The tokenizer output with ``'intent_label'`` (one id per example) and
        ``'labels'`` (one BIO id per token) added and ``'offset_mapping'`` removed.

    Raises:
        KeyError: If an intent is missing from ``intent2id`` or a matched entity
            tag is missing from ``ner2id``.
    """
    # --- Intent Processing ---
    intent_ids = [intent2id[intent] for intent in examples['intent']]

    # --- Tokenization ---
    # Offsets are requested so character-based entity spans can be mapped to tokens.
    tokenized_inputs = tokenizer(examples['text'], truncation=True, is_split_into_words=False, return_offsets_mapping=True)

    # --- Entity (NER) Label Alignment ---
    # Look the offsets up once instead of once per token per entity.
    offset_mapping = tokenized_inputs['offset_mapping']
    ner_labels = [
        _align_entity_labels(tokenized_inputs.word_ids(batch_index=i), offset_mapping[i], entities)
        for i, entities in enumerate(examples['entities'])
    ]

    # Add the final processed labels to the tokenized inputs
    tokenized_inputs["intent_label"] = intent_ids
    tokenized_inputs["labels"] = ner_labels

    # Remove offset_mapping as it's not needed by the model
    tokenized_inputs.pop("offset_mapping", None)

    return tokenized_inputs

## Apply Preprocessing and Save
Now we apply this function to our entire dataset and save the final, processed version. This is what we will load directly in the fine-tuning script.

In [6]:
# Target schema for the processed dataset. The ClassLabel names are listed in
# id order (the mapping dicts were built by enumeration), so the stored integer
# ids carry their human-readable label names.
features = Features({
    'input_ids': Sequence(Value('int64')),
    'attention_mask': Sequence(Value('int8')),
    'intent_label': ClassLabel(names=list(intent2id)),
    'labels': Sequence(ClassLabel(names=list(ner2id))),
})

# Preprocess every split in batches, dropping the raw columns, then cast the
# result to the schema above.
processed_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets['train'].column_names,
).cast(features)

# Persist the processed dataset under <project_root>/data/processed.
output_dir = os.path.join(project_root, "data", "processed", "hasd_processed")
processed_datasets.save_to_disk(output_dir)

print("\nProcessed dataset saved successfully!")
print(processed_datasets)
print(processed_datasets['train'].features)

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/438 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2045 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/438 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/439 [00:00<?, ? examples/s]


Processed dataset saved successfully!
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'intent_label', 'labels'],
        num_rows: 2045
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'intent_label', 'labels'],
        num_rows: 438
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'intent_label', 'labels'],
        num_rows: 439
    })
})
{'input_ids': List(Value('int64')), 'attention_mask': List(Value('int8')), 'intent_label': ClassLabel(names=['bye', 'cancel', 'greeting', 'negative_reply', 'oos', 'positive_reply', 'query_avail', 'reschedule', 'schedule']), 'labels': List(ClassLabel(names=['O', 'B-appointment_id', 'I-appointment_id', 'B-appointment_type', 'I-appointment_type', 'B-practitioner_name', 'I-practitioner_name']))}
