<!-- ## Imports -->

In [35]:
import pandas as pd
import json
import os

from transformers import AutoTokenizer

In [36]:
# 1. Load the DSTC8 dialogues JSON file
with open('../../data/train/dialogues_004.json', 'r', encoding='utf-8') as f:
    dialogues = json.load(f)

# 2. Define target domains & intents for Services and Events
target_domains = ['Services_1', 'Services_2', 'Services_3', 'Events_1', 'Events_2']
target_intents = {
    'Services': ['BookAppointment', 'FindProvider'],
    'Events': ['FindEvents', 'BuyEventTickets', 'GetEventDates']
}

# 3. Prepare list to hold extracted examples
examples = []

for dialogue in dialogues:
    # Filter dialogues to only those with at least one target domain
    if not any(domain in dialogue['services'] for domain in target_domains):
        continue  # Skip dialogues outside our target domains
    
    for turn in dialogue['turns']:
        # Each turn can have multiple frames (one per service)
        frames = turn.get('frames', [])
        for frame in frames:
            service = frame.get('service')
            # Skip if service not in target domains
            if service not in target_domains:
                continue
            
            # Get dialog state info
            state = frame.get('state', {})
            intent = state.get('active_intent')
            if not intent:
                continue
            
            # Check if intent is in our target intents for this domain
            domain_key = 'Services' if 'Services' in service else 'Events'
            if intent not in target_intents[domain_key]:
                continue
            
            # Extract utterance text from the turn
            utterance = turn.get('utterance', '').strip()
            if not utterance:
                continue
            
            # Extract slot values from state.slot_values
            # slot_values is a dict with keys = slot names, values = list of strings
            slot_values = state.get('slot_values', {})
            # Convert slot values from list to single string (if exists)
            slots = {k: v[0] if isinstance(v, list) and len(v) > 0 else '' for k, v in slot_values.items()}
            
            # Store example
            examples.append({
                "text": utterance,
                "intent": intent,
                "slots": slots
            })

print(f"Extracted {len(examples)} examples from Services & Events domains.")

# Optional: view first 3 examples
for example in examples[:3]:
    print(json.dumps(example, indent=2))

Extracted 896 examples from Services & Events domains.
{
  "text": "I'm looking for events in NY, and heard the Yankees vs orioles is fun.",
  "intent": "GetEventDates",
  "slots": {
    "city": "NY",
    "event_name": "Yankees vs orioles"
  }
}
{
  "text": "Great.",
  "intent": "GetEventDates",
  "slots": {
    "city": "NY",
    "date": "today",
    "event_name": "Yankees vs Orioles"
  }
}
{
  "text": "Yes, one ticket please.",
  "intent": "BuyEventTickets",
  "slots": {
    "city": "NY",
    "date": "today",
    "event_name": "Yankees vs Orioles",
    "number_of_tickets": "1"
  }
}


In [37]:
# Load the schema file
with open('../../data/train/schema.json', 'r', encoding='utf-8') as f:
    schema = json.load(f)

# Set to store unique intent names
unique_intents = set()

# Process and print intents per service
for service in schema:
    service_name = service['service_name']
    intents = service.get('intents', [])
    
    print(f'\n=== {service_name} ({len(intents)} intents) ===')
    for intent in intents:
        intent_name = intent['name']
        unique_intents.add(intent_name)
        print(f"- {intent_name}")

# Print total number of unique intents
print(f'\nTotal unique intents across all services: {len(unique_intents)}')


=== Banks_1 (2 intents) ===
- CheckBalance
- TransferMoney

=== Buses_1 (2 intents) ===
- FindBus
- BuyBusTicket

=== Buses_2 (2 intents) ===
- FindBus
- BuyBusTicket

=== Calendar_1 (3 intents) ===
- GetEvents
- GetAvailableTime
- AddEvent

=== Events_1 (2 intents) ===
- FindEvents
- BuyEventTickets

=== Events_2 (3 intents) ===
- FindEvents
- GetEventDates
- BuyEventTickets

=== Flights_1 (4 intents) ===
- SearchOnewayFlight
- SearchRoundtripFlights
- ReserveOnewayFlight
- ReserveRoundtripFlights

=== Flights_2 (2 intents) ===
- SearchOnewayFlight
- SearchRoundtripFlights

=== Homes_1 (2 intents) ===
- FindApartment
- ScheduleVisit

=== Hotels_1 (2 intents) ===
- ReserveHotel
- SearchHotel

=== Hotels_2 (2 intents) ===
- BookHouse
- SearchHouse

=== Hotels_3 (2 intents) ===
- ReserveHotel
- SearchHotel

=== Media_1 (2 intents) ===
- FindMovies
- PlayMovie

=== Movies_1 (3 intents) ===
- BuyMovieTickets
- FindMovies
- GetTimesForMovie

=== Music_1 (2 intents) ===
- LookupSong
- PlayS

In [38]:
# Define the target domains/services
target_domains = {'Services_1', 'Services_2', 'Services_3', 'Events_1', 'Events_2'}

# Directory path where your DSTC8 training data is stored
data_dir = '../../data/train'
output_path = '../../data/filtered_dialogues.json'

# Function to check if a dialogue is in the target domains
def dialogue_in_target_domains(dialogue, target_domains):
    return any(service in target_domains for service in dialogue['services'])

# Get all files matching pattern dialogues_*.json
dialogue_files = [f for f in os.listdir(data_dir) if f.startswith('dialogues_') and f.endswith('.json')]

filtered_dialogues = []

# Process each dialogue file
for file in dialogue_files:
    file_path = os.path.join(data_dir, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        dialogues = json.load(f)
        for dialogue in dialogues:
            if dialogue_in_target_domains(dialogue, target_domains):
                filtered_dialogues.append(dialogue)

# Save the filtered dialogues to a new JSON file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(filtered_dialogues, f, indent=2)

print(f"✅ Filtered {len(filtered_dialogues)} dialogues saved to: {output_path}")

✅ Filtered 5398 dialogues saved to: ../../data/filtered_dialogues.json


In [39]:
with open('../../data/filtered_dialogues.json', 'r', encoding='utf-8') as f:
    filtered_dialogues = json.load(f)

target_domains = {'Services_1', 'Services_2', 'Services_3', 'Events_1', 'Events_2'}
examples = []

for dialogue in filtered_dialogues:
    for turn in dialogue['turns']:
        if turn['speaker'] == 'USER' and turn['frames']:
            for frame in turn['frames']:
                # Keep only the turns for your selected services/domains
                if frame.get('service') in target_domains:
                    intent = frame.get('state', {}).get('active_intent')
                    utterance = turn.get('utterance')
                    if intent and utterance:
                        examples.append((utterance, intent))



print(f"Total extracted examples: {len(examples)}")

print(examples[:5])  # Print first 5 examples for verification

Total extracted examples: 39081
[("Ok, It's fine. I would like to do something interesting", 'FindEvents'), ("I'd like a Concert", 'FindEvents'), ('Where is the place?', 'FindEvents'), ('What kind of concert is it? What time does it start?', 'FindEvents'), ('Ok, thanks for these information.', 'FindEvents')]


In [40]:
for i, example in enumerate(examples[:10]):
    print(f"Example {i}: type={type(example)}, value={example}")

Example 0: type=<class 'tuple'>, value=("Ok, It's fine. I would like to do something interesting", 'FindEvents')
Example 1: type=<class 'tuple'>, value=("I'd like a Concert", 'FindEvents')
Example 2: type=<class 'tuple'>, value=('Where is the place?', 'FindEvents')
Example 3: type=<class 'tuple'>, value=('What kind of concert is it? What time does it start?', 'FindEvents')
Example 4: type=<class 'tuple'>, value=('Ok, thanks for these information.', 'FindEvents')
Example 5: type=<class 'tuple'>, value=('Not now, thanks. I need a house to stay there, a house with rating 4.1 or higher, for 1 people, with laundry service.', 'NONE')
Example 6: type=<class 'tuple'>, value=("That sounds great. I'd also like to catch a pop event while I'm there, preferably on the 2nd.", 'FindEvents')
Example 7: type=<class 'tuple'>, value=('A concert would be music to my ears.', 'FindEvents')
Example 8: type=<class 'tuple'>, value=('Can you find me something with an international flavor?', 'FindEvents')
Exampl

In [41]:
cleaned_examples = [ex for ex in examples if isinstance(ex, (list, tuple)) and len(ex) == 2]
print(f"Cleaned examples count: {len(cleaned_examples)}")

Cleaned examples count: 39081


In [42]:
df = pd.DataFrame(cleaned_examples, columns=['Utterance', 'Intent'])
df.head()

Unnamed: 0,Utterance,Intent
0,"Ok, It's fine. I would like to do something in...",FindEvents
1,I'd like a Concert,FindEvents
2,Where is the place?,FindEvents
3,What kind of concert is it? What time does it ...,FindEvents
4,"Ok, thanks for these information.",FindEvents


In [43]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")


Train: 31264, Validation: 3908, Test: 3909


In [44]:
intents = sorted(df['Intent'].unique())
intent2id = {intent: i for i, intent in enumerate(intents)}
id2intent = {i: intent for intent, i in intent2id.items()}

train_df['IntentID'] = train_df['Intent'].map(intent2id)
val_df['IntentID'] = val_df['Intent'].map(intent2id)
test_df['IntentID'] = test_df['Intent'].map(intent2id)

In [45]:
test_df.head()

Unnamed: 0,Utterance,Intent,IntentID
7725,That sounds great for me.,FindEvents,2
23427,"Hi, i need a help, i am looking or a doctor in...",FindProvider,3
26153,What's the address?,FindEvents,2
21299,"Great, when am I free on that day?",FindProvider,3
10031,Are there any other events?,FindEvents,2


In [46]:
test_df['Intent'].unique()

array(['FindEvents', 'FindProvider', 'BookAppointment', 'BuyEventTickets',
       'NONE', 'GetEventDates'], dtype=object)

In [47]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_batch(batch):
    return tokenizer(
        batch['Utterance'].tolist(), 
        padding='max_length', 
        truncation=True,
        max_length=32  # adjust as needed
    )

train_encodings = tokenize_batch(train_df)
val_encodings = tokenize_batch(val_df)
test_encodings = tokenize_batch(test_df)

In [48]:
from datasets import Dataset

def build_hf_dataset(encodings, labels):
    # Build dictionary for HuggingFace Dataset
    encodings.update({'labels': labels})
    return Dataset.from_dict(encodings)

# Convert your intent labels (e.g., train_df['Label']) to list or numpy array
train_dataset = build_hf_dataset(train_encodings, train_df['Intent'].tolist())
val_dataset = build_hf_dataset(val_encodings, val_df['Intent'].tolist())
test_dataset = build_hf_dataset(test_encodings, test_df['Intent'].tolist())


In [50]:
intent2id = {intent: idx for idx, intent in enumerate(sorted(df['Intent'].unique()))}

In [54]:
# Assuming your original label column still contains strings in train_df/val_df
train_labels = train_df['Intent'].map(intent2id).tolist()
val_labels = val_df['Intent'].map(intent2id).tolist()
test_labels = test_df['Intent'].map(intent2id).tolist()


In [55]:
train_inputs = {
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels  # <-- Now integer IDs!
}
val_inputs = {
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
}

from datasets import Dataset
train_dataset = Dataset.from_dict(train_inputs)
val_dataset = Dataset.from_dict(val_inputs)


In [59]:
print(train_dataset[0])

{'input_ids': [101, 3531, 2191, 2033, 2019, 6098, 2000, 1996, 24385, 2006, 2008, 2154, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': 0}


In [60]:
from transformers import Trainer, TrainingArguments

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(intent2id)  # intent2id maps label strings to integers
)

args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5054,0.47135
2,0.4299,0.45027
3,0.3742,0.455806




TrainOutput(global_step=5862, training_loss=0.46011358786263756, metrics={'train_runtime': 694.5545, 'train_samples_per_second': 135.039, 'train_steps_per_second': 8.44, 'total_flos': 1542412399325184.0, 'train_loss': 0.46011358786263756, 'epoch': 3.0})

In [61]:
# from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# model = AutoModelForSequenceClassification.from_pretrained(
#     'bert-base-uncased', 
#     num_labels=len(intent2id)
# )
# args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy='epoch',
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     learning_rate=2e-5,
# )

# # You would need a Dataset class or Hugging Face Datasets integration for this
# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=train_df,
#     eval_dataset=val_df,
# )

# trainer.train()


In [52]:
print(train_dataset[0])

{'input_ids': [101, 3531, 2191, 2033, 2019, 6098, 2000, 1996, 24385, 2006, 2008, 2154, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': 'BookAppointment'}


In [62]:
results = trainer.evaluate(test_dataset)
print(results)

{'eval_runtime': 6.625, 'eval_samples_per_second': 590.037, 'eval_steps_per_second': 36.981, 'epoch': 3.0}


In [68]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted')
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


In [69]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,        # your TrainingArguments
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # or test_dataset
    compute_metrics=compute_metrics
)

In [71]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.455806165933609, 'eval_model_preparation_time': 0.0017, 'eval_accuracy': 0.8142272262026612, 'eval_precision': 0.8135234245118433, 'eval_recall': 0.8142272262026612, 'eval_f1': 0.8129443898121915, 'eval_runtime': 7.1101, 'eval_samples_per_second': 549.64, 'eval_steps_per_second': 34.458}


In [72]:
predictions = trainer.predict(val_dataset)
y_pred = predictions.predictions.argmax(axis=-1)
y_true = predictions.label_ids

