# Data Preprocessing for Slot Filling (NER)

In [1]:
import pandas as pd
import ast
import json

import os

# Move the working directory up two levels before the project import below.
# NOTE(review): presumably this lands on the repository root so that the
# `slot_extraction` package is importable — confirm. The path is relative, so
# re-running this cell in the same kernel moves up another two levels.
os.chdir("../../")


from slot_extraction.utilities.slots import filter_dataset, construct_slot_extraction_data
from datasets import DatasetDict, load_dataset

# Load MultiWOZ 2.2 through `datasets.load_dataset`.
dataset = load_dataset("multi_woz_v22")

# For each split, in order (train, validation, test): filter it, then build the
# slot-extraction table from the filtered split. The pipeline below is lazy, so
# every split is filtered and converted before the next one is touched.
(
    (filtered_train, slot_data_train),
    (filtered_validation, slot_data_validation),
    (filtered_test, slot_data_test),
) = (
    (kept, construct_slot_extraction_data(kept))
    for kept in (
        filter_dataset(dataset[split]) for split in ('train', 'validation', 'test')
    )
)

  from .autonotebook import tqdm as notebook_tqdm
2023-12-18 19:57:59.637843: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 19:58:00.581117: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 19:58:00.586486: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
def label_words_csv(row):
    """Tag every whitespace-separated word of an utterance with a BIO slot label.

    Args:
        row: Mapping-like record (a DataFrame row or a plain dict) providing
            ``row['utterance']`` (the text) and ``row['values']``, a dict of
            ``entity -> (value, char_span)``. ``char_span`` is a
            ``(start, end)`` pair of character offsets into the utterance,
            treated as half-open; entries whose span is ``None`` or contains
            ``None`` are skipped.

    Returns:
        dict: ``{'text': <words joined by single spaces>,
        'labels': <str() of the list of tags>}`` with one tag per word:
        ``'O'``, ``'B-<entity>'`` or ``'I-<entity>'``.
    """
    entities = row['values']
    text = row['utterance']
    words = text.split()

    # Record where each word sits in the original string. Counting ' '
    # characters (as was done before) drifts away from `text.split()` as soon
    # as the utterance has leading, repeated or non-space whitespace.
    word_spans = []
    cursor = 0
    for word in words:
        begin = text.index(word, cursor)
        cursor = begin + len(word)
        word_spans.append((begin, cursor))

    # Initialize labels for each word as 'O'
    word_labels = ['O'] * len(words)

    for entity, (value, char_span) in entities.items():
        if char_span is None or None in char_span:
            continue

        span_start, span_end = char_span[0], char_span[1]

        # The first word overlapping the span opens the entity (B-), every
        # further overlapping word continues it (I-). Words that merely touch
        # the span boundary (e.g. after a trailing space) are not tagged.
        prefix = 'B-'
        for i, (begin, end) in enumerate(word_spans):
            if begin >= span_end:
                break
            if end > span_start:
                word_labels[i] = prefix + entity
                prefix = 'I-'

    # The label list is stored as its string representation.
    return {'text': ' '.join(words), 'labels': str(word_labels)}

In [10]:
from transformers import AutoTokenizer
import ast

# Initialize the tokenizer (use the same model as your training script).
# `label_tokens_json` below reads this module-level name as a global and calls
# it with `return_offsets_mapping=True`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def label_tokens_json(row):
    """Tag an utterance with word-level BIO slot labels via the BERT tokenizer.

    The utterance is tokenized with the module-level ``tokenizer`` (without
    special tokens), every token overlapping an annotated character span is
    tagged, and WordPiece continuation pieces (``##...``) are then glued back
    onto the word they continue.

    Args:
        row: Mapping-like record (a DataFrame row or a plain dict) providing
            ``row['utterance']`` (the text) and ``row['values']``, a dict of
            ``entity -> (value, char_span)``. ``char_span`` is a
            ``(start, end)`` pair of character offsets, treated as half-open;
            entries whose span is ``None`` or contains ``None`` are skipped.

    Returns:
        dict: ``{'text': [word, ...], 'ner_tags': [tag, ...]}`` with one tag
        (``'O'``, ``'B-<entity>'`` or ``'I-<entity>'``) per merged word.
    """
    text = row['utterance']
    entities = row['values']

    # Tokenize the text, keeping each token's character offsets
    encoding = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
    tokens = encoding.tokens()
    offset_mapping = encoding["offset_mapping"]

    # Initialize labels for each token as 'O'
    token_labels = ['O'] * len(tokens)

    for entity, (value, char_span) in entities.items():
        if char_span is None or None in char_span:
            continue

        entity_start, entity_end = char_span

        # The first token overlapping the span opens the entity. Requiring the
        # token to start exactly at `entity_start` would leave the entity
        # without any B- tag whenever the span begins on whitespace or in the
        # middle of a token.
        prefix = 'B-'
        for idx, (start, end) in enumerate(offset_mapping):
            if start >= entity_end:
                break
            if end > entity_start:
                token_labels[idx] = prefix + entity
                prefix = 'I-'

    # Merge subwords and their labels into words
    merged_tokens = []
    merged_labels = []

    for token, label in zip(tokens, token_labels):
        if token.startswith("##") and merged_tokens:
            # Continuation piece: drop the '##' marker and extend the current word.
            merged_tokens[-1] += token[2:]
            # A word keeps the tag of its first piece; if that piece was
            # untagged, adopt the first tag found on a later piece so an entity
            # starting mid-word is not dropped.
            if merged_labels[-1] == 'O':
                merged_labels[-1] = label
        else:
            merged_tokens.append(token)
            merged_labels.append(label)

    return {'text': merged_tokens, 'ner_tags': merged_labels}

In [11]:
# Spot-check the token-level labeling on the first training row.
label_tokens_json(slot_data_train.iloc[0])

{'text': ['i',
  'need',
  'a',
  'place',
  'to',
  'dine',
  'in',
  'the',
  'center',
  'thats',
  'expensive'],
 'ner_tags': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-restaurant-area',
  'O',
  'B-restaurant-pricerange']}

## Training

### A) CSV

In [13]:
# Apply the word-level labeling function to every training row.
# `result_type='expand'` spreads the dict returned by `label_words_csv` into
# one column per key (`text` and `labels`).
labeled_data = slot_data_train.apply(label_words_csv, axis=1, result_type='expand')

In [14]:
labeled_data

Unnamed: 0_level_0,Unnamed: 1_level_0,text,labels
dialogue_id,turn_id,Unnamed: 2_level_1,Unnamed: 3_level_1
PMUL4398.json,0,i need a place to dine in the center thats exp...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-re..."
PMUL4398.json,2,"Any sort of food would be fine, as long as it ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
PMUL4398.json,4,"Sounds good, could I get that phone number? Al...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
PMUL4398.json,6,Yes. Can you book it for me?,"['O', 'O', 'O', 'O', 'O', 'O', 'O']"
PMUL4398.json,8,i want to book it for 2 people and 2 nights st...,"['O', 'O', 'O', 'O', 'O', 'O', 'B-hotel-bookst..."
...,...,...,...
WOZ20469.json,0,"Hello, I am looking for a cheap restaurant tha...","['O', 'O', 'O', 'O', 'O', 'O', 'B-restaurant-p..."
WOZ20469.json,2,"Yes, how about portuguese food?","['O', 'O', 'O', 'B-restaurant-food', 'O']"
WOZ20469.json,4,It doesn't matter.,"['O', 'B-restaurant-area', 'I-restaurant-area']"
WOZ20469.json,6,"I would like the address of Nandos restaurant,...","['O', 'O', 'O', 'O', 'O', 'O', 'B-restaurant-n..."


In [15]:
# Save to CSV. `index=False` leaves the (dialogue_id, turn_id) index out of the
# file, so only the `text` and `labels` columns are written.
labeled_data.to_csv('train_file.csv', index=False)

### B) JSON

In [16]:
# Apply the token-level labeling function to every training row. The result is
# a Series holding one dict per row ({'text': [...], 'ner_tags': [...]}); despite
# the variable name these are not JSON strings yet — serialization happens when
# the file is written.
labeled_data_json_strings = slot_data_train.apply(label_tokens_json, axis=1)

In [17]:
labeled_data_json_strings

dialogue_id    turn_id
PMUL4398.json  0          {'text': ['i', 'need', 'a', 'place', 'to', 'di...
               2          {'text': ['any', 'sort', 'of', 'food', 'would'...
               4          {'text': ['sounds', 'good', ',', 'could', 'i',...
               6          {'text': ['yes', '.', 'can', 'you', 'book', 'i...
               8          {'text': ['i', 'want', 'to', 'book', 'it', 'fo...
                                                ...                        
WOZ20469.json  0          {'text': ['hello', ',', 'i', 'am', 'looking', ...
               2          {'text': ['yes', ',', 'how', 'about', 'portugu...
               4          {'text': ['it', 'doesn', ''', 't', 'matter', '...
               6          {'text': ['i', 'would', 'like', 'the', 'addres...
               8          {'text': ['thank', 'you', ',', 'goodbye', '.']...
Length: 33120, dtype: object

In [18]:
# Dump the training examples as JSON Lines: one serialized dict per line.
with open('train_file.json', mode='w') as f:
    for item in labeled_data_json_strings:
        json_string = json.dumps(item)  # dict -> JSON text
        print(json_string, file=f)  # print appends the trailing newline

## Validation

### A) CSV

In [19]:
# Apply the word-level labeling function to every validation row.
# `result_type='expand'` spreads the dict returned by `label_words_csv` into
# one column per key (`text` and `labels`).
labeled_data = slot_data_validation.apply(label_words_csv, axis=1, result_type='expand')

In [20]:
labeled_data

Unnamed: 0_level_0,Unnamed: 1_level_0,text,labels
dialogue_id,turn_id,Unnamed: 2_level_1,Unnamed: 3_level_1
PMUL0698.json,0,I'm looking for a local place to dine in the c...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
PMUL0698.json,2,"I need the address, postcode and the price range.","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
PMUL0698.json,10,"No, this is all I will need. Thank you.","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
PMUL3233.json,0,My husband and I are celebrating our anniversa...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
PMUL3233.json,2,I would like a 4 star guesthouse that includes...,"['O', 'O', 'O', 'O', 'B-hotel-stars', 'O', 'B-..."
...,...,...,...
PMUL0077.json,2,Just someting in the moderate price range is a...,"['O', 'O', 'O', 'O', 'B-restaurant-pricerange'..."
PMUL0077.json,4,How about modern European food?,"['O', 'O', 'B-restaurant-food', 'I-restaurant-..."
PMUL0077.json,6,Sounds good can you make a booking for 7 peopl...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-re..."
PMUL0077.json,8,I also need to get some information about The ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [21]:
# Save to CSV. `index=False` leaves the (dialogue_id, turn_id) index out of the
# file, so only the `text` and `labels` columns are written.
labeled_data.to_csv('validation_file.csv', index=False)

### B) JSON

In [22]:
# Apply the token-level labeling function to every validation row. The result
# is a Series holding one dict per row ({'text': [...], 'ner_tags': [...]});
# despite the variable name these are not JSON strings yet — serialization
# happens when the file is written.
labeled_data_json_strings = slot_data_validation.apply(label_tokens_json, axis=1)

In [23]:
labeled_data_json_strings

dialogue_id    turn_id
PMUL0698.json  0          {'text': ['i', ''', 'm', 'looking', 'for', 'a'...
               2          {'text': ['i', 'need', 'the', 'address', ',', ...
               10         {'text': ['no', ',', 'this', 'is', 'all', 'i',...
PMUL3233.json  0          {'text': ['my', 'husband', 'and', 'i', 'are', ...
               2          {'text': ['i', 'would', 'like', 'a', '4', 'sta...
                                                ...                        
PMUL0077.json  2          {'text': ['just', 'someting', 'in', 'the', 'mo...
               4          {'text': ['how', 'about', 'modern', 'european'...
               6          {'text': ['sounds', 'good', 'can', 'you', 'mak...
               8          {'text': ['i', 'also', 'need', 'to', 'get', 's...
               10         {'text': ['not', 'at', 'this', 'time', ',', 't...
Length: 3986, dtype: object

In [26]:
# Dump the validation examples as JSON Lines: one serialized dict per line.
with open('validation_file.json', mode='w') as f:
    for item in labeled_data_json_strings:
        json_string = json.dumps(item)  # dict -> JSON text
        print(json_string, file=f)  # print appends the trailing newline

## Testing

### A) CSV

In [27]:
# Apply the word-level labeling function to every test row.
# `result_type='expand'` spreads the dict returned by `label_words_csv` into
# one column per key (`text` and `labels`).
labeled_data = slot_data_test.apply(label_words_csv, axis=1, result_type='expand')

In [28]:
labeled_data

Unnamed: 0_level_0,Unnamed: 1_level_0,text,labels
dialogue_id,turn_id,Unnamed: 2_level_1,Unnamed: 3_level_1
PMUL4462.json,0,"Hello, I am looking for a restaurant in Cambri...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
PMUL4462.json,2,Can you book me a table for 11:00 on Friday?,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-restaur..."
PMUL4462.json,4,"Actually, for 4, please.","['O', 'O', 'B-restaurant-bookpeople', 'O']"
PMUL4462.json,18,"Thanks for the service, good day.","['O', 'O', 'O', 'O', 'O', 'O']"
PMUL0320.json,0,"Hi, I'm looking for a hotel to stay in that in...","['O', 'O', 'O', 'O', 'O', 'B-hotel-type', 'O',..."
...,...,...,...
MUL0641.json,8,"Ok, is there one that you would recommend?","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
MUL0641.json,12,Where is the hotel located?,"['O', 'O', 'O', 'O', 'O']"
MUL0641.json,14,What area of town is the hotel in?,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
MUL0641.json,16,Does it have free parking?,"['O', 'O', 'O', 'O', 'O']"


In [29]:
# Save to CSV. `index=False` leaves the (dialogue_id, turn_id) index out of the
# file, so only the `text` and `labels` columns are written.
labeled_data.to_csv('test_file.csv', index=False)

### B) JSON

In [30]:
# Apply the token-level labeling function to every test row. The result is a
# Series holding one dict per row ({'text': [...], 'ner_tags': [...]}); despite
# the variable name these are not JSON strings yet — serialization happens when
# the file is written.
labeled_data_json_strings = slot_data_test.apply(label_tokens_json, axis=1)

In [31]:
labeled_data_json_strings

dialogue_id    turn_id
PMUL4462.json  0          {'text': ['hello', ',', 'i', 'am', 'looking', ...
               2          {'text': ['can', 'you', 'book', 'me', 'a', 'ta...
               4          {'text': ['actually', ',', 'for', '4', ',', 'p...
               18         {'text': ['thanks', 'for', 'the', 'service', '...
PMUL0320.json  0          {'text': ['hi', ',', 'i', ''', 'm', 'looking',...
                                                ...                        
MUL0641.json   8          {'text': ['ok', ',', 'is', 'there', 'one', 'th...
               12         {'text': ['where', 'is', 'the', 'hotel', 'loca...
               14         {'text': ['what', 'area', 'of', 'town', 'is', ...
               16         {'text': ['does', 'it', 'have', 'free', 'parki...
               22         {'text': ['no', ',', 'that', 'would', 'be', 'a...
Length: 3940, dtype: object

In [32]:
# Dump the test examples as JSON Lines: one serialized dict per line.
with open('test_file.json', mode='w') as f:
    for item in labeled_data_json_strings:
        json_string = json.dumps(item)  # dict -> JSON text
        print(json_string, file=f)  # print appends the trailing newline