In [18]:
import json
import re


# Matches one inline entity annotation: a bracketed surface text immediately
# followed by exactly one of
#   (entity) or (entity:value)  -> named groups "entity" and (optionally) "value"
#   {...}                       -> named group "entity_dict" (text between the braces)
#   [...]                       -> named group "list_entity_dicts" (text between the brackets)
# The surface text itself is always captured in the "entity_text" group.
ENTITY_REGEX = re.compile(
    r"\[(?P<entity_text>[^\]]+?)\](\((?P<entity>[^:)]+?)(?:\:(?P<value>[^)]+))?\)|\{(?P<entity_dict>[^}]+?)\}|\[(?P<list_entity_dicts>.*?)\])"  # noqa: E501, W505
)


def extract_entities(text):
    """Return the entities annotated inline in ``text``.

    Every annotation form matched by ``ENTITY_REGEX`` is handled:

    * ``[text](entity)`` and ``[text](entity:value)``
    * ``[text]{...}`` where the braces hold the members of a JSON object
      that has an ``entity`` key
    * ``[text][{...}, {...}]`` where the brackets hold JSON objects; one
      entity is emitted per object, all sharing the same span

    Args:
        text: Annotated example string.

    Returns:
        A list of dicts with keys ``start_span``, ``end_span``, ``value`` and
        ``entity``. The spans are character offsets into the text obtained by
        replacing every annotation with its surface text (the string that
        ``convert_plain_text`` returns). ``value`` is always the surface
        text; a ``:value`` suffix or a ``value`` key in the JSON payload is
        not used.

    Raises:
        json.JSONDecodeError: If a ``{...}`` or ``[...]`` payload is not valid JSON.
        KeyError: If a JSON object has no ``entity`` key.
    """
    entities = []
    offset = 0  # number of markup characters removed before the current match

    for match in ENTITY_REGEX.finditer(text):
        groups = match.groupdict()
        entity_text = groups["entity_text"]
        dict_payload = groups["entity_dict"]
        list_payload = groups["list_entity_dicts"]

        # Exactly one of the three annotation forms took part in the match.
        if dict_payload is not None:
            annotations = [json.loads(f"{{{dict_payload}}}")]
        elif list_payload is not None:
            annotations = json.loads(f"[{list_payload}]")
        else:
            annotations = [{"entity": groups["entity"]}]

        start_index = match.start() - offset
        end_index = start_index + len(entity_text)
        # Only the surface text survives in the plain text, so everything else
        # in the match shifts the positions of later entities to the left.
        offset += len(match.group(0)) - len(entity_text)

        for annotation in annotations:
            entities.append({
                "start_span": start_index,
                "end_span": end_index,
                "value": entity_text,
                "entity": annotation["entity"]
            })
    return entities

def convert_plain_text(text):
    """Return ``text`` with every entity annotation replaced by its surface text.

    Each match of ``ENTITY_REGEX`` is substituted with the content of its
    ``entity_text`` group; everything outside the matches is left untouched.
    """
    def _surface_text(match):
        # The bracketed surface text is the only part of the annotation kept.
        return match.group("entity_text")

    return ENTITY_REGEX.sub(_surface_text, text)

In [19]:
import json
def save_json(data_dict, name="data_nlu"):
    """Write ``data_dict`` as JSON to the file ``<name>.json``.

    The file is opened in text mode with UTF-8 encoding and the JSON is
    indented by four spaces. ``json.dump`` is called with its default
    settings otherwise, so non-ASCII characters are written in escaped form.

    Args:
        data_dict: JSON-serializable object to store.
        name: Output path without the ``.json`` extension.
    """
    output_path = f"{name}.json"
    with open(output_path, mode="w", encoding="utf-8") as handle:
        json.dump(data_dict, handle, indent=4)

def load_json(path="./data_nlu.json"):
    """Read the UTF-8 encoded JSON file at ``path`` and return its decoded content.

    Args:
        path: Location of the JSON file; it is formatted into a string before
            being opened.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(f"{path}", mode="r", encoding="utf-8") as handle:
        return json.load(handle)



In [20]:
from ruamel import yaml as yaml
# Path of the NLU YAML file that is read below and converted to JSON.
filename = './data/nlu_cursos_version.yml'

def read_yaml_file(filename):
    """Read an NLU YAML file and return the examples of every intent.

    The document must have a top-level ``nlu`` list. Entries of that list
    that carry an ``intent`` key contribute one result item; entries without
    it (for example synonym, regex or lookup definitions) are skipped.

    The ``examples`` value of an intent is treated as a multi-line string with
    one example per line. Surrounding whitespace and a single leading ``-``
    list marker are removed from each line, and lines that are empty after
    that are dropped.

    Args:
        filename: Path of the YAML file (read as UTF-8).

    Returns:
        A list of ``{'intent': <name>, 'examples': [<str>, ...]}`` dicts, in
        file order.

    Raises:
        KeyError: If the document has no ``nlu`` key.
    """
    with open(filename, encoding="utf-8") as f:
        content = f.read()

    yaml_parser = yaml.YAML(typ='safe')
    yaml_parser.version = (1, 2)
    yaml_parser.preserve_quotes = True
    data_nlu = yaml_parser.load(content)

    data_intents = []
    for examples_nlu in data_nlu['nlu']:
        # Only intent entries hold training examples for classification.
        if 'intent' not in examples_nlu:
            continue
        intent = examples_nlu['intent']
        examples = examples_nlu.get('examples') or ""

        examples_intent = []
        for line in examples.splitlines():
            example = line.strip()
            # Remove the list marker only when it is really there, so a line
            # without it does not lose its first character.
            if example.startswith("-"):
                example = example[1:].strip()
            if example:
                examples_intent.append(example)

        data_intents.append({'intent': intent, 'examples': examples_intent})
    return data_intents

# Intents with fewer examples than this are left out of the dataset.
MIN_EXAMPLES_PER_INTENT = 10

intent_examples = read_yaml_file(filename)

# One record per example: plain text, intent label and entity spans.
data = []

for intent in intent_examples:
    intent_name = intent['intent']
    examples = intent['examples']
    if len(examples) < MIN_EXAMPLES_PER_INTENT:
        continue
    for example in examples:
        # Both views are derived from the same annotated string, so the spans
        # in `entities` index into `text`.
        entities = extract_entities(example)
        text = convert_plain_text(example)
        data.append({
            "text": text,
            "intent_name": intent_name,
            "entities": entities
        })

save_json(data, name="data/data_nlu")

In [21]:
from sklearn.model_selection import train_test_split

# The intent of every record, used as the stratification target so that the
# train and test sets keep the intent proportions of the full dataset.
intents_labels = [record['intent_name'] for record in data]

# Seeded, shuffled split that holds out 20% of the records for testing.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    stratify=intents_labels,
    random_state=42,
)

print("train size:", len(train_data))
print("test size:", len(test_data))

save_json(train_data, name="data/train_nlu")
save_json(test_data, name="data/test_nlu")

train size: 1378
test size: 345
