# Tokenization

In [48]:
# General
import numpy as np
import pandas as pd




from nltk.tokenize import regexp_tokenize


'''  

We also introduce a “divide, flatten, and conquer” approach that uses a pipeline of two models run
in sequence: an intent segmentation (IS) model to determine the intent spans present in the input
utterance, and a conventional named entity recognition (NER) model to assign flattened entity labels
to the tokens of each intent span identified by the first model. These models are trained separately on
two different sequence labeling tasks. For example in two large pizzas with ham and one diet coke,
the labeling for the intent segmentation model would be:
B-PIZZAORDER I-PIZZAORDER I-PIZZAORDER
I-PIZZAORDER I-PIZZAORDER Other
B-DRINKORDER I-DRINKORDER I-DRINKORDER.
The example carries two different intent spans, one for a pizza order and one for a drink order. As a
result, the NER model will have two different inputs: two large pizzas with ham and one diet coke.
The NER labels for each will be: B-NUMBER B-SIZE Other Other B-TOPPING
and B-NUMBER B-DRINKTYPE I-DRINKTYPE.
To handle cases where hierarchy is needed, we compress more information into flattened labels, e.g.,
a negated pizza topping will be labeled as NEG_TOPPING. Because these models need to have a 1:1
mapping between the input tokens and output labels, they can’t be trained on the EXR notation or the
TOP-Decoupled notation. Both models use a BERT-based encoder Devlin et al. (2019).

PIZZAORDER 
    (NUMBER two) 
    (SIZE medium) 
    pizzas with 
    (TOPPING sausage) 
    and 
    (TOPPING black olives)

and

PIZZAORDER 
    (NUMBER two) 
    (SIZE medium) 
    pizzas with 
    (TOPPING pepperoni) 
    and 
    (COMPLEX_TOPPING 
        (QUANTITY extra) 
        (TOPPING cheese)
    )

and

PIZZAORDER 
    (NUMBER three) 
    (SIZE large) 
    pizzas with 
    (TOPPING pepperoni) 
    and 
    (TOPPING sausage)

'''
def extract_labels_IS(top: str, entities):
    # Extract words and parenthesis
    token_pattern = r"\b\w+(?:'\w+)?(?:\s*-\s*\w+)*\b|[()]"
    # token_pattern=r"(?u)\b\w+(?:'\w+)?(?:\s*-\s*\w+)*\b"
    tokens = regexp_tokenize(top, token_pattern)
   
    labels = []
    count = 0
  
    is_beginning = True
    order_type = "PIZZAORDER"
    for i, token in enumerate(tokens):
       
        if token in entities and token not in ["PIZZAORDER", "DRINKORDER"]:
            continue 
      
        elif token == "(":
            count += 1
        elif token == ")":
            count -= 1
        elif token == "PIZZAORDER":
            order_type = "PIZZAORDER"
        elif token == "DRINKORDER":
            order_type = "DRINKORDER"
        
        elif count == 0:
            print(token)
            labels.append("O")
            is_beginning = True
        else:
            if is_beginning == True:
                labels.append("B-" + order_type)
                is_beginning = False
                continue
            if is_beginning == False:
                labels.append("I-" + order_type)
    # labels = label_encoder.transform(labels)
    return labels
def extract_labels_ner(top: str, entities):
    # Extract words and parenthesis
    token_pattern=r"(?u)\b\w+(?:'\w+)?(?:\s*-\s*\w+)*\b"
    tokens = regexp_tokenize(top, token_pattern)
    print(tokens)
    labels = []
    count = 0
    not_str =""
    complex_topping_begin = False 
    is_beginning = True
    order_type = ""
    for token in tokens:
        if token in ["PIZZAORDER", "DRINKORDER" ,"ORDER"]:
            count -= 1
            continue
        elif token == "(":
            count += 1
        elif token == ")":
            count -= 1
            if count == 0:
                is_beginning = True
                complex_topping_begin = False
                not_str = ""
                order_type = ""
            if count < 0:
                count = 0
        elif token == "COMPLEX_TOPPING":
            order_type = "COMPLEX_TOPPING"
            complex_topping_begin = True
        elif token == "NOT":
            not_str = "NOT_"
        elif token in entities:
            order_type = token
        elif count == 0:
            labels.append("O")
           
        else:
            if complex_topping_begin:
                if is_beginning:
                    labels.append("B-COMPLEX_" + order_type)
                    is_beginning = False
                    continue
                else:
                    labels.append("I-COMPLEX_" + order_type)
                    continue
            if is_beginning:
                labels.append("B-" + not_str + order_type)
                is_beginning = False
                continue
            else:
                labels.append("I-" + not_str + order_type)
    return labels



In [49]:
entities = ['TOPPING',
 'DRINKORDER',
 'COMPLEX_TOPPING',
 'CONTAINERTYPE',
 'NOT',
 'PIZZAORDER',
 'NUMBER',
 'DRINKTYPE',
 'VOLUME',
 'QUANTITY',
 'SIZE',
 'STYLE',
 'ORDER',
 ] 
# top=" i want to order (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING sausage ) and (TOPPING black olives ) ) and (PIZZAORDER (NUMBER two ) (SIZE medium ) pizzas with (TOPPING pepperoni ) and (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING cheese ) ) ) and (PIZZAORDER (NUMBER three ) (SIZE large ) pizzas with (TOPPING pepperoni ) and (TOPPING sausage ) )	"
# top ="i'd like (PIZZAORDER (NUMBER three ) pizzas no (NOT (TOPPING american cheese ) ) ) and (DRINKORDER (NUMBER a ) (DRINKTYPE sprite ) ) and (DRINKORDER (NUMBER five ) (DRINKTYPE fantas ) ) and (DRINKORDER (NUMBER one ) (DRINKTYPE perrier jojo ) ) "
top = "(PIZZAORDER (SIZE party - size ) (TOPPING dried peppers ) pizza ) and (DRINKORDER (NUMBER a ) (DRINKTYPE sprite   ))"

# top = "i need to order (PIZZAORDER (NUMBER one ) (SIZE large ) (STYLE vegetarian ) pizza with (COMPLEX_TOPPING (QUANTITY extra ) (TOPPING banana peppers ) ) )	"
ner = extract_labels_ner(top,entities)
IS = extract_labels_IS (top,entities)
print(ner)
print(IS)
print(len(ner))
print(len(IS))  

['PIZZAORDER', 'SIZE', 'party - size', 'TOPPING', 'dried', 'peppers', 'pizza', 'and', 'DRINKORDER', 'NUMBER', 'a', 'DRINKTYPE', 'sprite']
['PIZZAORDER', 'SIZE', 'party - size', 'TOPPING', 'dried', 'peppers', 'pizza', 'and', 'DRINKORDER', 'NUMBER', 'a', 'DRINKTYPE', 'sprite']
party - size
dried
peppers
pizza
and
a
sprite
['B-SIZE', 'I-TOPPING', 'I-TOPPING', 'I-TOPPING', 'I-TOPPING', 'I-NUMBER', 'I-DRINKTYPE']
['O', 'O', 'O', 'O', 'O', 'O', 'O']
7
7
