# Dataset Preparation

In [14]:
import json
import csv
from types import SimpleNamespace
import os

Load original dataset from LUIS (or Language Studio)

In [2]:
# Path to the project export (LUIS / Language Studio JSON) that the rest of
# the notebook reads from.
datapath = 'data.json'

# Decode explicitly as UTF-8: without `encoding=` Python falls back to the
# platform default (e.g. cp1252 on Windows), which can fail on or mis-decode
# non-ASCII characters in the utterances.
with open(datapath, 'r', encoding='utf-8') as f:
    data = json.load(f)

Load Intent labels dictionary

In [28]:
# Intent category name -> positional index in the export's intent list.
# Stays empty when the export has no 'intents' section.
intent_labels = {}

print('Loading intent labels...')
if 'intents' not in data['assets']:
    print('no intent labels found!')
else:
    intent_labels = dict(
        (intent_['category'], index_)
        for index_, intent_ in enumerate(data['assets']['intents'])
    )

    print(json.dumps(intent_labels, indent=4))

Loading intent labels...
{
    "Accept": 0,
    "AskForEquipmentInstallation": 1,
    "AskForInterest": 2,
    "AskForPrice": 3,
    "AskIfBuy": 4,
    "AskIfHaveEquipment": 5,
    "AskIfSell": 6,
    "Decline": 7,
    "DeclinePrice": 8,
    "DeclineToBuy": 9,
    "DeclineToSell": 10,
    "DoNotHaveEquipment": 11,
    "HaveDeinstalledEquipment": 12,
    "HaveEquipment": 13,
    "HaveInstalledEquipment": 14,
    "None": 15,
    "OfferPrice": 16,
    "WantDeal": 17,
    "WantToBuy": 18,
    "WantToSell": 19
}


Load Entity labels dictionary

In [29]:
# Entity category name -> positional index in the export's entity list.
# Stays empty when the export has no 'entities' section.
entity_labels = {}

print('Loading entity labels...')
if 'entities' in data['assets']:
    entity_labels = {entity_['category']: i for i, entity_ in enumerate(data['assets']['entities'])}

    print(json.dumps(entity_labels, indent=4))
else:
    # Fixed copy-paste slip: this branch reports missing *entity* labels,
    # not intent labels.
    print('no entity labels found!')

Loading entity labels...
{
    "Action.AlreadyBought": 0,
    "Action.AlreadySold": 1,
    "Action.Potential": 2,
    "Action.ToSell": 3,
    "Action.ToBuy": 4,
    "Action.Decline": 5,
    "Action.Accept": 6,
    "Action.Have": 7,
    "Action.Interested": 8,
    "Action.OfferOrPay": 9,
    "Actor.FirstPerson": 10,
    "Actor.SecondPerson": 11,
    "Actor.ThirdPerson": 12,
    "AssetAttribute": 13,
    "Attachment": 14,
    "Condition.AsIs": 15,
    "Condition.Excellent": 16,
    "Condition.Good": 17,
    "Condition.MissingParts": 18,
    "Condition.New": 19,
    "Condition.Refurbished": 20,
    "Condition.VeryGood": 21,
    "Condition.Working": 22,
    "Condition.Poor": 23,
    "Quantity.Any": 24,
    "Quantity.Lot": 25,
    "Quantity.Number": 26,
    "Equipment.Vintage": 27,
    "Equipment.MakeModel": 28,
    "Equipment.Type": 29,
    "Composition.Complete": 30,
    "Composition.SpareParts": 31,
    "State.Installed": 32,
    "State.Deinstalled": 33,
    "State.Demonstrable": 34,
   

In [10]:
# Labelled utterances from the export; later cells read each item through
# its 'text', 'intent' and 'entities' keys.
utterances = data['assets']['utterances']
print(f'Utterances: {len(utterances)}')


Utterances: 6862


## Create Intents dataset

In [23]:
# Output file for the intent-classification dataset: one row per utterance,
# columns `intent`, `utterance`.
intent_dataset_name = 'intents-gen.csv'

# Mode 'w' already truncates an existing file, so no prior os.remove() is
# needed. UTF-8 is set explicitly so non-ASCII utterance text is written the
# same way on every platform; newline='' is what the csv module expects.
with open(intent_dataset_name, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)

    header = ['intent', 'utterance']
    writer.writerow(header)

    # One CSV row per labelled utterance.
    writer.writerows([utt['intent'], utt['text']] for utt in utterances)


## Create Entities dataset

In [30]:
# Entity category name (as used in the export) -> short tag used in the
# generated entities dataset. This is the single source of truth; the
# B-/I- prefixed variants below are derived from it.
label_2_tag = {
    "Action.AlreadyBought": "act_bought",
    "Action.AlreadySold": "act_sold",
    "Action.Potential": "act_pot",
    "Action.ToSell": "act_sell",
    "Action.ToBuy": "act_buy",
    "Action.Decline": "act_dec",
    "Action.Accept": "act_acc",
    "Action.Have": "act_have",
    "Action.Interested": "act_int",
    "Action.OfferOrPay": "act_pay",
    "Actor.FirstPerson": "per_fir",
    "Actor.SecondPerson": "per_sec",
    "Actor.ThirdPerson": "per_thr",
    "AssetAttribute": "aattr",
    "Attachment": "attach",
    "Condition.AsIs": "cond_asis",
    "Condition.Excellent": "cond_exc",
    "Condition.Good": "cond_good",
    "Condition.MissingParts": "cond_miss",
    "Condition.New": "cond_new",
    "Condition.Refurbished": "cond_ref",
    "Condition.VeryGood": "cond_vgood",
    "Condition.Working": "cond_work",
    "Condition.Poor": "cond_poor",
    "Quantity.Any": "qty_any",
    "Quantity.Lot": "qty_lot",
    "Quantity.Number": "qty_num",
    "Equipment.Vintage": "eq_vint",
    "Equipment.MakeModel": "eq_mm",
    "Equipment.Type": "eq_type",
    "Composition.Complete": "comp_compl",
    "Composition.SpareParts": "comp_sp",
    "State.Installed": "state_inst",
    "State.Deinstalled": "state_dinst",
    "State.Demonstrable": "state_demo",
    "State.NonDemonstrable": "state_ndemo",
    "State.Idled": "state_idle",
    "State.Crated": "state_crat",
    "Equipment.Location": "eq_loc",
    "Equipment.WaferSize": "eq_wafs",
    "Facility.Storage": "fac_stor",
    "Facility.Production": "fac_prod",
    "Price.Value": "pr_val",
    "Category.ExWork": "cat_exw",
    "Category.Other": "cat_oth",
    "Category.SellerAsk": "cat_ask",
    "Category.BuyerBid": "cat_bid",
    "Category.MinPrice": "cat_min",
    "Category.MaxPrice": "cat_max",
    "Category.Range": "cat_rang",
    "Price.Currency": "pr_curr",
    "Price.LotSize": "pr_size",
    "QuestionAttribute": "qattr",
    "Request": "req",
    "Timeline": "tline"
}

# Same keys in the same order, with the tag prefixed "B-" (begin).
# Derived instead of hand-copied so the three maps cannot drift apart.
label_2_btag = {label_: f"B-{tag_}" for label_, tag_ in label_2_tag.items()}

# Same keys in the same order, with the tag prefixed "I-" (inside).
label_2_itag = {label_: f"I-{tag_}" for label_, tag_ in label_2_tag.items()}

In [82]:
# Collect (text, {'entities': [(start, end, tag), ...]}) pairs, one per
# utterance. Utterances without any entity are kept, with an empty list.
ent_data = []

for utt_ in utterances:
    text_ = utt_['text']

    # Read offset/length as ints and keep the category alongside them ...
    raw_spans_ = (
        (int(ent_['offset']), int(ent_['length']), ent_['category'])
        for ent_ in utt_['entities']
    )
    # ... then turn each into a (start, end, short tag) character span.
    entities_ = [
        (begin_, begin_ + size_, label_2_tag[category_])
        for begin_, size_, category_ in raw_spans_
    ]

    ent_data.append((text_, {'entities': entities_}))
    
    
# print(ent_data)

In [86]:
# One-time model download (uncomment the model you need, run once):
# !python3 -m spacy download en_core_web_sm
# !python3 -m spacy download en_core_web_lg

import spacy
# Helpers used by the dataset-writing cell: character offsets -> BILUO tags,
# then BILUO -> IOB tags.
from spacy.training import offsets_to_biluo_tags, biluo_to_iob

# Pipeline used to tokenize the utterances; the small model is the
# commented-out alternative.
# nlp = spacy.load('en_core_web_sm')
nlp = spacy.load('en_core_web_lg')

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [100]:
# Output file for the entity-tagging dataset: one row per utterance, columns
# `utterance` (space-joined tokens) and `labels` (space-joined IOB tags).
entities_dataset_name = 'entities-gen.csv'

# Mode 'w' already truncates an existing file, so no prior os.remove() is
# needed. UTF-8 is set explicitly so non-ASCII utterance text (the probe
# sentences in this notebook include '£') is written the same way on every
# platform; newline='' is what the csv module expects.
with open(entities_dataset_name, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)

    header = ['utterance', 'labels']
    writer.writerow(header)

    for text, annot in ent_data:
        doc = nlp(text)

        # Token texts joined by single spaces, so tokens and tags line up
        # one-to-one when both columns are split on whitespace.
        utterance_ = ' '.join([str(t) for t in doc])

        # Character-offset spans -> per-token BILUO tags -> IOB tags.
        # NOTE(review): per the spaCy docs, spans that do not align with
        # token boundaries are tagged '-' rather than raising; confirm
        # whether such rows should be filtered before training.
        tags_ = offsets_to_biluo_tags(doc, annot['entities'])
        tags_ = biluo_to_iob(tags_)

        labels = ' '.join(tags_)

        row = [utterance_, labels]
        writer.writerow(row)



In [101]:
# Tokenization probe: show how the spaCy pipeline splits one sample sentence.
# Other samples tried here:
#   "we may offer 5. 000£."
#   "$1mil-$1.6mil for a tool."
# (A manual BILUO -> IOB rewrite, "U-" -> "B-" and "L-" -> "I-", was also
# tried in this cell before switching to biluo_to_iob.)
test_text = "we have a 5781 s."
doc = nlp(test_text)

# Token texts joined by single spaces, the same form as the `utterance`
# column of the entities dataset.
utterance_ = ' '.join(map(str, doc))

print(utterance_)


$ 1mil-$1.6mil for a tool .


In [102]:
# Compare against BERT sub-word tokenization of the same `test_text`
# (alternative sample: "we may offer 5. 000£.").

from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Encode the sample; the token ids are read from 'input_ids' below.
encoded = tokenizer(test_text)
# print(encoded)

# Map the ids back to token strings to inspect how the text was split.
toks = tokenizer.convert_ids_to_tokens(encoded['input_ids'])
print(toks)


# import torchtext
# from torchtext.data import get_tokenizer
# tokenizer = get_tokenizer("basic_english")
# tokens = tokenizer(test_text)
# tokens


['[CLS]', '$', '1', '##mi', '##l', '-', '$', '1', '.', '6', '##mi', '##l', 'for', 'a', 'tool', '.', '[SEP]']
