# Imports

In [1]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -e ../

Obtaining file:///Users/andccl/Desktop/unique-batches
  Preparing metadata (setup.py) ... [?25ldone
Installing collected packages: unique-batches
  Attempting uninstall: unique-batches
    Found existing installation: unique-batches 2.0.0
    Uninstalling unique-batches-2.0.0:
      Successfully uninstalled unique-batches-2.0.0
  Running setup.py develop for unique-batches
Successfully installed unique-batches-2.0.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install matplotlib seaborn plotly

Collecting matplotlib
  Downloading matplotlib-3.7.1-cp310-cp310-macosx_10_12_x86_64.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting seaborn
  Using cached seaborn-0.12.2-py3-none-any.whl (293 kB)
Collecting plotly
  Downloading plotly-5.14.1-py2.py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.0.7-cp310-cp310-macosx_10_9_x86_64.whl (244 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.39.4-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?2

In [4]:
from pathlib import Path
import re
import pandas as pd

import numpy as np
import math
from scipy.special import binom, comb

import plotly.graph_objects as go
import plotly.express as px
from tqdm.auto import tqdm

# Params

In [5]:
datastore = Path("../datastore/").absolute()

In [6]:
data_path = datastore / 'mit_restaurant' / 'MITrestaurant.tsv'

SQUID_INK = 'rgb(8,48,107)'

# Input

## Reading functions

In [7]:
def extract_utt_text(utt: str) -> str:
    """
    Params:
        utt: string of the shape "word_1|slot_type_1 ... word_n|slot_type_n" at inference time
             "word_1|slot_type_1|lang_1 ... word_n|slot_type_n|lang_n" at training time
    """
    utt_tokens = re.findall(r"([^ ]+)\|[^ ]+", utt)

    return " ".join(utt_tokens)


def extract_ner_labels(utt: str) -> list:
    """
    Params:
        utt: string of the shape "word_1|slot_type_1|lang_1 ... word_n|slot_type_n|lang_n"
    """
    return re.findall(r"[^ ]+\|([^ ]+)", utt)


def read_token_level_annotated_data(data_path: str) -> (pd.DataFrame, pd.DataFrame):
    """
    Reads data with token-level language annotation, e.g. "metti|Other|it musica|MediaType|it"
    and extracts additional columns like utterance_text, ner_labels, language_labels
    """

    columns = get_NLU_tsv_columns(data_path)

    data = pd.read_table(data_path, names=columns)

    data.dropna(inplace=True)

    labels = pd.DataFrame()

    labels["tags"] = (
        data["annotation"].apply(lambda utt: extract_ner_labels(utt)).values
    )

    data["utterance_text"] = (
        data["annotation"].apply(lambda utt: extract_utt_text(utt)).values
    )

    return data, labels


def get_NLU_tsv_columns(file):
    with open(file) as f:
        line = f.readline()
    num_columns = len(line.split("\t"))
    if num_columns == 1:
        return ["annotation"]
    elif num_columns == 3:
        return ("domain", "intent", "annotation")
    elif num_columns == 4:
        return ("domain", "intent", "annotation", "customer_id")
    elif num_columns == 5:
        return ("domain", "intent", "annotation", "customer_id", "utterance_id")
    else:
        raise ValueError("Bad .tsv format")

## Reading data

In [8]:
data, labels = read_token_level_annotated_data(data_path)

print(data)

utterances = data['utterance_text'].tolist()
utterances = [utterance.split(' ') for utterance in utterances]

           domain       intent   
0     dummyDomain  dummyIntent  \
1     dummyDomain  dummyIntent   
2     dummyDomain  dummyIntent   
3     dummyDomain  dummyIntent   
4     dummyDomain  dummyIntent   
...           ...          ...   
9175  dummyDomain  dummyIntent   
9176  dummyDomain  dummyIntent   
9177  dummyDomain  dummyIntent   
9178  dummyDomain  dummyIntent   
9179  dummyDomain  dummyIntent   

                                             annotation   
0     can|O you|O find|O me|O the|O cheapest|B-Price...  \
1     can|O you|O find|O me|O the|O closed|B-Locatio...   
2     can|O you|O find|O me|O the|O closest|B-Locati...   
3     can|O you|O find|O me|O the|O closet|B-Locatio...   
4     can|O you|O find|O me|O the|O coast|B-Restaura...   
...                                                 ...   
9175  will|O waffle|B-RestaurantName house|I-Restaur...   
9176  yes|O please|O get|O me|O mcdonalds|B-Restaura...   
9177  yes|O the|O new|O diner|B-Cuisine on|O south|B...   
9

# Pick distribution

In [9]:
def get_unique_utts_count_sorted(df):
    grouped_by_utt = df.groupby('utterance_text')
    grouped_by_utt_size = grouped_by_utt.size().to_frame('count').reset_index()
    grouped_by_utt_size = grouped_by_utt_size.sort_values('count', ascending=False)
    return grouped_by_utt_size

In [10]:
def inv_exp(utterance):
    return math.exp(-len(utterance))

def inv_lin(utterance):
    return 1/len(utterance)

def inv_lin_squared(utterance):
    return 1/len(utterance)**2

def inverse_characters_fifth(utterance):
    return 1/(sum([len(word) for word in utterance]))**4

def inverse_characters_cubic(utterance):
    return 1/(sum([len(word) for word in utterance]))**3

def inv_lin_cubic(utterance):
    return 1/len(utterance)**3

def inv_chars_exponential(utterance):
    return 1/math.exp(sum([len(word) for word in utterance]))

In [11]:
def plot_utt_distribution(utt_df, measure_name):
    
    unique_utts_count = get_unique_utts_count_sorted(utt_df)
    
    index = np.arange(0, len(utt_df))
    norm_factor = len(utt_df)
    
    y_values = unique_utts_count['count'].to_numpy()
    y_values = y_values/norm_factor

    bar = go.Bar(y=y_values, x=index, )
    fig = go.Figure(data=bar)
    
    fig.update_layout(title=f'Most frequent utterances sampled according {measure_name}')
    
    fig.update_xaxes(title='Utterances ordered by number of occurrences')
    fig.update_yaxes(title='Ratio of the dataset')
    fig.update_traces(marker_color=SQUID_INK, marker_line_color=SQUID_INK,
                      marker_line_width=1.5)
    fig.show()

In [12]:
measure_dict = { 
    'inverse exponential': inv_exp, 
    'inverse cubic linear': inv_lin_cubic,
    'inverse character cubic': inverse_characters_cubic,
    'inverse character fifth': inverse_characters_fifth,
    'inverse exponential characters': inv_chars_exponential,
}

num_samples = int(1e4)

for measure_name, measure in measure_dict.items():
    weights_by_length = np.array([measure(utterance) for utterance in utterances])
    probs_by_length = weights_by_length / sum(weights_by_length)
    
    sampled_indices = np.random.choice(list(range(len(data))), num_samples, p=probs_by_length)
    sampled_df = data.iloc[sampled_indices]
    
    plot_utt_distribution(sampled_df, measure_name)

In [13]:
index = np.arange(1, len(utterances))

# y_values = unique_utts_count['count'].to_numpy()
y_values = np.array([1/(x) for x in index])

bar = go.Bar(y=y_values, x=index, )
fig = go.Figure(data=bar)

fig.update_layout(title=f'Power law')

fig.update_xaxes(title='x')
fig.update_yaxes(title='y')
fig.update_traces(marker_color=SQUID_INK, marker_line_color=SQUID_INK,
                  marker_line_width=1.5)
fig.show()

# Duplicated data generation

# Params

In [14]:
duplication_ratios = [0.50, 0.70, 0.90]

In [15]:
measures_per_ratio = {0.50: inverse_characters_cubic,  0.70: inverse_characters_cubic, 0.90: inverse_characters_fifth}
weights_by_length = {r: np.array([measures_per_ratio[r](utterance) for utterance in utterances]) for r in duplication_ratios}
probs_by_length = {r: weights_by_length[r] / sum(weights_by_length[r]) for r in duplication_ratios}

## Sampling

\begin{align}
    \frac{d}{n + d} &= r \\
    d &= \frac{r}{1-r} n
\end{align}

In [16]:
num_unique_utts = len(utterances)

number_of_duplicates = {r: int((r * num_unique_utts)/(1-r)) for r in duplication_ratios}
print(number_of_duplicates)

{0.5: 9180, 0.7: 21419, 0.9: 82620}


In [17]:
draws = {r: np.random.choice(data['utterance_text'], number_of_duplicates[r],
              p=probs_by_length[r]) for r in duplication_ratios}

duplicated_datasets = {r: None for r in duplication_ratios}

for r in duplication_ratios:
    sampled_df = pd.DataFrame(draws[r], columns=['utterance_text'])
    sample_df_complete = pd.merge(sampled_df, data, on='utterance_text')
    duplicated_datasets[r] = pd.concat([data, sample_df_complete])

# Plot distribution

In [18]:
def plot_sampled_utt_distribution(utt_df, title=''):
    
    unique_utts_count = get_unique_utts_count_sorted(utt_df)
    
    norm_factor = len(utt_df)
    
    y_values = unique_utts_count['count'].to_numpy()/len(utt_df)
    x_values = unique_utts_count['utterance_text']

    bar = go.Bar(y=y_values[:200], x=x_values[:200])
    fig = go.Figure(data=bar)
    
    fig.update_layout(title=title)
    
    fig.update_xaxes(title='Utterances ordered by number of occurrences')
    fig.update_yaxes(title='Ratio of the dataset')
    fig.update_traces(marker_color=SQUID_INK, marker_line_color=SQUID_INK,
                      marker_line_width=1.5)
    fig.show()

In [19]:
for r in duplication_ratios:
    plot_sampled_utt_distribution(duplicated_datasets[r], title=f'Dataset with duplication ratio {r}')

In [20]:
counter = {}
for utterance in utterances:
    len_utt = sum([len(word) for word in utterance])
    counter[len_utt] = counter.get(len_utt, 0) + 1

num_utts_lens = sorted(counter.items(), key=lambda t:t[0])

print(num_utts_lens)

[(3, 3), (4, 1), (5, 4), (6, 6), (7, 14), (8, 10), (9, 7), (10, 9), (11, 19), (12, 15), (13, 30), (14, 40), (15, 52), (16, 60), (17, 73), (18, 66), (19, 95), (20, 102), (21, 119), (22, 136), (23, 178), (24, 191), (25, 198), (26, 220), (27, 239), (28, 254), (29, 240), (30, 265), (31, 246), (32, 282), (33, 244), (34, 286), (35, 266), (36, 254), (37, 300), (38, 274), (39, 248), (40, 226), (41, 248), (42, 222), (43, 206), (44, 217), (45, 186), (46, 168), (47, 202), (48, 174), (49, 178), (50, 177), (51, 138), (52, 115), (53, 144), (54, 112), (55, 106), (56, 95), (57, 100), (58, 84), (59, 79), (60, 74), (61, 85), (62, 68), (63, 68), (64, 62), (65, 50), (66, 55), (67, 38), (68, 35), (69, 37), (70, 34), (71, 29), (72, 32), (73, 20), (74, 23), (75, 25), (76, 20), (77, 21), (78, 19), (79, 10), (80, 13), (81, 10), (82, 8), (83, 13), (84, 5), (85, 9), (86, 5), (87, 5), (88, 5), (89, 8), (90, 6), (91, 5), (92, 4), (93, 6), (94, 3), (95, 4), (96, 3), (97, 5), (98, 3), (99, 3), (100, 3), (101, 1), (1

In [21]:
def utt_num_chars(utt):
    return sum([len(word) for word in utt])

In [22]:
utterances_sorted = sorted(utterances, key=lambda ut:utt_num_chars(ut))
print(utterances_sorted[:10])

[['kfc'], ['yes'], ['pub'], ['thai'], ['pizza'], ['sushi'], ['asian'], ['diner'], ['chilis'], ['donuts']]


In [23]:
weights_by_length = np.array([1/utt_num_chars(utterance)**3 for utterance in utterances_sorted])

print(weights_by_length)

[3.70370370e-02 3.70370370e-02 3.70370370e-02 ... 4.76837158e-07
 4.15609633e-07 3.72353636e-07]


In [24]:
y_values = weights_by_length
x_values = np.arange(0, len(utterances))

bar = go.Bar(y=y_values[:200], x=x_values[:200])
fig = go.Figure(data=bar)


fig.update_xaxes(title='Utterances sorted by length')
fig.update_yaxes(title='Weight')
fig.update_traces(marker_color=SQUID_INK, marker_line_color=SQUID_INK,
                  marker_line_width=1.5)
fig.show()

# Save

In [25]:
datastore = Path("../datastore/").absolute()

In [26]:
for r in duplication_ratios:
    output_path = datastore / "mit_restaurant" / f"duplicated_{r}.tsv"
    df = duplicated_datasets[r][["domain", "intent", "annotation"]]
    df = df.sample(frac=1).reset_index(drop=True)

    df.to_csv(output_path, sep="\t", index=False, header=False)

# Check

In [34]:
mit_05_df, _ = read_token_level_annotated_data(datastore / "mit_restaurant" / "duplicated_0.5.tsv")
mit_05_df

Unnamed: 0,domain,intent,annotation,utterance_text
0,dummyDomain,dummyIntent,does|O bewiched|B-RestaurantName deliver|B-Ame...,does bewiched deliver
1,dummyDomain,dummyIntent,cookies|B-Dish,cookies
2,dummyDomain,dummyIntent,special|B-Amenity dinner|B-Hours,special dinner
3,dummyDomain,dummyIntent,i|O want|O some|O italian|B-Cuisine food|O del...,i want some italian food delivered to my hotel...
4,dummyDomain,dummyIntent,is|O there|O an|O olive|B-RestaurantName garde...,is there an olive garden around here
...,...,...,...,...
18355,dummyDomain,dummyIntent,where|O is|O evs|B-RestaurantName usa|I-Restau...,where is evs usa
18356,dummyDomain,dummyIntent,is|O zabaglione|B-RestaurantName close|B-Location,is zabaglione close
18357,dummyDomain,dummyIntent,i|O am|O looking|O for|O a|O mexican|B-Cuisine...,i am looking for a mexican restuarant that has...
18358,dummyDomain,dummyIntent,is|O there|O a|O cafe|B-Cuisine nearby|B-Location,is there a cafe nearby


In [35]:
mit_07_df, _ = read_token_level_annotated_data(datastore / "mit_restaurant" / "duplicated_0.7.tsv")
mit_07_df

Unnamed: 0,domain,intent,annotation,utterance_text
0,dummyDomain,dummyIntent,where|O is|O ii|B-RestaurantName moro|I-Restau...,where is ii moro
1,dummyDomain,dummyIntent,is|O there|O a|O shao|B-RestaurantName garden|...,is there a shao garden restaurant around with ...
2,dummyDomain,dummyIntent,are|O there|O any|O seafood|B-Cuisine restaura...,are there any seafood restaurants in montauk ny
3,dummyDomain,dummyIntent,late|B-Hours lunch|I-Hours,late lunch
4,dummyDomain,dummyIntent,find|O us|O a|O sushi|B-Cuisine bar|B-Amenity ...,find us a sushi bar near jackson
...,...,...,...,...
30595,dummyDomain,dummyIntent,where|O can|O i|O eat|O sauce|B-Dish before|B-...,where can i eat sauce before 12 am
30596,dummyDomain,dummyIntent,restaurants|O open|O past|B-Hours 10|I-Hours p...,restaurants open past 10 p m
30597,dummyDomain,dummyIntent,get|O me|O to|O the|O nearest|B-Location car|O...,get me to the nearest car repair shop in town
30598,dummyDomain,dummyIntent,is|O there|O a|O korean|B-Cuisine restaurant|O...,is there a korean restaurant within a ten minu...


In [36]:
mit_09_df, _ = read_token_level_annotated_data(datastore / "mit_restaurant" / "duplicated_0.9.tsv")
mit_09_df

Unnamed: 0,domain,intent,annotation,utterance_text
0,dummyDomain,dummyIntent,where|O can|O i|O find|O take|B-Amenity out|I-...,where can i find take out
1,dummyDomain,dummyIntent,where|O can|O i|O eat|O crab|B-Dish within|B-L...,where can i eat crab within 2 miles of here fo...
2,dummyDomain,dummyIntent,call|O mcdonalds|B-RestaurantName,call mcdonalds
3,dummyDomain,dummyIntent,yes|O,yes
4,dummyDomain,dummyIntent,yes|O,yes
...,...,...,...,...
91796,dummyDomain,dummyIntent,yes|O,yes
91797,dummyDomain,dummyIntent,how|O is|O ding|B-RestaurantName ho|I-Restaura...,how is ding ho as a date spot
91798,dummyDomain,dummyIntent,kfc|B-RestaurantName,kfc
91799,dummyDomain,dummyIntent,pub|B-Cuisine,pub


In [37]:
def compute_redundancy(df: pd.DataFrame, col: str = "utterance_text"):
    grouped_by_utt = df.groupby(col)
    grouped_by_utt_size = grouped_by_utt.size().to_frame("count").reset_index()

    num_unique_utterances = len(grouped_by_utt_size)
    num_utterances = len(df)

    redundancy = (num_utterances - num_unique_utterances) / num_utterances

    return redundancy

In [31]:
compute_redundancy(mit_05_df)

0.5000544662309369

In [32]:
compute_redundancy(mit_07_df)

0.7000326797385621

In [33]:
compute_redundancy(mit_09_df)

0.9000119824402785

In [38]:
def number_of_entities(df):
    tags = set()
    for idx, row in df.iterrows():
        annotation = row["annotation"]
        ner_tags = [tok_tag.split("|")[1] for tok_tag in annotation.split()]
        for tag in ner_tags:
            tags.add(tag)
    
    return tags

In [39]:
len(number_of_entities(mit_05_df))

17

In [40]:
len(number_of_entities(mit_07_df))

17

In [41]:
len(number_of_entities(mit_09_df))

17