# Imports

In [None]:
from pathlib import Path
import re
import pandas as pd

import numpy as np
import math
from scipy.special import binom, comb

import plotly.graph_objects as go
import plotly.express as px
from tqdm.auto import tqdm

# Params

In [None]:
# Root folder of the datasets, made absolute against the current working directory.
datastore = Path("../datastore/").absolute()

In [None]:
# Location of the MIT restaurant .tsv file inside the datastore.
data_path = datastore / 'mit_restaurant' / 'MITrestaurant.tsv'

# Colour applied to both the bar fill and the bar outline of every chart below.
SQUID_INK = 'rgb(8,48,107)'

# Input

## Reading functions

In [None]:
def extract_utt_text(utt: str) -> str:
    """
    Rebuild the plain utterance text from a token-level annotated string.

    Params:
        utt: string of the shape "word_1|slot_type_1 ... word_n|slot_type_n" at inference time
             "word_1|slot_type_1|lang_1 ... word_n|slot_type_n|lang_n" at training time

    Returns:
        The words of the utterance joined by single spaces, with every
        "|"-separated annotation field removed.
    """
    # The word is everything before the FIRST "|" of a token. Keeping "|" out
    # of the captured class stops the greedy match from swallowing the slot
    # type when a third (language) field is present.
    utt_tokens = re.findall(r"([^ |]+)\|[^ ]+", utt)

    return " ".join(utt_tokens)


def extract_ner_labels(utt: str) -> list:
    """
    Collect the slot type of every token of an annotated utterance.

    Params:
        utt: string of the shape "word_1|slot_type_1|lang_1 ... word_n|slot_type_n|lang_n"
             (the two-field shape "word_1|slot_type_1 ..." is handled as well)

    Returns:
        One slot-type string per token, in utterance order.
    """
    # Both the word and the captured label exclude "|", so the label is always
    # the SECOND field of a token. A greedy "[^ ]+" prefix would instead run up
    # to the last "|" and return the language field of three-field tokens.
    return re.findall(r"[^ |]+\|([^ |]+)", utt)


def read_token_level_annotated_data(data_path: str) -> (pd.DataFrame, pd.DataFrame):
    """
    Load a token-level annotated .tsv file and derive per-utterance columns.

    Column names are picked by get_NLU_tsv_columns from the layout of the
    file's first line. Rows containing any missing value are dropped.

    Params:
        data_path: location of the .tsv file; annotations look like
                   "metti|Other|it musica|MediaType|it"

    Returns:
        data: the loaded rows plus an "utterance_text" column produced by
              extract_utt_text from the "annotation" column
        labels: a frame with a single "tags" column holding the
                extract_ner_labels output of every kept row, in the same row
                order as data but on its own fresh index
    """
    data = pd.read_table(data_path, names=get_NLU_tsv_columns(data_path))
    data.dropna(inplace=True)

    annotations = data["annotation"]

    # .values drops the index, so both assignments are purely positional.
    labels = pd.DataFrame({"tags": annotations.apply(extract_ner_labels).values})
    data["utterance_text"] = annotations.apply(extract_utt_text).values

    return data, labels


def get_NLU_tsv_columns(file):
    """
    Choose the column names of an NLU .tsv file from its first line.

    Params:
        file: path of the .tsv file to inspect

    Returns:
        The column names matching the number of tab-separated fields on the
        first line (1, 3, 4 or 5 fields are recognised).

    Raises:
        ValueError: if the first line has any other number of fields.
    """
    with open(file) as handle:
        first_line = handle.readline()

    # Known layouts, keyed by the number of tab-separated fields.
    layouts = {
        1: ["annotation"],
        3: ("domain", "intent", "annotation"),
        4: ("domain", "intent", "annotation", "customer_id"),
        5: ("domain", "intent", "annotation", "customer_id", "utterance_id"),
    }

    # n tab characters delimit n + 1 fields.
    field_count = first_line.count("\t") + 1
    if field_count not in layouts:
        raise ValueError("Bad .tsv format")
    return layouts[field_count]

## Reading data

In [None]:
# Load the corpus; `labels` receives the per-utterance tag lists.
data, labels = read_token_level_annotated_data(data_path)

print(data)

# Split every utterance on single spaces -> one list of words per utterance.
utterances = [text.split(' ') for text in data['utterance_text'].tolist()]

# Pick distribution

In [None]:
def get_unique_utts_count_sorted(df):
    """
    Count the occurrences of each distinct utterance, most frequent first.

    Params:
        df: frame with an 'utterance_text' column

    Returns:
        A frame with columns 'utterance_text' and 'count', sorted by
        'count' in descending order.
    """
    occurrences = df.groupby('utterance_text').size()
    table = occurrences.to_frame('count').reset_index()
    return table.sort_values('count', ascending=False)

In [None]:
def inv_exp(utterance):
    """Return exp(-n), where n is len(utterance)."""
    length = len(utterance)
    return math.exp(-length)

def inv_lin(utterance):
    """Return 1/n, where n is len(utterance)."""
    length = len(utterance)
    return 1 / length

def inv_lin_squared(utterance):
    """Return 1/n**2, where n is len(utterance)."""
    length = len(utterance)
    return 1 / (length * length)

def inverse_characters_fifth(utterance):
    """Return 1/c**4, where c is the summed length of the items of utterance."""
    # NOTE(review): the name says "fifth" but the exponent applied is 4 —
    # confirm which one is intended before changing either.
    total_chars = sum(map(len, utterance))
    return 1 / total_chars ** 4

def inverse_characters_cubic(utterance):
    """Return 1/c**3, where c is the summed length of the items of utterance."""
    total_chars = sum(map(len, utterance))
    return 1 / total_chars ** 3

def inv_lin_cubic(utterance):
    """Return 1/n**3, where n is len(utterance)."""
    length = len(utterance)
    return 1 / pow(length, 3)

def inv_chars_exponential(utterance):
    """Return 1/exp(c), where c is the summed length of the items of utterance."""
    total_chars = sum(map(len, utterance))
    return 1 / math.exp(total_chars)

In [None]:
def plot_utt_distribution(utt_df, measure_name):
    """
    Show a bar chart of how often each distinct utterance occurs in utt_df.

    Params:
        utt_df: frame with an 'utterance_text' column, one row per sampled utterance
        measure_name: label of the weighting scheme, interpolated into the chart title
    """
    unique_utts_count = get_unique_utts_count_sorted(utt_df)

    # One bar per DISTINCT utterance: the x positions must have the length of
    # the count vector, not the length of the sampled frame.
    index = np.arange(len(unique_utts_count))

    # Dividing by the number of sampled rows turns counts into shares.
    norm_factor = len(utt_df)
    y_values = unique_utts_count['count'].to_numpy() / norm_factor

    bar = go.Bar(y=y_values, x=index)
    fig = go.Figure(data=bar)

    fig.update_layout(title=f'Most frequent utterances sampled according {measure_name}')

    fig.update_xaxes(title='Utterances ordered by number of occurrences')
    fig.update_yaxes(title='Ratio of the dataset')
    fig.update_traces(marker_color=SQUID_INK, marker_line_color=SQUID_INK,
                      marker_line_width=1.5)
    fig.show()

In [None]:
# Candidate weighting schemes, keyed by the label shown in each chart title.
measure_dict = {
    'inverse exponential': inv_exp,
    'inverse cubic linear': inv_lin_cubic,
    'inverse character cubic': inverse_characters_cubic,
    'inverse character fifth': inverse_characters_fifth,
    'inverse exponential characters': inv_chars_exponential,
}

num_samples = int(1e4)

for measure_name, measure in measure_dict.items():
    # One weight per utterance, normalised into a probability vector.
    weights_by_length = np.array(list(map(measure, utterances)))
    probs_by_length = weights_by_length / sum(weights_by_length)

    # Draw row positions with replacement according to those probabilities.
    row_positions = np.arange(len(data))
    sampled_indices = np.random.choice(row_positions, num_samples, p=probs_by_length)
    sampled_df = data.iloc[sampled_indices]

    plot_utt_distribution(sampled_df, measure_name)

In [None]:
# Reference curve y = 1/x for x = 1 .. len(utterances) - 1.
index = np.arange(1, len(utterances))
y_values = 1 / index

bar = go.Bar(x=index, y=y_values)
fig = go.Figure(data=bar)

fig.update_layout(title='Power law')
fig.update_xaxes(title='x')
fig.update_yaxes(title='y')
fig.update_traces(
    marker_color=SQUID_INK,
    marker_line_color=SQUID_INK,
    marker_line_width=1.5,
)
fig.show()

# Duplicated data generation

# Params

In [None]:
# Target duplication ratios r; one dataset is generated for each of them.
duplication_ratios = [0.50, 0.70, 0.90]

In [None]:
# Weighting scheme applied for each target ratio.
measures_per_ratio = {
    0.50: inverse_characters_cubic,
    0.70: inverse_characters_cubic,
    0.90: inverse_characters_fifth,
}

# Raw per-utterance weights for every ratio ...
weights_by_length = {
    r: np.array(list(map(measures_per_ratio[r], utterances)))
    for r in duplication_ratios
}

# ... normalised so that each vector sums to one.
probs_by_length = {
    r: weights / sum(weights)
    for r, weights in weights_by_length.items()
}

## Sampling

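With $n$ utterances in the source data and a target duplication ratio $r$, the number of duplicated rows $d$ to add satisfies:

\begin{align}
    \frac{d}{n + d} &= r \\
    d &= \frac{r}{1-r} n
\end{align}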

In [None]:
# n in the formula: every loaded utterance is counted once here.
num_unique_utts = len(utterances)

# d = r * n / (1 - r), truncated to an integer, for each target ratio.
number_of_duplicates = dict(
    (r, int((r * num_unique_utts) / (1 - r)))
    for r in duplication_ratios
)
print(number_of_duplicates)

In [None]:
# Draw row POSITIONS rather than utterance strings. Joining sampled strings
# back onto `data` by 'utterance_text' returns one row per matching source
# row, so a text that occurs k times in `data` would add k rows per draw and
# overshoot the requested number of duplicates.
drawn_positions = {
    r: np.random.choice(len(data), number_of_duplicates[r], p=probs_by_length[r])
    for r in duplication_ratios
}

# Sampled utterance texts per ratio, kept under the name this cell has always defined.
utterance_texts = data['utterance_text'].to_numpy()
draws = {r: utterance_texts[drawn_positions[r]] for r in duplication_ratios}

duplicated_datasets = {r: None for r in duplication_ratios}

for r in duplication_ratios:
    # Exactly one complete row per draw, in draw order.
    sample_df_complete = data.iloc[drawn_positions[r]]
    # Source rows first, sampled duplicates appended after them.
    duplicated_datasets[r] = pd.concat([data, sample_df_complete])

# Plot distribution

In [None]:
def plot_sampled_utt_distribution(utt_df, title=''):
    """
    Bar-plot the share of utt_df taken by its most frequent utterances.

    Only the first 200 entries of the frequency-sorted table are drawn, with
    the utterance text itself used as the x label of each bar.

    Params:
        utt_df: frame with an 'utterance_text' column
        title: chart title
    """
    unique_utts_count = get_unique_utts_count_sorted(utt_df)

    # Counts divided by the number of rows give each utterance's share.
    norm_factor = len(utt_df)
    shares = unique_utts_count['count'].to_numpy() / norm_factor
    texts = unique_utts_count['utterance_text']

    top = 200
    fig = go.Figure(data=go.Bar(y=shares[:top], x=texts[:top]))

    fig.update_layout(title=title)
    fig.update_xaxes(title='Utterances ordered by number of occurrences')
    fig.update_yaxes(title='Ratio of the dataset')
    fig.update_traces(
        marker_color=SQUID_INK,
        marker_line_color=SQUID_INK,
        marker_line_width=1.5,
    )
    fig.show()

In [None]:
# One chart per target ratio, in the order the ratios were declared.
for r in duplication_ratios:
    chart_title = f'Dataset with duplication ratio {r}'
    plot_sampled_utt_distribution(duplicated_datasets[r], title=chart_title)

In [None]:
# Histogram: total character count of an utterance -> number of utterances with it.
counter = {}
for utterance in utterances:
    len_utt = sum(len(word) for word in utterance)
    if len_utt not in counter:
        counter[len_utt] = 0
    counter[len_utt] += 1

# (length, count) pairs in ascending length order; lengths are unique dict keys.
num_utts_lens = sorted(counter.items())

print(num_utts_lens)

In [None]:
def utt_num_chars(utt):
    """Return the summed length of the items of utt (separators are not counted)."""
    total = 0
    for word in utt:
        total += len(word)
    return total

In [None]:
# Shortest utterances (by character count) first; sorted() keeps ties in input order.
utterances_sorted = sorted(utterances, key=utt_num_chars)
print(utterances_sorted[:10])

In [None]:
# Inverse-cubic character weight of every utterance, in sorted (shortest-first) order.
weights_by_length = np.array(
    [1 / num_chars ** 3 for num_chars in map(utt_num_chars, utterances_sorted)]
)

print(weights_by_length)

In [None]:
# Plot the first 200 weights against their position in the sorted list.
x_values = np.arange(len(utterances))
y_values = weights_by_length

bar = go.Bar(x=x_values[:200], y=y_values[:200])
fig = go.Figure(data=bar)

fig.update_xaxes(title='Utterances sorted by length')
fig.update_yaxes(title='Weight')
fig.update_traces(
    marker_color=SQUID_INK,
    marker_line_color=SQUID_INK,
    marker_line_width=1.5,
)
fig.show()

# Save

In [None]:
# Same statement as in the Params section; repeated so the save cells can run on their own.
datastore = Path("../datastore/").absolute()

In [None]:
for r in duplication_ratios:
    # Keep the three-column layout, shuffle all rows, and renumber them.
    df = (
        duplicated_datasets[r][["domain", "intent", "annotation"]]
        .sample(frac=1)
        .reset_index(drop=True)
    )

    # Written without header or index so the file has the input's .tsv layout.
    output_path = datastore / "mit_restaurant" / f"duplicated_{r}.tsv"
    df.to_csv(output_path, sep="\t", index=False, header=False)

# Test

In [None]:
# Reload the file saved for ratio 0.5; the labels frame is not needed here.
mit_05_df, _ = read_token_level_annotated_data(datastore / "mit_restaurant" / "duplicated_0.5.tsv")


In [None]:
# Reload the file saved for ratio 0.7; the labels frame is not needed here.
mit_07_df, _ = read_token_level_annotated_data(datastore / "mit_restaurant" / "duplicated_0.7.tsv")

In [None]:
# Reload the file saved for ratio 0.9; the labels frame is not needed here.
mit_09_df, _ = read_token_level_annotated_data(datastore / "mit_restaurant" / "duplicated_0.9.tsv")

In [None]:
def compute_redundancy(df: pd.DataFrame, col: str = "utterance_text"):
    """
    Measure the share of rows of df that repeat an already-seen value of col.

    Params:
        df: frame to inspect
        col: column whose distinct values are counted

    Returns:
        (number of rows - number of distinct values) / number of rows
    """
    total_rows = len(df)
    distinct_values = len(df.groupby(col).size())
    return (total_rows - distinct_values) / total_rows

In [None]:
# Share of repeated rows in the reloaded duplicated_0.5.tsv data.
compute_redundancy(mit_05_df)

In [None]:
# Share of repeated rows in the reloaded duplicated_0.7.tsv data.
compute_redundancy(mit_07_df)

In [None]:
# Share of repeated rows in the reloaded duplicated_0.9.tsv data.
compute_redundancy(mit_09_df)