In [None]:
import numpy as np
import plotly
import logging
# Lower the root logger's threshold to INFO so the logging.info(...) calls
# used throughout this notebook are not filtered out by level.
logging.getLogger().setLevel(logging.INFO)

from idst_ittd_util import trivial
from idst_ittd_util import dstc2
from operator import itemgetter
from plotly.graph_objs import Bar, Layout
from plotly.graph_objs.layout import Margin
# Initialise plotly's offline notebook mode before any plotly.offline.iplot call.
# NOTE(review): connected = True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook -- confirm against the plotly documentation.
plotly.offline.init_notebook_mode(connected = True)

In [None]:
# Pre-flight: run the helper calls that make sure the data is available.
trivial.print_idst()
dstc2.check()

# Fetch the raw train/dev/test inputs and labels plus the ontology in one call.
(
    raw_X_train, raw_Y_train,
    raw_X_dev, raw_Y_dev,
    raw_X_test, raw_Y_test,
    ontology,
) = dstc2.retrieve_raw_datasets(only_transcript = False)

In [None]:
def retrieve_info(raw_X, only_user_utterance = False):
    """Collect vocabulary and length statistics over a collection of raw dialogs.

    Args:
        raw_X: iterable of dialogs. Each dialog is indexed with "turns" to get
            its turns; each turn is indexed with "user" (and with "system"
            unless only_user_utterance is set) to get a sequence of entries
            whose element at index 0 is the token.
        only_user_utterance: when True, only the "user" sequence of each turn
            is considered; otherwise the "system" sequence is concatenated in
            front of the "user" sequence.

    Returns:
        A 4-tuple of:
          * dict mapping each token to its number of occurrences,
          * length of the longest turn (in tokens, 0 if there are no turns),
          * mean number of turns per dialog,
          * mean number of tokens per turn.
        The two means come from np.mean and are therefore NaN (with a numpy
        RuntimeWarning) when there are no dialogs / no turns.
    """

    token_to_count = {}
    max_sequence_length = 0
    turns_per_dialog = []
    tokens_per_turn = []

    for raw_dialog in raw_X:
        turns_count = 0
        for raw_turn in raw_dialog["turns"]:
            turns_count += 1
            if only_user_utterance:
                tokens_scores = raw_turn["user"]
            else:
                tokens_scores = raw_turn["system"] + raw_turn["user"]

            # The token is the first element of every entry.
            for token_score in tokens_scores:
                token = token_score[0]
                token_to_count[token] = token_to_count.get(token, 0) + 1

            turn_length = len(tokens_scores)
            max_sequence_length = max(max_sequence_length, turn_length)
            tokens_per_turn.append(turn_length)

        # Record the total once per dialog. Appending inside the turn loop
        # would store the running counter (1, 2, ..., n) and skew the mean.
        turns_per_dialog.append(turns_count)

    return token_to_count, max_sequence_length, np.mean(turns_per_dialog), np.mean(tokens_per_turn)

In [None]:
# User-utterance statistics for the TRAIN split.
(
    train_vocabulary,
    train_max_sequence_length,
    train_average_turns_per_dialog,
    train_average_tokens_per_turn,
) = retrieve_info(raw_X_train, only_user_utterance = True)

# User-utterance statistics for the DEV split.
(
    dev_vocabulary,
    dev_max_sequence_length,
    dev_average_turns_per_dialog,
    dev_average_tokens_per_turn,
) = retrieve_info(raw_X_dev, only_user_utterance = True)

# User-utterance statistics for the TEST split.
(
    test_vocabulary,
    test_max_sequence_length,
    test_average_turns_per_dialog,
    test_average_tokens_per_turn,
) = retrieve_info(raw_X_test, only_user_utterance = True)

In [None]:
# Header box, logged one line per record.
for banner_line in (
    "+--------------------------------+",
    "|     Dialog State Tracker 2     |",
    "|         Data Analysis          |",
    "+--------------------------------+\n",
):
    logging.info(banner_line)

# (message template, value) pairs in the order they are reported; the tab
# padding differs per line to keep the values aligned.
summary_rows = [
    ("TRAIN number of dialogs:\t\t{}", len(raw_X_train)),
    ("TRAIN vocabulary length:\t\t{}", len(train_vocabulary)),
    ("TRAIN max sequence length:\t\t{}", train_max_sequence_length),
    ("TRAIN average turns per dialog:\t{}", train_average_turns_per_dialog),
    ("TRAIN average tokens per turn:\t{}\n", train_average_tokens_per_turn),

    ("DEV number of dialogs:\t\t{}", len(raw_X_dev)),
    ("DEV vocabulary length:\t\t{}", len(dev_vocabulary)),
    ("DEV max sequence length:\t\t{}", dev_max_sequence_length),
    ("DEV average turns per dialog:\t\t{}", dev_average_turns_per_dialog),
    ("DEV average tokens per turn:\t\t{}\n", dev_average_tokens_per_turn),

    ("TEST number of dialogs:\t\t{}", len(raw_X_test)),
    ("TEST vocabulary length:\t\t{}", len(test_vocabulary)),
    ("TEST max sequence length:\t\t{}", test_max_sequence_length),
    ("TEST average turns per dialog:\t{}", test_average_turns_per_dialog),
    ("TEST average tokens per turn:\t\t{}\n", test_average_tokens_per_turn),
]

for summary_template, summary_value in summary_rows:
    logging.info(summary_template.format(summary_value))

In [None]:
# How many of the most frequent tokens to log, and the frequency a token must
# exceed to be drawn in the bar chart.
TRAIN_TOP_K = 10
TRAIN_MIN_TOKEN_COUNT = 100

# (token, count) pairs ordered from most to least frequent.
sorted_dict = sorted(train_vocabulary.items(), key = itemgetter(1), reverse = True)

# Slice to get exactly the first ten pairs; the earlier `len(...) <= 10`
# guard let an eleventh entry through.
train_top_tokens = sorted_dict[:TRAIN_TOP_K]

# Only tokens above the frequency threshold are plotted, keeping the sorted order.
train_X_axis = [token for token, token_count in sorted_dict if token_count > TRAIN_MIN_TOKEN_COUNT]
train_Y_axis = [token_count for token, token_count in sorted_dict if token_count > TRAIN_MIN_TOKEN_COUNT]

assert len(train_X_axis) == len(train_Y_axis)

plotly.offline.iplot({
    "data": [Bar(orientation = "v",
                 x = train_X_axis,
                 y = train_Y_axis,
                 marker = dict(color = "#1abc9c"))],
    "layout": Layout(title = "<b>TRAIN tokens distribution</b>",
                     xaxis = dict(title = "<b>Tokens</b>", dtick = 1),
                     yaxis = dict(title = "<b>Frequency</b>"),
                     margin = Margin(b = 180)
                    )
})

logging.info("Train top tokens:\t{}".format(train_top_tokens))

In [None]:
# How many of the most frequent tokens to log, and the frequency a token must
# exceed to be drawn in the bar chart.
DEV_TOP_K = 10
DEV_MIN_TOKEN_COUNT = 40

# (token, count) pairs ordered from most to least frequent.
sorted_dict = sorted(dev_vocabulary.items(), key = itemgetter(1), reverse = True)

# Slice to get exactly the first ten pairs; the earlier `len(...) <= 10`
# guard let an eleventh entry through.
dev_top_tokens = sorted_dict[:DEV_TOP_K]

# Only tokens above the frequency threshold are plotted, keeping the sorted order.
dev_X_axis = [token for token, token_count in sorted_dict if token_count > DEV_MIN_TOKEN_COUNT]
dev_Y_axis = [token_count for token, token_count in sorted_dict if token_count > DEV_MIN_TOKEN_COUNT]

assert len(dev_X_axis) == len(dev_Y_axis)

plotly.offline.iplot({
    "data": [Bar(orientation = "v",
                 x = dev_X_axis,
                 y = dev_Y_axis,
                 marker = dict(color = "#3498db"))],
    "layout": Layout(title = "<b>DEV tokens distribution</b>",
                     xaxis = dict(title = "<b>Tokens</b>", dtick = 1),
                     yaxis = dict(title = "<b>Frequency</b>"),
                     margin = Margin(b = 180)
                    )
})


logging.info("Dev top tokens:\t{}".format(dev_top_tokens))

In [None]:
# How many of the most frequent tokens to log, and the frequency a token must
# exceed to be drawn in the bar chart.
TEST_TOP_K = 10
TEST_MIN_TOKEN_COUNT = 70

# (token, count) pairs ordered from most to least frequent.
sorted_dict = sorted(test_vocabulary.items(), key = itemgetter(1), reverse = True)

# Slice to get exactly the first ten pairs; the earlier `len(...) <= 10`
# guard let an eleventh entry through.
test_top_tokens = sorted_dict[:TEST_TOP_K]

# Only tokens above the frequency threshold are plotted, keeping the sorted order.
test_X_axis = [token for token, token_count in sorted_dict if token_count > TEST_MIN_TOKEN_COUNT]
test_Y_axis = [token_count for token, token_count in sorted_dict if token_count > TEST_MIN_TOKEN_COUNT]

assert len(test_X_axis) == len(test_Y_axis)

plotly.offline.iplot({
    "data": [Bar(orientation = "v",
                 x = test_X_axis,
                 y = test_Y_axis,
                 marker = dict(color = "#9b59b6"))],
    "layout": Layout(title = "<b>TEST tokens distribution</b>",
                     xaxis = dict(title = "<b>Tokens</b>", dtick = 1),
                     yaxis = dict(title = "<b>Frequency</b>"),
                     margin = Margin(b = 180)
                    )
})


logging.info("Test top tokens:\t{}".format(test_top_tokens))

In [None]:
# Tokens of the dev vocabulary that never occur in the train vocabulary.
# Dictionary keys are unique, so no separate duplicate check is needed.
train_dev_oov_tokens = [token for token in dev_vocabulary if token not in train_vocabulary]

# Fraction of the dev vocabulary that is out-of-vocabulary w.r.t. train.
# An empty dev vocabulary gives 0.0 rather than raising ZeroDivisionError.
if dev_vocabulary:
    train_dev_oov_rate = len(train_dev_oov_tokens)/len(dev_vocabulary)
else:
    train_dev_oov_rate = 0.0

logging.info("Train-Dev OOV rate:\t{}\n".format(train_dev_oov_rate))
logging.info("Train-Dev OOV tokens:\t{}".format(train_dev_oov_tokens))

In [None]:
# Tokens of the test vocabulary that never occur in the train vocabulary.
# Dictionary keys are unique, so no separate duplicate check is needed.
train_test_oov_tokens = [token for token in test_vocabulary if token not in train_vocabulary]

# Fraction of the test vocabulary that is out-of-vocabulary w.r.t. train.
# An empty test vocabulary gives 0.0 rather than raising ZeroDivisionError.
if test_vocabulary:
    train_test_oov_rate = len(train_test_oov_tokens)/len(test_vocabulary)
else:
    train_test_oov_rate = 0.0

logging.info("Train-Test OOV rate:\t{}\n".format(train_test_oov_rate))
logging.info("Train-Test OOV tokens:\t{}".format(train_test_oov_tokens))