In [1]:
import numpy as np
import plotly
import logging
logging.getLogger().setLevel(logging.INFO)

from idst_util import trivial
from idst_util import dstc2
from operator import itemgetter
from plotly.graph_objs import Bar, Layout
from plotly.graph_objs.layout import Margin
plotly.offline.init_notebook_mode(connected = True)

[nltk_data] Downloading package punkt to /home/is/andrei-
[nltk_data]     cc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Make sure data is available
trivial.print_idst()
dstc2.check()

# Retrieve raw data
raw_X_train, raw_Y_train, \
raw_X_dev, raw_Y_dev, \
raw_X_test, raw_Y_test, \
ontology = dstc2.retrieve_raw_datasets(train_data_augmentation = False)

INFO:root:+--------------------------------+
INFO:root:|         _ ____  ___________    |
INFO:root:|        (_) __ \/ ___/_  __/    |
INFO:root:|       / / / / /\__ \ / /       |
INFO:root:|      / / /_/ /___/ // /        |
INFO:root:|     /_/_____//____//_/         |
INFO:root:|                                |
INFO:root:+--------------------------------+
INFO:root:|Incremental Dialog State Tracker|
INFO:root:+--------------------------------+
INFO:root:+--------------------------------+
INFO:root:|     Dialog State Tracker 2     |
INFO:root:|         Data Checker           |
INFO:root:+--------------------------------+
INFO:root:Looking for dstc2 directory in .
INFO:root:dstc2 was found!
INFO:root:Looking for dstc2_traindev directory in ./dstc2
INFO:root:dstc2_traindev was found!
INFO:root:Looking for dstc2_test directory in ./dstc2
INFO:root:dstc2_test was found!
INFO:root:Looking for dstc2_scripts directory in ./dstc2
INFO:root:dstc2_scripts was found!
INFO:root:Done!
INFO:root:+-

In [3]:
def retrieve_info(raw_X):
    
    token_to_count = {}
    max_sequence_length = 0
    turns_list = []
    
    for raw_dialog in raw_X:
        turns_count = 0
        for raw_turn in raw_dialog["turns"]:
            turns_count += 1
            current_sequence_length = len(raw_turn["system"]) + len(raw_turn["user"])
            if current_sequence_length > max_sequence_length:
                max_sequence_length = current_sequence_length

            for system_token in raw_turn["system"]:
                token = system_token[0]
                if token not in token_to_count:
                    token_to_count[token] = 1
                else:
                    token_to_count[token] += 1

            for user_token in raw_turn["user"]:
                token = user_token[0]
                if token not in token_to_count:
                    token_to_count[token] = 1
                else:
                    token_to_count[token] += 1
            turns_list.append(turns_count)
    
    return token_to_count, max_sequence_length, np.mean(turns_list)

In [4]:
train_vocabulary, \
train_max_sequence_length, \
train_average_turns_per_dialog = retrieve_info(raw_X_train)


dev_vocabulary, \
dev_max_sequence_length, \
dev_average_turns_per_dialog = retrieve_info(raw_X_dev)

test_vocabulary, \
test_max_sequence_length, \
test_average_turns_per_dialog = retrieve_info(raw_X_test)

In [5]:
logging.info("+--------------------------------+")
logging.info("|     Dialog State Tracker 2     |")
logging.info("|         Data Analysis          |")
logging.info("+--------------------------------+\n")

logging.info("Train number of dialogs:\t{}".format(len(raw_X_train)))
logging.info("Train vocabulary length:\t{}".format(len(train_vocabulary)))
logging.info("Train max sequence length:\t{}".format(train_max_sequence_length))
logging.info("Train average turns:\t\t{}\n".format(train_average_turns_per_dialog))

logging.info("Dev number of dialogs:\t{}".format(len(raw_X_dev)))
logging.info("Dev vocabulary length:\t{}".format(len(dev_vocabulary)))
logging.info("Dev max sequence length:\t{}".format(dev_max_sequence_length))
logging.info("Dev average turns:\t\t{}\n".format(dev_average_turns_per_dialog))

logging.info("Test number of dialogs:\t{}".format(len(raw_X_test)))
logging.info("Test vocabulary length:\t{}".format(len(test_vocabulary)))
logging.info("Test max sequence length:\t{}".format(test_max_sequence_length))
logging.info("Test average turns:\t\t{}\n".format(test_average_turns_per_dialog))

INFO:root:+--------------------------------+
INFO:root:|     Dialog State Tracker 2     |
INFO:root:|         Data Analysis          |
INFO:root:+--------------------------------+

INFO:root:Train number of dialogs:	1612
INFO:root:Train vocabulary length:	922
INFO:root:Train max sequence length:	28
INFO:root:Train average turns:		4.930461591162113

INFO:root:Dev number of dialogs:	506
INFO:root:Dev vocabulary length:	764
INFO:root:Dev max sequence length:	28
INFO:root:Dev average turns:		5.454499237417387

INFO:root:Test number of dialogs:	1117
INFO:root:Test vocabulary length:	901
INFO:root:Test max sequence length:	36
INFO:root:Test average turns:		5.983923154701719



In [6]:
train_X_axis = []
train_Y_axis = []
train_top_tokens = []
sorted_dict = sorted(train_vocabulary.items(), key = itemgetter(1), reverse = True)

for token, token_count in sorted_dict:
    if len(train_top_tokens) <= 10:
        train_top_tokens.append((token, token_count))
    if token_count > 50:
        train_X_axis.append(token)
        train_Y_axis.append(token_count)

assert(len(train_X_axis) == len(train_Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = "v",
                 x = train_X_axis,
                 y = train_Y_axis,
                 marker = dict(color = "#1abc9c"))],
    "layout": Layout(title = "<b>Train tokens distribution</b>",
                     xaxis = dict(title = "<b>Tokens</b>", dtick = 3, titlefont = dict(color = "#1abc9c")),
                     yaxis = dict(title = "<b>Frequency</b>", titlefont = dict(color = "#1abc9c")),
                     margin = Margin(b = 150)
                    )
})

logging.info("Train top tokens:\t{}".format(train_top_tokens))

INFO:root:Train top tokens:	[('inform', 7936), ('name', 6162), ('offer', 6077), ('food', 5738), ('the', 3596), ('area', 3331), ('pricerange', 2695), ('phone', 2571), ('restaurant', 2353), ('request', 1910), ('slot', 1910)]


In [7]:
dev_X_axis = []
dev_Y_axis = []
dev_top_tokens = []
sorted_dict = sorted(dev_vocabulary.items(), key = itemgetter(1), reverse = True)

for token, token_count in sorted_dict:
    if len(dev_top_tokens) <= 10:
        dev_top_tokens.append((token, token_count))
    if token_count > 20:
        dev_X_axis.append(token)
        dev_Y_axis.append(token_count)

assert(len(dev_X_axis) == len(dev_Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = "v",
                 x = dev_X_axis,
                 y = dev_Y_axis,
                 marker = dict(color = "#3498db"))],
    "layout": Layout(title = "<b>Dev tokens distribution</b>",
                     xaxis = dict(title = "<b>Tokens</b>", dtick = 3, titlefont = dict(color = "#3498db")),
                     yaxis = dict(title = "<b>Frequency</b>", titlefont = dict(color = "#3498db")),
                     margin = Margin(b = 150)
                    )
})


logging.info("Dev top tokens:\t{}".format(dev_top_tokens))

INFO:root:Dev top tokens:	[('inform', 2448), ('name', 2039), ('offer', 1997), ('food', 1950), ('the', 1255), ('area', 1124), ('pricerange', 876), ('phone', 775), ('restaurant', 723), ('i', 701), ('request', 631)]


In [8]:
test_X_axis = []
test_Y_axis = []
test_top_tokens = []
sorted_dict = sorted(test_vocabulary.items(), key = itemgetter(1), reverse = True)

for token, token_count in sorted_dict:
    if len(test_top_tokens) <= 10:
        test_top_tokens.append((token, token_count))
    if token_count > 40:
        test_X_axis.append(token)
        test_Y_axis.append(token_count)

assert(len(test_X_axis) == len(test_Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = "v",
                 x = test_X_axis,
                 y = test_Y_axis,
                 marker = dict(color = "#9b59b6"))],
    "layout": Layout(title = "<b>Test tokens distribution</b>",
                     xaxis = dict(title = "<b>Tokens</b>", dtick = 3, titlefont = dict(color = "#9b59b6")),
                     yaxis = dict(title = "<b>Frequency</b>", titlefont = dict(color = "#9b59b6")),
                     margin = Margin(b = 150)
                    )
})


logging.info("Test top tokens:\t{}".format(test_top_tokens))

INFO:root:Test top tokens:	[('inform', 6188), ('food', 5873), ('name', 3978), ('offer', 3824), ('area', 3037), ('the', 2681), ('pricerange', 2393), ('request', 2174), ('slot', 2174), ('impl-conf', 1872), ('phone', 1765)]


In [9]:
train_dev_oov_tokens = []

for token, count in dev_vocabulary.items():
    if token not in train_vocabulary:
        if token not in train_dev_oov_tokens:
            train_dev_oov_tokens.append(token)

train_dev_oov_rate = len(train_dev_oov_tokens)/len(dev_vocabulary)

logging.info("Train-Dev OOV rate:\t{}\n".format(train_dev_oov_rate))
logging.info("Train-Dev OOV tokens:\t{}".format(train_dev_oov_tokens))

INFO:root:Train-Dev OOV rate:	0.10863874345549739

INFO:root:Train-Dev OOV tokens:	['allowed', 'quarter', 'venue', 'such', 'essentially', 'best', 'twenty', 'search', 'touch', 'move', 'typical', 'direct', 'comments', 'wo', 'noon', 'alone', 'americas', 'recognised', 'drink', 'clowns', '324351', 'nice', 'eclectic', 'third', 'found', 'worry', 'twelve', 'global', 'club', 'hold', 'drop', 'affordable', 'helpful', 'miles', 'available', '52', 'ranges', 'connection', 'brb', 'goes', "'re", 'point', 'mentioned', 'below', 'behind', 'cuisine', 'seen', 'answer', 'pitch', 'recommend', 'eleven', 'tight', 'children', "'ve", 'exact', 'theme', 'content', 'has', 'lasts', 'football', 'come', 'piece', 'true', 'seem', 'around', 'g', 'zealand', 'intermediate', 'cars', 'tour', 'tells', 'arrange', 'stores', 'info', 'still', 'shows', '355711', 'areas', 'meet', 'super', 'small', 'thing', 'signature']


In [10]:
train_test_oov_tokens = []

for token, count in test_vocabulary.items():
    if token not in train_vocabulary:
        if token not in train_test_oov_tokens:
            train_test_oov_tokens.append(token)

train_test_oov_rate = len(train_test_oov_tokens)/len(test_vocabulary)

logging.info("Train-Test OOV rate:\t{}\n".format(train_test_oov_rate))
logging.info("Train-Test OOV tokens:\t{}".format(train_test_oov_tokens))

INFO:root:Train-Test OOV rate:	0.16315205327413984

INFO:root:Train-Test OOV tokens:	['count', 'directly', '9', 'tight', 'wo', '109', 'fare', '57', 'benny', 'spend', 'includes', 'essentially', 'quickly', 'excuse', '69', 'make', 'suggestions', 'ho', 'eight', 'spent', 'spain', 'still', 'seen', 'up', 'county', "'re", 'p', 'such', 'meal', 'books', 'dear', 'third', 'storey', 'wow', 'advice', 'confirm', 'salad', 'comment', 'remember', 'h', 'nice', 'stops', 'useless', 'found', 'connect', 'cheapest', 'longer', 'kinds', 'itself', 'reference', 'tomorrow', 'continues', 'signature', 'thru', 'fill', 'pool', 'union', 'twenty', '110', 'has', '56', 'movies', 'forty', 'helpful', 'hate', 'seem', 'names', 'pig', 'quick', 'straight', 'checking', 'beers', 'collecting', 'continue', 'nothing', 'feature', 'selling', 'sheet', 'public', 'walk', 'charging', 'man', 'into', 'going', 'suggest', 'global', 'current', 'smallish', 'special', 'open', 'move', 'onto', 'snack', 'lunch', 'preference', 'cars', 'downwards', '