In [1]:
import numpy as np
import plotly
import logging
logging.getLogger().setLevel(logging.INFO)

from idst_util import trivial
from idst_util import dstc2
from operator import itemgetter
from plotly.graph_objs import Bar, Layout
from plotly.graph_objs.layout import Margin
plotly.offline.init_notebook_mode(connected = True)

[nltk_data] Downloading package punkt to /home/is/andrei-
[nltk_data]     cc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Make sure data is available
trivial.print_idst()
dstc2.check()

# Retrieve raw data
raw_X_train, raw_Y_train, \
raw_X_dev, raw_Y_dev, \
raw_X_test, raw_Y_test, \
ontology = dstc2.retrieve_raw_datasets(only_transcript = False)

INFO:root:+--------------------------------+
INFO:root:|         _ ____  ___________    |
INFO:root:|        (_) __ \/ ___/_  __/    |
INFO:root:|       / / / / /\__ \ / /       |
INFO:root:|      / / /_/ /___/ // /        |
INFO:root:|     /_/_____//____//_/         |
INFO:root:|                                |
INFO:root:+--------------------------------+
INFO:root:|Incremental Dialog State Tracker|
INFO:root:+--------------------------------+
INFO:root:+--------------------------------+
INFO:root:|     Dialog State Tracker 2     |
INFO:root:|         Data Checker           |
INFO:root:+--------------------------------+
INFO:root:Looking for dstc2 directory in .
INFO:root:dstc2 was found!
INFO:root:Looking for dstc2_traindev directory in ./dstc2
INFO:root:dstc2_traindev was found!
INFO:root:Looking for dstc2_test directory in ./dstc2
INFO:root:dstc2_test was found!
INFO:root:Looking for dstc2_scripts directory in ./dstc2
INFO:root:dstc2_scripts was found!
INFO:root:Done!
INFO:root:+-

HBox(children=(IntProgress(value=0, max=1612), HTML(value='')))

INFO:root:Extracting raw dev features





HBox(children=(IntProgress(value=0, max=506), HTML(value='')))

INFO:root:Reading dstc2_test.flist
INFO:root:Asserted 1117 dialogs for dstc2_test.flist
INFO:root:Extracting raw test features





HBox(children=(IntProgress(value=0, max=1117), HTML(value='')))

INFO:root:Done!





In [4]:
def retrieve_info(raw_X, only_user_utterance = False):
    
    token_to_count = {}
    max_sequence_length = 0
    turns_list = []
    tokens_per_turn = []
    
    for raw_dialog in raw_X:
        turns_count = 0
        for raw_turn in raw_dialog["turns"]:
            turns_count += 1
            if only_user_utterance:
                tokens_scores = raw_turn["user"]
            else:    
                tokens_scores = raw_turn["system"] + raw_turn["user"]
            for token_score in tokens_scores:
                token = token_score[0]
                if token not in token_to_count:
                    token_to_count[token] = 1
                else:
                    token_to_count[token] += 1
            turns_list.append(turns_count)
            turn_length = len(tokens_scores)
            if turn_length > max_sequence_length:
                max_sequence_length = turn_length
            tokens_per_turn.append(turn_length)
    
    return token_to_count, max_sequence_length, np.mean(turns_list), np.mean(tokens_per_turn)

In [5]:
train_vocabulary, \
train_max_sequence_length, \
train_average_turns_per_dialog, \
train_average_tokens_per_turn = retrieve_info(raw_X_train, only_user_utterance = True)


dev_vocabulary, \
dev_max_sequence_length, \
dev_average_turns_per_dialog, \
dev_average_tokens_per_turn = retrieve_info(raw_X_dev, only_user_utterance = True)

test_vocabulary, \
test_max_sequence_length, \
test_average_turns_per_dialog, \
test_average_tokens_per_turn = retrieve_info(raw_X_test, only_user_utterance = True)

In [7]:
logging.info("+--------------------------------+")
logging.info("|     Dialog State Tracker 2     |")
logging.info("|         Data Analysis          |")
logging.info("+--------------------------------+\n")

logging.info("TRAIN number of dialogs:\t\t{}".format(len(raw_X_train)))
logging.info("TRAIN vocabulary length:\t\t{}".format(len(train_vocabulary)))
logging.info("TRAIN max sequence length:\t\t{}".format(train_max_sequence_length))
logging.info("TRAIN average turns per dialog:\t{}".format(train_average_turns_per_dialog))
logging.info("TRAIN average tokens per turn:\t{}\n".format(train_average_tokens_per_turn))

logging.info("DEV number of dialogs:\t\t{}".format(len(raw_X_dev)))
logging.info("DEV vocabulary length:\t\t{}".format(len(dev_vocabulary)))
logging.info("DEV max sequence length:\t\t{}".format(dev_max_sequence_length))
logging.info("DEV average turns per dialog:\t\t{}".format(dev_average_turns_per_dialog))
logging.info("DEV average tokens per turn:\t\t{}\n".format(dev_average_tokens_per_turn))

logging.info("TEST number of dialogs:\t\t{}".format(len(raw_X_test)))
logging.info("TEST vocabulary length:\t\t{}".format(len(test_vocabulary)))
logging.info("TEST max sequence length:\t\t{}".format(test_max_sequence_length))
logging.info("TEST average turns per dialog:\t{}".format(test_average_turns_per_dialog))
logging.info("TEST average tokens per turn:\t\t{}\n".format(test_average_tokens_per_turn))

INFO:root:+--------------------------------+
INFO:root:|     Dialog State Tracker 2     |
INFO:root:|         Data Analysis          |
INFO:root:+--------------------------------+

INFO:root:TRAIN number of dialogs:		1612
INFO:root:TRAIN vocabulary length:		578
INFO:root:TRAIN max sequence length:		25
INFO:root:TRAIN average turns per dialog:	4.930461591162113
INFO:root:TRAIN average tokens per turn:	3.8784790614027576

INFO:root:DEV number of dialogs:		506
INFO:root:DEV vocabulary length:		435
INFO:root:DEV max sequence length:		21
INFO:root:DEV average turns per dialog:		5.454499237417387
INFO:root:DEV average tokens per turn:		3.91687849517031

INFO:root:TEST number of dialogs:		1117
INFO:root:TEST vocabulary length:		580
INFO:root:TEST max sequence length:		22
INFO:root:TEST average turns per dialog:	5.983923154701719
INFO:root:TEST average tokens per turn:		3.671991911021234



In [8]:
train_X_axis = []
train_Y_axis = []
train_top_tokens = []
sorted_dict = sorted(train_vocabulary.items(), key = itemgetter(1), reverse = True)

for token, token_count in sorted_dict:
    if len(train_top_tokens) <= 10:
        train_top_tokens.append((token, token_count))
    if token_count > 100:
        train_X_axis.append(token)
        train_Y_axis.append(token_count)

assert(len(train_X_axis) == len(train_Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = "v",
                 x = train_X_axis,
                 y = train_Y_axis,
                 marker = dict(color = "#1abc9c"))],
    "layout": Layout(title = "<b>TRAIN tokens distribution</b>",
                     xaxis = dict(title = "<b>Tokens</b>", dtick = 1),
                     yaxis = dict(title = "<b>Frequency</b>"),
                     margin = Margin(b = 180)
                    )
})

logging.info("Train top tokens:\t{}".format(train_top_tokens))

INFO:root:Train top tokens:	[('the', 2768), ('i', 1904), ('you', 1844), ('food', 1663), ('thank', 1640), ('what', 1501), ('number', 1402), ('phone', 1370), ('restaurant', 1288), ('address', 1258), ('of', 1118)]


In [9]:
dev_X_axis = []
dev_Y_axis = []
dev_top_tokens = []
sorted_dict = sorted(dev_vocabulary.items(), key = itemgetter(1), reverse = True)

for token, token_count in sorted_dict:
    if len(dev_top_tokens) <= 10:
        dev_top_tokens.append((token, token_count))
    if token_count > 40:
        dev_X_axis.append(token)
        dev_Y_axis.append(token_count)

assert(len(dev_X_axis) == len(dev_Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = "v",
                 x = dev_X_axis,
                 y = dev_Y_axis,
                 marker = dict(color = "#3498db"))],
    "layout": Layout(title = "<b>DEV tokens distribution</b>",
                     xaxis = dict(title = "<b>Tokens</b>", dtick = 1),
                     yaxis = dict(title = "<b>Frequency</b>"),
                     margin = Margin(b = 180)
                    )
})


logging.info("Dev top tokens:\t{}".format(dev_top_tokens))

INFO:root:Dev top tokens:	[('the', 944), ('i', 701), ('you', 604), ('food', 572), ('thank', 523), ('what', 475), ('number', 423), ('restaurant', 416), ('phone', 414), ('of', 413), ('address', 388)]


In [10]:
test_X_axis = []
test_Y_axis = []
test_top_tokens = []
sorted_dict = sorted(test_vocabulary.items(), key = itemgetter(1), reverse = True)

for token, token_count in sorted_dict:
    if len(test_top_tokens) <= 10:
        test_top_tokens.append((token, token_count))
    if token_count > 70:
        test_X_axis.append(token)
        test_Y_axis.append(token_count)

assert(len(test_X_axis) == len(test_Y_axis)) 

plotly.offline.iplot({
    "data": [Bar(orientation = "v",
                 x = test_X_axis,
                 y = test_Y_axis,
                 marker = dict(color = "#9b59b6"))],
    "layout": Layout(title = "<b>TEST tokens distribution</b>",
                     xaxis = dict(title = "<b>Tokens</b>", dtick = 1),
                     yaxis = dict(title = "<b>Frequency</b>"),
                     margin = Margin(b = 180)
                    )
})


logging.info("Test top tokens:\t{}".format(test_top_tokens))

INFO:root:Test top tokens:	[('the', 2104), ('i', 1611), ('food', 1481), ('you', 1322), ('thank', 1167), ('what', 1067), ('restaurant', 1001), ('number', 981), ('phone', 956), ('of', 914), ('address', 883)]


In [11]:
train_dev_oov_tokens = []

for token, count in dev_vocabulary.items():
    if token not in train_vocabulary:
        if token not in train_dev_oov_tokens:
            train_dev_oov_tokens.append(token)

train_dev_oov_rate = len(train_dev_oov_tokens)/len(dev_vocabulary)

logging.info("Train-Dev OOV rate:\t{}\n".format(train_dev_oov_rate))
logging.info("Train-Dev OOV tokens:\t{}".format(train_dev_oov_tokens))

INFO:root:Train-Dev OOV rate:	0.21149425287356322

INFO:root:Train-Dev OOV tokens:	['allowed', 'quarter', 'venue', 'such', 'essentially', 'best', 'twenty', 'search', 'touch', 'move', 'typical', 'direct', 'comments', 'wo', 'noon', 'alone', 'americas', 'recognised', 'drink', 'exchange', 'wine', 'queen', 'nice', 'eclectic', 'third', 'found', 'worry', 'twelve', 'global', 'club', 'hold', 'drop', 'affordable', 'helpful', 'bridge', 'miles', 'available', 'ranges', 'bennys', 'connection', 'brb', 'goes', 'cow', "'re", 'two', 'point', 'mentioned', 'below', 'clowns', 'behind', 'cuisine', 'seen', 'answer', 'pitch', 'recommend', 'eleven', 'tight', 'children', "'ve", 'exact', 'theme', 'content', 'has', 'lasts', 'uno', 'come', 'piece', 'true', 'seem', 'millers', 'avenue', 'luck', 'around', 'g', 'zealand', 'intermediate', 'cars', 'meze', 'tour', 'tells', 'arrange', 'stores', 'info', 'still', 'shows', 'areas', 'meet', 'super', 'small', 'varsity', 'thing', 'signature']


In [12]:
train_test_oov_tokens = []

for token, count in test_vocabulary.items():
    if token not in train_vocabulary:
        if token not in train_test_oov_tokens:
            train_test_oov_tokens.append(token)

train_test_oov_rate = len(train_test_oov_tokens)/len(test_vocabulary)

logging.info("Train-Test OOV rate:\t{}\n".format(train_test_oov_rate))
logging.info("Train-Test OOV tokens:\t{}".format(train_test_oov_tokens))

INFO:root:Train-Test OOV rate:	0.29310344827586204

INFO:root:Train-Test OOV tokens:	['directly', 'tight', 'wo', 'fare', 'cow', 'benny', 'spend', 'includes', 'essentially', 'uno', 'quickly', 'two', 'excuse', 'make', 'suggestions', 'ho', 'eight', 'spent', 'spain', 'still', 'seen', 'up', 'county', "'re", 'p', 'such', 'meal', 'books', 'cocum', 'dear', 'wine', 'hong', 'third', 'hotel', 'storey', 'wow', 'advice', 'confirm', 'salad', 'comment', 'remember', 'h', 'nice', 'stops', 'useless', 'found', 'connect', 'cheapest', 'longer', 'kinds', 'itself', 'reference', 'tomorrow', 'continues', 'signature', 'thru', 'fill', 'kings', 'pool', 'union', 'twenty', 'has', 'movies', 'forty', 'helpful', 'hate', 'seem', 'names', 'pig', 'quick', 'green', 'straight', 'homerton', 'checking', 'beers', 'collecting', 'continue', 'nothing', 'feature', 'kettle', 'granta', 'selling', 'sheet', 'public', 'walk', 'charging', 'church', 'man', 'lucky', 'star', 'into', 'going', 'suggest', 'global', 'current', 'smallish', 'sp