# Baselines

In [1]:
from collections import Counter

import numpy as np
import sklearn
from sklearn import metrics
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

import sys
sys.path.append('..')

from content import SELECTED_TOKENS
from icr.dataloader import CodrawData
from icr.config import args, LABEL_MAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the installed scikit-learn version so the results below can be tied to it.
sklearn.__version__

'1.1.2'

Load datasets for the drawer and the teller:

In [3]:
# Dataset splits that are loaded for each task.
SPLITS = ('train', 'val', 'test')
# Value stored under 'icr' in LABEL_MAP; used below as the column index into predict_proba outputs.
ICR_LABEL = LABEL_MAP['icr']

# Iteration cap passed as max_iter to the LogisticRegression models of the two trivial baselines.
MAX_ITER = 1000
# Minimum count in the training set for a token to enter the bag-of-words vocabulary.
MIN_FREQ = 2
# Vocabulary entry that stands in for tokens that are not in the vocabulary.
UNK_TOKEN = '<UNK>'

In [4]:
# Scores collected by the cells below, keyed by (task, baseline, split) tuples;
# each value is the dict returned by evaluate().
results = dict()

In [5]:
# Replace the kernel's own command-line arguments before building the config
# (presumably args() parses sys.argv -- TODO confirm).
sys.argv = "-empty".split()
params = args()

# Prefix every configured path with '.', so that the paths resolve from this
# notebook's directory (assumes the configured paths are relative -- TODO confirm).
for path_attr in (
    'path_to_codraw',
    'path_to_annotation',
    'path_to_preprocessed_imgs',
    'path_to_preprocessed_texts',
):
    setattr(params, path_attr, '.' + getattr(params, path_attr))


def _load_all_splits(task):
    """Set ``params.task`` to ``task`` and load one CodrawData per split in SPLITS."""
    params.task = task
    return {split: CodrawData(split, params) for split in SPLITS}


# Same loading order as before: all drawer splits first, then all teller splits.
datasets_drawer = _load_all_splits('drawer')
datasets_teller = _load_all_splits('teller')

------------------------------
 Loaded train set with:
   62067 datapoints
   7989 dialogues
   7016 (11.30%) clarifications
   55051 (88.70%) other 
------------------------------
------------------------------
 Loaded val set with:
   7714 datapoints
   1002 dialogues
   920 (11.93%) clarifications
   6794 (88.07%) other 
------------------------------
------------------------------
 Loaded test set with:
   7721 datapoints
   1002 dialogues
   871 (11.28%) clarifications
   6850 (88.72%) other 
------------------------------
------------------------------
 Loaded train set with:
   62067 datapoints
   7989 dialogues
   7016 (11.30%) clarifications
   55051 (88.70%) other 
------------------------------
------------------------------
 Loaded val set with:
   7714 datapoints
   1002 dialogues
   920 (11.93%) clarifications
   6794 (88.07%) other 
------------------------------
------------------------------
 Loaded test set with:
   7721 datapoints
   1002 dialogues
   871 (11.28%) cl

Auxiliary functions to make the predictions and compute the two evaluation metrics:

In [6]:
def predict(model, X):
    """Run ``model`` on ``X``.

    Returns a tuple ``(predictions, probs)`` holding the results of
    ``model.predict(X)`` and ``model.predict_proba(X)``, in that order.
    """
    return model.predict(X), model.predict_proba(X)
    

def evaluate(target, predictions, probs_icr):
    """Compute the two evaluation metrics used throughout the notebook.

    Args:
        target: gold labels.
        predictions: hard predictions, scored with macro-averaged F1.
        probs_icr: scores for the iCR class, scored with average precision.

    Returns:
        A dict with the keys 'macro_f1' and 'avp'.
    """
    return {
        'macro_f1': metrics.f1_score(target, predictions, average='macro'),
        'avp': metrics.average_precision_score(target, probs_icr),
    }

## Trivial Baseline 1: Utterance features as input

We train a Logistic Regression model on simple handcrafted features and use it as the first trivial baseline.

### Drawer

For the drawer, we use the length of the teller's utterance and a binary BOW representation.

In [7]:
def build_vec(utterance, vocab):
    """Build a binary bag-of-words vector for ``utterance``.

    Args:
        utterance: iterable of tokens.
        vocab: mapping from token to position. Tokens missing from it are
            mapped to the position of UNK_TOKEN, which must then be in ``vocab``.

    Returns:
        A list of ``len(vocab)`` ints: 1 at every position whose token occurs
        in ``utterance``, 0 elsewhere. Positions outside ``range(len(vocab))``
        never produce a 1.
    """
    # A set makes each membership test below O(1); with a list, every call
    # cost O(len(vocab) * len(utterance)).
    active_positions = {
        vocab[word] if word in vocab else vocab[UNK_TOKEN] for word in utterance
    }
    return [1 if position in active_positions else 0 for position in range(len(vocab))]

def build_features_drawer(dataset, vocab):
    """Build the handcrafted drawer features for every datapoint of ``dataset``.

    Each row is the number of tokens in the teller's utterance at the
    datapoint's turn, followed by the binary BOW vector from ``build_vec``.

    Args:
        dataset: dataset whose ``task`` is 'drawer'.
        vocab: token-to-position mapping handed to ``build_vec``.

    Returns:
        ``(X, Y)`` as numpy arrays: the feature rows and the labels (the last
        element of each dataset item).
    """
    assert dataset.task == 'drawer'
    rows, labels = [], []
    for idx, (dialogue_id, turn) in dataset.datapoints.items():
        # only the label (last element of the item) is needed here
        *_, label = dataset[idx]
        teller_message = dataset.games[dialogue_id].dialogue.teller_turns[turn]
        tokens = teller_message.split()
        row = [len(tokens)]
        row.extend(build_vec(tokens, vocab))
        rows.append(row)
        labels.append(label)
    return np.array(rows), np.array(labels)

Create the vocabulary from the training set. We include only tokens that occur at least two times:

In [8]:
# Count every whitespace-separated token of every teller turn in the training games.
counter_teller_train = Counter(
    token
    for game in datasets_drawer['train'].games.values()
    for turn in game.dialogue.teller_turns
    for token in turn.split()
)

# The unknown-token placeholder must not collide with a real token.
assert UNK_TOKEN not in counter_teller_train

# Keep the tokens seen at least MIN_FREQ times; the placeholder takes the last position.
vocab_teller_train = [
    word for word, count in counter_teller_train.items() if count >= MIN_FREQ
]
vocab_teller_train.append(UNK_TOKEN)
word2id = dict(zip(vocab_teller_train, range(len(vocab_teller_train))))

In [9]:
# Handcrafted drawer features for the three splits, built in train/val/test order.
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = [
    build_features_drawer(datasets_drawer[split_name], word2id)
    for split_name in ('train', 'val', 'test')
]

In [10]:
# NOTE(review): the saved output of this cell shows the solver stopping at the
# iteration limit (ConvergenceWarning), so the scores come from a fit that did not
# converge. Scaling the length feature or raising MAX_ITER would address it, but
# would also change the reported numbers.
baseline = LogisticRegression(
    random_state=0, class_weight='balanced', max_iter=MAX_ITER
).fit(X_train, Y_train)

# Score the fitted model on validation first, then on test.
for split_name, X_eval, Y_eval in (('val', X_val, Y_val), ('test', X_test, Y_test)):
    predictions, probs = predict(baseline, X_eval)
    probs_icr = probs[:, ICR_LABEL]
    results[('drawer', 'features', split_name)] = dict(evaluate(Y_eval, predictions, probs_icr))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Show the drawer scores of the feature baseline.
for header, split_name in (('Validation: ', 'val'), ('Test:       ', 'test')):
    print(header, results[('drawer', 'features', split_name)])

Validation:  {'macro_f1': 0.5316873922453322, 'avp': 0.20662440606077712}
Test:        {'macro_f1': 0.5188309960070748, 'avp': 0.19581780263105944}


### Teller

For the teller, we use the length of the drawer's utterance and a flag for predefined content words:

In [12]:
def build_features_teller(dataset):
    """Build the handcrafted teller features for every datapoint of ``dataset``.

    Each row has two entries: the number of tokens in the drawer's utterance
    at the datapoint's turn, and a 0/1 flag that is 1 when at least one of
    those tokens is in SELECTED_TOKENS.

    Args:
        dataset: dataset whose ``task`` is 'teller'.

    Returns:
        ``(X, Y)`` as numpy arrays: the feature rows and the labels (the last
        element of each dataset item).
    """
    assert dataset.task == 'teller'
    rows = []
    labels = []
    for idx, (dialogue_id, turn) in dataset.datapoints.items():
        # only the label (last element of the item) is needed here
        *_, label = dataset[idx]
        drawer_message = dataset.games[dialogue_id].dialogue.drawer_turns[turn]
        tokens = drawer_message.split()
        mentions_content = int(not set(tokens).isdisjoint(SELECTED_TOKENS))
        rows.append([len(tokens), mentions_content])
        labels.append(label)
    return np.array(rows), np.array(labels)

In [13]:
# Handcrafted teller features for the three splits, built in train/val/test order.
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = [
    build_features_teller(datasets_teller[split_name])
    for split_name in ('train', 'val', 'test')
]

In [14]:
# Fit the feature baseline for the teller.
baseline = LogisticRegression(
    random_state=0, class_weight='balanced', max_iter=MAX_ITER
).fit(X_train, Y_train)

# Score the fitted model on validation first, then on test.
for split_name, X_eval, Y_eval in (('val', X_val, Y_val), ('test', X_test, Y_test)):
    predictions, probs = predict(baseline, X_eval)
    probs_icr = probs[:, ICR_LABEL]
    results[('teller', 'features', split_name)] = dict(evaluate(Y_eval, predictions, probs_icr))

In [15]:
# Show the teller scores of the feature baseline.
for header, split_name in (('Validation: ', 'val'), ('Test:       ', 'test')):
    print(header, results[('teller', 'features', split_name)])

Validation:  {'macro_f1': 0.8588035508320644, 'avp': 0.6874505958715378}
Test:        {'macro_f1': 0.8552709579197477, 'avp': 0.6874181339274107}


## Trivial Baseline 2: Real embeddings as input

We train a Logistic Regression model on the same input embeddings and use it as the second trivial baseline.

In [16]:
def build_input_vectors(dataset):
    """Concatenate the three input vectors of every datapoint into one matrix.

    Each dataset item is unpacked as ``(_, context, last_msg, img, label)``;
    the three vectors are converted with ``.tolist()`` and joined per row in
    the order context, last message, image.

    Returns:
        ``(X, Y)`` as numpy arrays: the concatenated representations (one row
        per datapoint) and the labels.
    """
    contexts, last_msgs, imgs, labels = [], [], [], []
    for idx, _pair in dataset.datapoints.items():
        _, context, last_msg, img, label = dataset[idx]
        last_msgs.append(last_msg.tolist())
        contexts.append(context.tolist())
        imgs.append(img.tolist())
        labels.append(label)
    # column order of the final matrix: context | last message | image
    parts = [np.array(part) for part in (contexts, last_msgs, imgs)]
    return np.concatenate(parts, axis=1), np.array(labels)

### Drawer

In [17]:
# Concatenated input embeddings of the drawer task, built in train/val/test order.
(X_train_all, Y_train), (X_val_all, Y_val), (X_test_all, Y_test) = [
    build_input_vectors(datasets_drawer[split_name])
    for split_name in ('train', 'val', 'test')
]

In [18]:
# NOTE(review): the saved output of this cell shows the solver stopping at the
# iteration limit (ConvergenceWarning), so the scores come from a fit that did not
# converge; verbose=1 also prints the full solver log into the notebook.
baseline = LogisticRegression(
    random_state=0, class_weight='balanced', max_iter=MAX_ITER, verbose=1
).fit(X_train_all, Y_train)

# Score the fitted model on validation first, then on test.
for split_name, X_eval, Y_eval in (('val', X_val_all, Y_val), ('test', X_test_all, Y_test)):
    predictions, probs = predict(baseline, X_eval)
    probs_icr = probs[:, ICR_LABEL]
    results[('drawer', 'representations', split_name)] = dict(evaluate(Y_eval, predictions, probs_icr))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3585     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.30216D+04    |proj g|=  9.49739D+02

At iterate   50    f=  3.66305D+04    |proj g|=  2.05091D+03

At iterate  100    f=  3.48539D+04    |proj g|=  1.53278D+03

At iterate  150    f=  3.40219D+04    |proj g|=  1.20084D+03

At iterate  200    f=  3.35152D+04    |proj g|=  7.00009D+02

At iterate  250    f=  3.29758D+04    |proj g|=  1.52217D+03

At iterate  300    f=  3.26114D+04    |proj g|=  1.12622D+02

At iterate  350    f=  3.23680D+04    |proj g|=  3.72251D+02

At iterate  400    f=  3.21307D+04    |proj g|=  1.67465D+02

At iterate  450    f=  3.19325D+04    |proj g|=  6.98250D+02

At iterate  500    f=  3.16736D+04    |proj g|=  1.08437D+02

At iterate  550    f=  3.15090D+04    |proj g|=  2.01151D+02

At iterate  600    f=  3.13805D+04    |proj g|=  3.41389D+02

At iterate  650    f=  3.1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.6min finished


In [19]:
# Show the drawer scores of the embedding baseline.
for header, split_name in (('Validation: ', 'val'), ('Test:       ', 'test')):
    print(header, results[('drawer', 'representations', split_name)])

Validation:  {'macro_f1': 0.5878799867882846, 'avp': 0.3247402864118646}
Test:        {'macro_f1': 0.5766599983865733, 'avp': 0.28715655344114893}


### Teller

In [20]:
# Concatenated input embeddings of the teller task, built in train/val/test order.
(X_train_all, Y_train), (X_val_all, Y_val), (X_test_all, Y_test) = [
    build_input_vectors(datasets_teller[split_name])
    for split_name in ('train', 'val', 'test')
]

In [21]:
# Fit the embedding baseline for the teller (verbose=1 prints the solver log).
baseline = LogisticRegression(
    random_state=0, class_weight='balanced', max_iter=MAX_ITER, verbose=1
).fit(X_train_all, Y_train)

# Score the fitted model on validation first, then on test.
for split_name, X_eval, Y_eval in (('val', X_val_all, Y_val), ('test', X_test_all, Y_test)):
    predictions, probs = predict(baseline, X_eval)
    probs_icr = probs[:, ICR_LABEL]
    results[('teller', 'representations', split_name)] = dict(evaluate(Y_eval, predictions, probs_icr))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3585     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  4.30216D+04    |proj g|=  1.61178D+03

At iterate   50    f=  3.90926D+03    |proj g|=  4.56677D+01

At iterate  100    f=  3.22936D+03    |proj g|=  2.71992D+02

At iterate  150    f=  2.72263D+03    |proj g|=  1.11848D+02

At iterate  200    f=  2.56824D+03    |proj g|=  1.03741D+01

At iterate  250    f=  2.50801D+03    |proj g|=  1.93262D+01

At iterate  300    f=  2.49093D+03    |proj g|=  1.12515D+01

At iterate  350    f=  2.48535D+03    |proj g|=  1.65014D+01

At iterate  400    f=  2.48221D+03    |proj g|=  3.20542D+00

At iterate  450    f=  2.48051D+03    |proj g|=  1.43569D+01

At iterate  500    f=  2.47924D+03    |proj g|=  9.70877D+00

At iterate  550    f=  2.47889D+03    |proj g|=  5.12401D-01

At iterate  600    f=  2.47872D+03    |proj g|=  8.72673D-01

At iterate  650    f=  2.4

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min finished


In [22]:
# Show the teller scores of the embedding baseline.
for header, split_name in (('Validation: ', 'val'), ('Test:       ', 'test')):
    print(header, results[('teller', 'representations', split_name)])

Validation:  {'macro_f1': 0.9627892439474388, 'avp': 0.9840569088144125}
Test:        {'macro_f1': 0.9615280590549525, 'avp': 0.9780352120179174}


## Random Baseline

### Drawer

In [23]:
# Rebuild the drawer inputs: the X_*_all / Y_* names currently hold the teller data.
(X_train_all, Y_train), (X_val_all, Y_val), (X_test_all, Y_test) = [
    build_input_vectors(datasets_drawer[split_name])
    for split_name in ('train', 'val', 'test')
]

In [24]:
# Random baseline with the "stratified" strategy, fitted on the drawer training data.
random_baseline = DummyClassifier(strategy="stratified", random_state=123).fit(X_train_all, Y_train)

# Score it on validation first, then on test.
for split_name, X_eval, Y_eval in (('val', X_val_all, Y_val), ('test', X_test_all, Y_test)):
    predictions, probs = predict(random_baseline, X_eval)
    probs_icr = probs[:, ICR_LABEL]
    results[('drawer', 'random', split_name)] = dict(evaluate(Y_eval, predictions, probs_icr))

In [25]:
# Show the drawer scores of the random baseline.
for header, split_name in (('Validation: ', 'val'), ('Test:       ', 'test')):
    print(header, results[('drawer', 'random', split_name)])

Validation:  {'macro_f1': 0.4890836492828434, 'avp': 0.117471253645027}
Test:        {'macro_f1': 0.5036599508928662, 'avp': 0.11357712468378736}


### Teller

In [26]:
# Rebuild the teller inputs: the X_*_all / Y_* names currently hold the drawer data.
(X_train_all, Y_train), (X_val_all, Y_val), (X_test_all, Y_test) = [
    build_input_vectors(datasets_teller[split_name])
    for split_name in ('train', 'val', 'test')
]

In [27]:
# Random baseline with the "stratified" strategy, fitted on the teller training data.
random_baseline = DummyClassifier(strategy="stratified", random_state=123).fit(X_train_all, Y_train)

# Score it on validation first, then on test.
for split_name, X_eval, Y_eval in (('val', X_val_all, Y_val), ('test', X_test_all, Y_test)):
    predictions, probs = predict(random_baseline, X_eval)
    probs_icr = probs[:, ICR_LABEL]
    results[('teller', 'random', split_name)] = dict(evaluate(Y_eval, predictions, probs_icr))

In [28]:
# Show the teller scores of the random baseline.
for header, split_name in (('Validation: ', 'val'), ('Test:       ', 'test')):
    print(header, results[('teller', 'random', split_name)])

Validation:  {'macro_f1': 0.4890836492828434, 'avp': 0.117471253645027}
Test:        {'macro_f1': 0.5036599508928662, 'avp': 0.11357712468378736}


## Print table for the paper

In [29]:
# One LaTeX row per (baseline, split): drawer avp and macro-F1, then teller avp and macro-F1.
# NOTE(review): the loop variable `baseline` rebinds the name that held the fitted
# model in earlier cells; later cells assign `baseline` again before using it.
table_pieces = []

for baseline in ('random', 'features', 'representations'):
    for split in ('val', 'test'):
        outputs = results[('drawer', baseline, split)]
        table_pieces.append(f'{baseline} & {split} & {outputs["avp"]:.5f} & {outputs["macro_f1"]:.5f}')
        outputs = results[('teller', baseline, split)]
        table_pieces.append(f' & {outputs["avp"]:.5f} & {outputs["macro_f1"]:.5f} \\\\ \n')

latex_table = ''.join(table_pieces)
print(latex_table)

random & val & 0.11747 & 0.48908 & 0.11747 & 0.48908 \\ 
random & test & 0.11358 & 0.50366 & 0.11358 & 0.50366 \\ 
features & val & 0.20662 & 0.53169 & 0.68745 & 0.85880 \\ 
features & test & 0.19582 & 0.51883 & 0.68742 & 0.85527 \\ 
representations & val & 0.32474 & 0.58788 & 0.98406 & 0.96279 \\ 
representations & test & 0.28716 & 0.57666 & 0.97804 & 0.96153 \\ 



## Pretrained dialogue embeddings

Can the last embedding before the peek action predict whether a CR occurred?

We have not computed the embedding after the last drawer's message, because it's a context we never use in training. So we either just use the very last context embedding we have (i.e., after the last teller's message), or we try to get the one at the peek action (but then not all dialogues have a peek action).

**Version 1**: use the very last embedding we have. Here, we have to exclude rare cases where the only iCR is exactly at the last turn, because then the embedding does not include it.

In [30]:
def build_last_context(dataset):
    """Pair the last available context embedding of each game with a has-iCR flag.

    Games whose only iCR is in the very last turn are skipped (and reported
    with a printed message), because the embedding used here is taken before
    the drawer's message of that turn and therefore cannot reflect that iCR.

    Returns:
        ``(X, Y)`` as numpy arrays: one context embedding per kept game, and
        1/0 depending on whether the game's ``icr_turns`` is non-empty.
    """
    embeddings, flags = [], []
    for idx, game in dataset.games.items():
        icr_turns = game.dialogue.icr_turns
        final_turn = game.n_turns - 1

        # Corner case: the single iCR of the dialogue sits in the final turn.
        # The representation after the last drawer's message does not exist, so
        # a correct prediction is not possible; the game is left out.
        if icr_turns == [final_turn]:
            print(f'Excluded game {idx}!')
            continue

        # [-1] is the last turn; position 1 is the state after the teller's
        # utterance but before the drawer's utterance.
        embeddings.append(dataset.embs.contexts[idx][-1][1])
        flags.append(int(bool(icr_turns)))
    return np.array(embeddings), np.array(flags)

In [31]:
# Version 1 inputs for the three splits, built in train/val/test order.
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = [
    build_last_context(datasets_drawer[split_name])
    for split_name in ('train', 'val', 'test')
]

Excluded game 6257!
Excluded game 7395!
Excluded game 3911!
Excluded game 5455!
Excluded game 3580!
Excluded game 7403!
Excluded game 4045!
Excluded game 2320!
Excluded game 623!
Excluded game 6554!
Excluded game 3574!
Excluded game 1652!
Excluded game 2326!
Excluded game 8806!
Excluded game 1700!
Excluded game 2930!
Excluded game 2276!
Excluded game 5066!
Excluded game 3868!
Excluded game 2658!
Excluded game 4638!
Excluded game 8168!
Excluded game 2059!
Excluded game 2779!


In [32]:
# Version 1: logistic regression on the last available context embedding.
# max_iter is left at the library default here (MAX_ITER is not passed).
baseline = LogisticRegression(random_state=0, class_weight='balanced').fit(X_train, Y_train)

# Score and print validation first, then test.
for header, X_eval, Y_eval in (('Validation: ', X_val, Y_val), ('Test:       ', X_test, Y_test)):
    predictions, probs = predict(baseline, X_eval)
    probs_icr = probs[:, ICR_LABEL]
    print(header, dict(evaluate(Y_eval, predictions, probs_icr)))

Validation:  {'macro_f1': 0.8374564168956692, 'avp': 0.8912296111538317}
Test:        {'macro_f1': 0.8408753294137334, 'avp': 0.8908863103864789}


**Version 2 (used in the paper)**: We exclude dialogues that do not contain a peek action, so that we can get the representation containing all utterances until the peek (i.e., the context at position 0 at peek turn, which includes the last drawer's utterance).

In [33]:
def build_context_at_peek(dataset):
    """Pair the context embedding at the peek turn with a has-iCR-before-peek flag.

    Games without a peek action (``peek_turn is None``) are skipped.

    Returns:
        ``(X, Y)`` as numpy arrays: the context at position 0 of the peek turn
        for each kept game, and 1/0 depending on whether the game's
        ``icr_turns_before_peek`` is non-empty.
    """
    embeddings, flags = [], []
    for idx, game in dataset.games.items():
        peek = game.peek_turn
        if peek is None:
            # no peek action in this dialogue
            continue

        flags.append(int(bool(game.dialogue.icr_turns_before_peek)))
        # Position 0 at the peek turn holds the state of the full dialogue up
        # to the last turn before the peek.
        embeddings.append(dataset.embs.contexts[idx][peek][0])

    return np.array(embeddings), np.array(flags)

In [34]:
# Version 2 inputs for the three splits, built in train/val/test order.
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = [
    build_context_at_peek(datasets_drawer[split_name])
    for split_name in ('train', 'val', 'test')
]

In [35]:
# Version 2: logistic regression on the context embedding at the peek turn.
# max_iter is left at the library default here (MAX_ITER is not passed).
baseline = LogisticRegression(random_state=0, class_weight='balanced').fit(X_train, Y_train)

# Score and print validation first, then test.
for header, X_eval, Y_eval in (('Validation: ', X_val, Y_val), ('Test:       ', X_test, Y_test)):
    predictions, probs = predict(baseline, X_eval)
    probs_icr = probs[:, ICR_LABEL]
    print(header, dict(evaluate(Y_eval, predictions, probs_icr)))

Validation:  {'macro_f1': 0.860035136384651, 'avp': 0.9156948829431442}
Test:        {'macro_f1': 0.8569327484838951, 'avp': 0.9037990527661058}


**Version 3**: we use the very last embedding we have before the peek or, if there is no peek, then the last embedding we have for the dialogue. Similar to version 1, we are not including the very last drawer's utterance.

In [36]:
def build_last_context_mixed(dataset):
    """Pair the last context before the peek (or of the dialogue) with a has-iCR flag.

    For games with a peek action the turn right before the peek is used; for
    games without one, the last turn of the dialogue. Games whose only iCR
    before the peek is exactly in that turn are skipped (and reported with a
    printed message), because the embedding used here is taken before the
    drawer's message of that turn.

    Returns:
        ``(X, Y)`` as numpy arrays: the context at position 1 of the chosen
        turn for each kept game, and 1/0 depending on whether the game's
        ``icr_turns_before_peek`` is non-empty.
    """
    embeddings, flags = [], []

    for idx, game in dataset.games.items():
        peek = game.peek_turn
        if peek is None:
            # without a peek, "before peek" must cover the whole dialogue
            assert game.dialogue.icr_turns == game.dialogue.icr_turns_before_peek
            turn = game.n_turns - 1
        else:
            # NOTE(review): a peek_turn of 0 would give turn == -1, which indexes
            # the LAST context below -- confirm that peek_turn is never 0.
            turn = peek - 1

        icrs_before_peek = game.dialogue.icr_turns_before_peek

        # Corner case: the single iCR before the peek sits in the chosen turn.
        # The representation after the last drawer's message does not exist, so
        # a correct prediction is not possible; the game is left out.
        if icrs_before_peek == [turn]:
            print(f'Excluded game {idx}!')
            continue

        # Position 1 is the state after the teller's utterance but before the
        # drawer's utterance of the chosen turn.
        embeddings.append(dataset.embs.contexts[idx][turn][1])
        flags.append(int(bool(icrs_before_peek)))

    return np.array(embeddings), np.array(flags)

In [37]:
# Version 3 inputs for the three splits, built in train/val/test order.
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = [
    build_last_context_mixed(datasets_drawer[split_name])
    for split_name in ('train', 'val', 'test')
]

Excluded game 782!
Excluded game 9367!
Excluded game 3220!
Excluded game 84!
Excluded game 7236!
Excluded game 2655!
Excluded game 8273!
Excluded game 9686!
Excluded game 7547!
Excluded game 5675!
Excluded game 5184!
Excluded game 4243!
Excluded game 3580!
Excluded game 4045!
Excluded game 4117!
Excluded game 7834!
Excluded game 4624!
Excluded game 1881!
Excluded game 6680!
Excluded game 9884!
Excluded game 2927!
Excluded game 3367!
Excluded game 5751!
Excluded game 8154!
Excluded game 1122!
Excluded game 7871!
Excluded game 6344!
Excluded game 1174!
Excluded game 3005!
Excluded game 465!
Excluded game 3385!
Excluded game 5734!
Excluded game 5242!
Excluded game 2577!
Excluded game 2087!
Excluded game 8934!
Excluded game 6556!
Excluded game 8307!
Excluded game 7453!
Excluded game 8530!
Excluded game 7431!
Excluded game 7360!
Excluded game 737!
Excluded game 9174!
Excluded game 1406!
Excluded game 1404!
Excluded game 5804!
Excluded game 3601!
Excluded game 3026!
Excluded game 5516!
Exclu

In [38]:
# Version 3: logistic regression on the last context before the peek (or of the dialogue).
# max_iter is left at the library default here (MAX_ITER is not passed).
baseline = LogisticRegression(random_state=0, class_weight='balanced').fit(X_train, Y_train)

# Score and print validation first, then test.
for header, X_eval, Y_eval in (('Validation: ', X_val, Y_val), ('Test:       ', X_test, Y_test)):
    predictions, probs = predict(baseline, X_eval)
    probs_icr = probs[:, ICR_LABEL]
    print(header, dict(evaluate(Y_eval, predictions, probs_icr)))

Validation:  {'macro_f1': 0.8683673106113321, 'avp': 0.9190822757528802}
Test:        {'macro_f1': 0.8527688899827714, 'avp': 0.9099675442767706}
