In [1]:
from typing import List, Tuple, Any, Dict, Callable, Iterable
import json, os, sys
import pandas as pd
import numpy as np
import torch
import itertools
from collections import defaultdict, Counter
%matplotlib inline
import matplotlib.pyplot as plt

import datasets
import qanom
from qanom.annotations.common import read_annot_csv

# Put the parent of the current working directory on `sys.path` (only once),
# so that re-running this cell does not append duplicate entries.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

def plot_bar(labels, array1, *args):
    """Draw a bar chart of ``array1`` and, optionally, a second series on top.

    Parameters
    ----------
    labels : sequence
        Category labels placed along the x-axis.
    array1 : sequence
        Bar heights of the first series (drawn in matplotlib's default colour).
    *args : sequence, optional
        Extra series. Only the first one is drawn (in green, over ``array1``);
        any further positional arguments are ignored, as before. When no extra
        series is given, only ``array1`` is plotted instead of raising
        ``IndexError``.
    """
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    ax.bar(labels, array1)
    if args:
        # The second series is overlaid on the first, sharing the same x positions.
        ax.bar(labels, args[0], color='g')
    plt.show()

In [2]:
# Prepare Helper Function: Segregate evaluation by column value
# Load the "biu-nlp/qanom" dataset and materialise its "test" split as a DataFrame;
# the helper below passes a copy of it as the second argument of every evaluation call.
qanom_dataset = datasets.load_dataset("biu-nlp/qanom")
qanom_test_df = pd.DataFrame(qanom_dataset["test"])
# NOTE(review): `evaluation` looks like a project-local module, presumably importable
# because the first cell appends the parent directory to `sys.path` — confirm.
from evaluation import run_qanom_evaluation

def evaluate_precision_by_column(predictions_df, column_name, take=None):
    """Run the QANom evaluation separately for each distinct value of a column.

    Parameters
    ----------
    predictions_df : pd.DataFrame
        Predictions to evaluate; must contain ``column_name``.
    column_name : str
        Column whose distinct values define the sub-sets to evaluate.
    take : scalar or iterable, optional
        Restrict the evaluation to these values. A single scalar (e.g.
        ``take=True``) or a single string is treated as a one-element
        collection. ``None`` (default) evaluates every distinct value.

    Returns
    -------
    dict
        ``{value: (UA precision, LA precision)}``, with keys in the order the
        values first appear in ``predictions_df[column_name]``.
    """
    values = predictions_df[column_name].unique().tolist()
    if take is not None:
        # `set(True)` raises TypeError and `set("ab")` would split the string into
        # characters, so wrap scalars and strings before building the filter set.
        if isinstance(take, (str, bytes)) or not hasattr(take, "__iter__"):
            take = [take]
        wanted = set(take)
        # Filter the list (rather than intersecting sets) to keep a stable order.
        values = [val for val in values if val in wanted]
    eval_per_val = {}  # {value : (UA precision, LA precision)}
    for val in values:
        part_pred_df = predictions_df[predictions_df[column_name] == val].copy()
        print(f"Evaluating for {column_name} == '{val}': (Notice that recall might be un-informative!)")
        # Copies are passed on both sides so the evaluation cannot mutate the shared frames.
        eval_results = run_qanom_evaluation(part_pred_df, qanom_test_df.copy())
        print(eval_results[:2], "\n")
        eval_per_val[val] = (eval_results[0].prec(), eval_results[1].prec())
    return eval_per_val

No config specified, defaulting to: qanom/default
Reusing dataset qanom (/home/nlp/kleinay/.cache/huggingface/datasets/biu-nlp___qanom/default/1.1.0/44d54349c6d3f70e326208bf63485003c5410d38a6aae87eb80d74cf887627d0)


  0%|          | 0/3 [00:00<?, ?it/s]

## Scoring QA-level Confidence

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Load the seq2seq model and its tokenizer from the same checkpoint id.
model = AutoModelForSeq2SeqLM.from_pretrained("kleinay/qanom-seq2seq-model-joint")
tokenizer = AutoTokenizer.from_pretrained("kleinay/qanom-seq2seq-model-joint")

In [7]:
# Hand-written example input, tokenized to a PyTorch tensor of token ids.
input_seq = "parse: yesterday , you<extra_id_10> took<extra_id_10> my hand with your fork to ask me out .<extra_id_1> took"
input_ids = tokenizer(input_seq, return_tensors="pt").input_ids
# Decoder prefix consisting only of "<pad>" (no special tokens are added automatically).
decoder_input_ids = tokenizer("<pad>", add_special_tokens=False, return_tensors="pt").input_ids
# Alternative kept for reference: a longer decoder prefix that already contains a first QA.
# decoder_input_ids = tokenizer("<pad> when did someone take something _ _?<extra_id_7> yesterday</s>",  
#                               add_special_tokens=False, return_tensors="pt").input_ids

In [8]:
# NOTE(review): `branching_strategy` is assigned as a plain attribute; whether `generate`
# reads it for this model class is not visible here — confirm.
model.branching_strategy = "standard_beam_search"
# Beam search with 3 beams, returning all 3 hypotheses together with the per-step scores
# in a dict-like output object.
outputs = model.generate(input_ids, decoder_input_ids=decoder_input_ids, 
                         max_length=100,#len(decoder_input_ids[0]), 
                         output_scores=True,
                         num_beams=3,
                         num_return_sequences=3,
                         return_dict_in_generate=True)
# Earlier inspection calls, left disabled:
# decoded_output = tokenizer.decode(outputs.sequences[2])
# print(decoded_output)
# decoded_output = tokenizer.decode(outputs.sequences[1])
# print(decoded_output)
# Decode and print the second returned sequence (index 1).
decoded_output = tokenizer.decode(outputs.sequences[1])
print(decoded_output)

<pad> when did someone take something _ _?<extra_id_7> yesterday<extra_id_9> who _ _ took something _ _?<extra_id_7> you<extra_id_9> what did someone take _ _ _?<extra_id_7> my hand<extra_id_3> my hand with your fork<extra_id_9> why did someone take something _ _?<extra_id_7> to ask me out</s><pad><pad><pad><pad>


In [30]:
# Print the per-sequence scores returned by `generate`, then their exponentials.
# NOTE(review): presumably these are length-normalised log-probabilities, which is why
# `.exp()` is applied to read them as probabilities — confirm against the generation docs.
print(outputs.sequences_scores)
print(outputs.sequences_scores.exp())

tensor([-0.0379, -0.0514, -0.0531])
tensor([0.9628, 0.9499, 0.9483])


In [None]:
# Note: we only get `scores` for tokens that were generated during this decoding, excluding tokens
# that were given as input via `decoder_input_ids`. This can explain why "<pad>" is never included.
# Stack the tuple of per-step score tensors into one tensor, then swap its first two axes.
scores = torch.stack(outputs.scores)
scores = scores.transpose(0,1)
scores.shape

torch.Size([3, 74, 32101])

In [None]:
# Shape of the stacked beam-index entries recorded for the first returned sequence.
torch.stack(outputs.beam_indices[0]).shape

torch.Size([74])

In [9]:
# We should use this new function to get transition probabilities.
# NOTE(review): the saved output of this cell is a RuntimeError (tensor size mismatch,
# 6 vs 74 at dimension 1), so the prints below were never reached in the recorded run.
# TODO: resolve the mismatch before relying on `trs_bs`.
trs_bs = model.compute_transition_beam_scores(
    sequences=outputs.sequences,
    scores=outputs.scores, 
    beam_indices=outputs.beam_indices
)
# Following https://github.com/huggingface/transformers/issues/15869
# Compare the summed transition scores, raw and divided by the length of the first
# beam-index record, with the `sequences_scores` reported by `generate`.
print("Summ:", torch.sum(trs_bs, dim=1), "Expected:", outputs.sequences_scores)
print("Sum/length:", torch.sum(trs_bs, dim=1)/len(outputs.beam_indices[0]), "Expected:", outputs.sequences_scores)


RuntimeError: The size of tensor a (6) must match the size of tensor b (74) at non-singleton dimension 1

In [None]:
# Number of beam-index entries recorded for the first returned sequence.
# NOTE(review): the saved output here (63) differs from the stacked shape saved earlier
# (74), which suggests the cells were executed against different `outputs` — re-run in order.
len(outputs.beam_indices[0])

63

## Use Manual Posterior Computation

We have implemented a computation of the "sum of log probabilities" in `QASRLSeq2SeqModel.get_sequence_score`.

In this section I will use it and try to come up with a method that estimates QA confidence using it.

In [3]:
from transformers import AutoConfig, AutoTokenizer
import sys, importlib
# sys.path.append("..")
import seq2seq_model
# Reload so that edits to the `seq2seq_model` module are picked up without restarting the kernel.
importlib.reload(seq2seq_model)
from seq2seq_model import QASRLSeq2SeqModel

model_name_or_path = "kleinay/qanom-seq2seq-model-joint"
config = AutoConfig.from_pretrained(model_name_or_path)
# NOTE: the next two lines rebind `tokenizer` and `model`, which an earlier cell already
# defined; from here on `model` is a `QASRLSeq2SeqModel` instead of the
# `AutoModelForSeq2SeqLM` instance loaded before.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = QASRLSeq2SeqModel.from_pretrained(model_name_or_path,config=config)

### Compare to `sequences_scores`

Let's test whether it is similar to the `sequences_scores` returned from `generate`.


In [33]:

generated = model.generate(input_ids, max_length=120, return_dict_in_generate=True)
# `generated.sequences` is already a tensor: copy it with `.clone().detach()` instead of
# `torch.tensor(...)`, which emits a copy-construct UserWarning when given a tensor.
output_ids = generated.sequences.clone().detach()
# Score every returned sequence rather than hard-coding indices 0..2, so the cell also
# works when `generate` returns fewer (or more) than three sequences.
# The first token of each sequence is dropped before scoring.
for sequence_ids in output_ids:
    print(model.get_sequence_score(input_ids, sequence_ids[1:]))

  output_ids = torch.tensor(outputs.sequences)


tensor(0.9717)

In [34]:
# Manual scores of the first three sequences in `output_ids`; `[1:]` drops each
# sequence's first token (presumably the leading "<pad>" — confirm) before scoring.
print(model.get_sequence_score(input_ids, output_ids[0][1:]))
print(model.get_sequence_score(input_ids, output_ids[1][1:]))
print(model.get_sequence_score(input_ids, output_ids[2][1:]))

tensor(0.9717)
tensor(0.9618)
tensor(0.9594)


In [26]:
# Display the raw token ids of the sequences being scored.
output_ids

tensor([[    0,   116,   410,   841,   240,   424,     3,   834,     3,   834,
             3,    58, 32092,  4981, 32090,   113,     3,   834,     3,   834,
           808,   424,     3,   834,     3,   834,     3,    58, 32092,    25,
         32090,   125,   410,   841,   240,     3,   834,     3,   834,     3,
           834,     3,    58, 32092,    82,   609,    28,    39,    21,   157,
         32090,   572,   410,   841,   240,   424,     3,   834,     3,   834,
             3,    58, 32092,    12,   987,   140,    91,     1,     0,     0,
             0,     0,     0,     0,     0],
        [    0,   116,   410,   841,   240,   424,     3,   834,     3,   834,
             3,    58, 32092,  4981, 32090,   113,     3,   834,     3,   834,
           808,   424,     3,   834,     3,   834,     3,    58, 32092,    25,
         32090,   125,   410,   841,   240,     3,   834,     3,   834,     3,
           834,     3,    58, 32092,    82,   609, 32096,    82,   609,    28,
       

### Evaluating QA confidence

How should we assess different confidence metrics?

* We can compute correlations (T test?) of confidence with accuracy (precision)
* In [this paper](https://aclanthology.org/W19-8671.pdf), also about confidence in seq2seq generation, they measure the percentage of errors found within the 20%/10% of test samples that receive the lowest confidence scores. E.g., for MT, the 10% least-confident samples contain 17.66% of the errors (they consider this a positive result). In ASR, 10% -> 23.3% of errors.

In [6]:
# we will write a generic evaluation function that gets a DataFrame and adds a "confidence" column 
# by a certain `confidence_func`. Then, we will run evaluation on lowest 20% to see if precision is significantly lower.
# `confidence_func` would take an instance-level DataFrame with QAs of a single predicate.

def evaluate_confidence(predictions_df: pd.DataFrame, 
                        confidence_func: Callable[[pd.DataFrame], pd.DataFrame],
                        percentile: float = 20,
                        n_buckets: int = 4):
    """Assess a confidence function by relating its scores to precision.

    Applies ``confidence_func`` to the QAs of every predicate (rows grouped by
    ``qasrl_id`` and ``verb_idx``), then:

    1. evaluates precision on the QAs whose confidence is at or below the
       ``percentile``-th percentile, and
    2. splits all QAs into ``n_buckets`` confidence buckets at evenly spaced
       percentiles, evaluates each bucket and plots UA / LA precision per bucket.

    Parameters
    ----------
    predictions_df : pd.DataFrame
        Predictions with (at least) ``qasrl_id`` and ``verb_idx`` columns.
    confidence_func : Callable[[pd.DataFrame], pd.DataFrame]
        Takes the QAs of a single predicate and returns the same rows with an
        added ``"confidence"`` column.
    percentile : float, default 20
        Percentile of the confidence distribution used as the low-confidence threshold.
    n_buckets : int, default 4
        Number of confidence buckets (4 = quartiles).

    Raises
    ------
    ValueError
        If ``n_buckets`` is smaller than 1 or ``predictions_df`` yields no groups.
    """
    if n_buckets < 1:
        raise ValueError(f"n_buckets must be >= 1, got {n_buckets}")
    instance_dfs_with_confidence = [
        confidence_func(instance_df)
        for _, instance_df in predictions_df.groupby(['qasrl_id', 'verb_idx'])
    ]
    if not instance_dfs_with_confidence:
        raise ValueError("predictions_df contains no (qasrl_id, verb_idx) groups to evaluate")
    # df has the rows of `predictions_df` plus the new "confidence" column.
    df = pd.concat(instance_dfs_with_confidence, ignore_index=True)

    # Mark least-confidence QAs
    confidence_array = df["confidence"].to_numpy()
    threshold_confidence = np.percentile(confidence_array, percentile)
    df["is_low_confidence"] = df["confidence"] <= threshold_confidence
    # Evaluate only the low-confidence subset. `take` must be a collection:
    # a bare `True` is not iterable and breaks the helper's `set(take)`.
    evaluate_precision_by_column(df, "is_low_confidence", take=[True])

    # More comprehensive assessment of the confidence function:
    # split the data into same-size buckets (e.g. `n_buckets=4` for quartiles).
    # Evenly spaced inner boundaries; unlike `int(100 / n_buckets)` steps this yields
    # exactly `n_buckets - 1` boundaries for any bucket count.
    percentages = [100 * k / n_buckets for k in range(1, n_buckets)]
    percentiles = np.percentile(confidence_array, percentages)
    # digitize maps each confidence to a bucket index in 0 .. n_buckets-1.
    df["confidence_bucket"] = np.digitize(confidence_array, percentiles)
    prec_by_confidence_bucket = evaluate_precision_by_column(df, "confidence_bucket")

    # Plot. Heavy ties in the confidence values can leave a bucket empty, in which case
    # it has no entry in the results; skip it instead of failing with KeyError.
    upper_bounds = percentages + [100]
    present = [buck for buck in range(n_buckets) if buck in prec_by_confidence_bucket]
    xs = [upper_bounds[buck] for buck in present]
    uas = [prec_by_confidence_bucket[buck][0] for buck in present]
    las = [prec_by_confidence_bucket[buck][1] for buck in present]
    fig, ax = plt.subplots()
    ax.bar(xs, uas, color='b', label="UA precision")
    ax.bar(xs, las, color='g', label="LA precision")
    ax.set_xlabel("Confidence bucket (upper percentile bound)")
    ax.set_ylabel("Precision")
    ax.legend()
    plt.show()
    



### Confidence Functions

Each should take an instance-level DataFrame with the QAs of a single predicate,
and return the same DataFrame with an additional "confidence" column.

	Ideas for confidence computations:
		1. Posterior Probability baselines:
			a) Min of QA tokens
			b) Mean of QA tokens
		2. Get score when feeding the QA to decoder as standalone sequence
		3. A baseline - score-diff: score(sequence) - score(sequence \ {QA})
		4. Mean of "score-diff" for all permutations
		5. Score of "generate another QA" decision - p(first token of QA) / p(EOS @ first token of QA)


In [None]:
from pipeline import get_markers_for_model
# Marker object for the T5 variant of the model; the function below reads its
# `separator_output_*` attributes to rebuild the model's output string.
special_tokens = get_markers_for_model(is_t5_model=True)
def agg_of_QA_token_posteriors(qa_df: pd.DataFrame, agg_func) -> pd.DataFrame:
    """Work-in-progress confidence function based on per-token posteriors.

    Currently this only rebuilds the output-sequence string for the QAs in
    ``qa_df`` (from its ``raw_question`` and ``answer`` columns, wrapped in the
    tokenizer's pad and EOS tokens). The posterior computation and the per-QA
    split are still TODO, so ``agg_func`` is unused and the function
    implicitly returns ``None``.
    """
    # Rebuild the original output sequence: multiple answers are stored joined
    # by "~!~" and need the answer separator marker instead.
    answers = [raw_answer.replace("~!~", special_tokens.separator_output_answers)
               for raw_answer in qa_df.answer]
    raw_questions = qa_df.raw_question.tolist()
    qa_chunks = []
    for question, answer in zip(raw_questions, answers):
        qa_chunks.append(f"{question}{special_tokens.separator_output_question_answer}{answer}")
    output_seq = special_tokens.separator_output_pairs.join(qa_chunks)
    output_seq = tokenizer.pad_token + output_seq + tokenizer.eos_token
    # TODO: get token posteriors for the whole sequence

    # TODO: split by QA

In [19]:
# test
# Load the parsed and the raw generated predictions saved in the model directory.
# NOTE(review): `model_dir` is a hard-coded relative path — it resolves against the
# kernel's working directory.
model_dir = "../trained_models/t5_qanom-joint-23.03.22"
prediction_df = pd.read_csv(model_dir + "/generated_predictions.csv")
raw_prediction_df = pd.read_csv(model_dir + "/raw_generated_predictions.csv")

# Counter([len(a.split("~!~")) for a in prediction_df.answer])
# Display the raw predictions as the cell output.
raw_prediction_df

Unnamed: 0,input,predicted output,gold output
0,parse: -LRB- -LRB- WN -RRB- -RRB- How often do...,<pad> who _ _ gets something _ _?<extra_id_7> ...,who _ _ gets _ _ somewhere?<extra_id_7> the Gl...
1,parse: -LRB- -LRB- WN -RRB- -RRB- I was<extra_...,<pad> who was _ looking _ _ somewhere?<extra_i...,who was _ looking _ around _?<extra_id_7> I<ex...
2,parse: -LRB- -LRB- WN -RRB- -RRB- That<extra_i...,<pad> what _ _ brings someone _ somewhere?<ext...,what _ _ brings someone to something?<extra_id...
3,parse: -LRB- -LRB- Wikinews -RRB- -RRB- When a...,<pad> who _ _ started something _ _?<extra_id_...,who _ _ started something _ _?<extra_id_7> Duc...
4,parse: -LSB-... -RSB- She may have<extra_id_10...,<pad> who might _ divided something _ _?<extra...,who _ _ divided something _ _?<extra_id_7> She...
...,...,...,...
995,parse: With the three-cylinder compound arrang...,<pad> what was _ set _ _ _?<extra_id_7> the LP...,where was something set _ at something?<extra_...
996,parse: With two-cylinder compounds<extra_id_10...,<pad> what is _ used _ _ _?<extra_id_7> two-cy...,what is _ used _ in something?<extra_id_7> two...
997,parse: Younger women today are far more likely...,<pad> who _ _ completed something _ _?<extra_i...,who might _ completed something _ _?<extra_id_...
998,parse:`` Benjamin Franklin... urged Voltaire t...,<pad> who _ _ agreed _ _ _?<extra_id_7> Voltai...,why did someone agree _ to something?<extra_id...
