In [None]:
# Configure the IPython shell's `ast_node_interactivity` to "all" so that cells
# containing several bare expressions (e.g. `df.shape` followed by `df[:3]`)
# are not limited to showing only the final one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np

# Do not truncate DataFrame previews (None = no limit on columns/rows shown).
# The fully-qualified "display.*" names are required: the abbreviated patterns
# 'max_columns'/'max_rows' also match the "styler.render.*" options in
# pandas >= 1.4, which makes set_option raise OptionError (ambiguous pattern).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
from os.path import exists

<a id="anchor_metafields"></a>
# Meta-fields

<a id="anchor_metafields_dataseta"></a>
## [DatasetA](#anchor_dataseta) fields

In [None]:
### The path to the full dataset with all relevant fields
### (relative path; it is passed as-is to pd.read_csv in the DatasetA section)
DATASETA_PATH = "data-compas/compas-scores-two-years.csv"

### The decision source to explain, needs to be 0/1 split
### Must be one of the keys of the `decisions_ref` dispatch table
### ("2YCOMPAS" or "RACIST") built in the DatasetA section.
DATASETA_DECISION = "2YCOMPAS" # Original dataset "low-risk/high-risk" threshold split
# NOTE(review): the "2YCOMPAS" handler (dataseta_decision_compas) returns the
# "is_recid" column, not a thresholded risk score -- confirm that this matches
# the "low-risk/high-risk" description above.
# DATASETA_DECISION = "RACIST" # Explicitly judging by race group only, with the same % low/high split
# [TODO] "RACIST" with the same accuracy?

### Name of column to use for decision made
### (this column is added to the filtered dataset; it is not read from the CSV)
DATASETA_DECISION_COLNAME = "ncol_decision"

### Name of existing column to use in identification
DATASETA_ID_COLNAME = "id"

### Seed to use in 'ref'(reference,train)/'evl'(evaluate,test) split sampling
DATASETA_SPLIT_SEED = 1
### Ratio of 'ref'/'evl' split sampling
### (fraction of the filtered rows sampled into 'ref'; the remainder is 'evl')
DATASETA_SPLIT_RATIO = 0.8

<a id="anchor_metafields_datasetb"></a>
## [DatasetB](#anchor_datasetb) fields

In [None]:
### Assume that we are always doing "statistically significant compared to
### general population" type justifications

### Maximum number of factors to include in the whole-dataset evaluation
### (justifications are generated for every field count from 1 up to this value)
DATASETB_MAX_FIELDS = 2
### Fields accessible to use in rationalization?
### ... some of which are VERY unfair and/or illegal
### (entries that are commented out are not offered as justification fields)
DATASETB_JUSTIF_FIELDS = [
    # 'sex', # protected trait
    # 'race', # protected trait
    # 'age', # protected trait
    # 'age_cat',
    'juv_fel_count',
    'juv_misd_count',
    'juv_other_count',
    'priors_count',
    'c_charge_degree',
    'c_charge_desc',
]

### Name of column to use for justification
### Should be a dataset source column name... [TODO] update that if needed
### (its values are summed and averaged as proportions, so it is expected to be 0/1)
DATASETB_EVIDENCE_COLNAME = "two_year_recid"

### Threshold for confidence range
### (passed as `alpha` to statsmodels' proportion_confint with method="beta")
DATASETB_CONF_ALPHA = 0.05

<a id="anchor_metafields_justify"></a>
## [Justify](#anchor_justify) fields

In [None]:
### Method to use for justification assignment
### Must be one of the keys of the `justify_ref` dispatch table in the Justify
### section. The numbered lists below give the fallback order per case.
JUSTIFY_APPROACH = "MIN_REFSIZE_AGREE" # 1. agreeable justification with min ref sample size, 2. 'None'
# JUSTIFY_APPROACH = "MIN_REFSIZE_ANY" # 1. agreeable justification with min ref sample size, 2. any justification with min ref size, 3. 'None'

<a id="anchor_metafields_evaluate"></a>
## [Evaluate](#anchor_evaluate) fields

<a id="anchor_dataseta"></a>
# DatasetA: what decisions were made?

[Relevant metafields](#anchor_metafields_dataseta)

In [None]:
# see: https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
def dataseta_compas_filter(df):
    """Return the rows of `df` that pass the edge-case filters.

    A row is kept only if all of the following hold:
      * "days_b_screening_arrest" lies in [-30, 30] (missing values fail this),
      * "is_recid" is not -1,
      * "c_charge_degree" is not "O",
      * "score_text" is neither the literal "N/A" nor missing.

    The missing-value check on "score_text" matters because pd.read_csv parses
    the text "N/A" as NaN by default, and `NaN != "N/A"` evaluates to True, so
    a plain inequality test would let exactly the rows meant to be dropped
    slip through.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the four columns named above.

    Returns
    -------
    pd.DataFrame
        The surviving rows with their original index labels; `df` itself is
        not modified.
    """
    days = df["days_b_screening_arrest"]
    score_text = df["score_text"]
    df = df[days >= -30]
    df = df[df["days_b_screening_arrest"] <= 30]
    df = df[df["is_recid"] != -1]
    df = df[df["c_charge_degree"] != "O"]
    # drop both the literal marker and values already parsed as missing
    df = df[df["score_text"].notna() & (df["score_text"] != "N/A")]
    return df

In [None]:
def dataseta_decision_compas(df):
    """Return the per-row decision Series for the "2YCOMPAS" decision source.

    The decision is read directly from the "is_recid" column of `df`; the
    returned Series keeps the index of `df`.

    NOTE(review): the metafield comment for "2YCOMPAS" describes a
    "low-risk/high-risk" threshold split, while this returns "is_recid" --
    confirm this is the intended decision column.
    """
    decision_column = "is_recid"
    return df[decision_column]

In [None]:
def dataseta_decision_racist(df):
    """Return a 0/1 decision Series that depends only on the "race" ordering.

    The number of positive (1) decisions equals the sum of "is_recid" in `df`.
    Rows are ordered by "race" and the first rows in that order receive 1, the
    rest 0. The result is returned in index order of `df` as a Series named
    "temp". `df` itself is not modified (sorting produces a new frame).
    """
    # Match for the number of positive predictions
    positives_budget = sum(df["is_recid"].values)
    # rank rows by race; positives are handed out from the top of that ranking
    ranked = df.sort_values("race")
    ranked = ranked.assign(
        temp=[(1 if rank < positives_budget else 0) for rank in range(len(ranked))]
    )
    # back to the original dataset index order
    return ranked.sort_index()["temp"]

In [None]:
# Import dataset
rawsrc_df = pd.read_csv(DATASETA_PATH)

# Filter / preprocess the dataset to remove edge cases.
# An explicit copy is taken so that adding the decision column below writes to
# an independent frame instead of a filtered slice of rawsrc_df (which makes
# pandas emit SettingWithCopyWarning).
decisions_df = dataseta_compas_filter(rawsrc_df).copy()

# Add model decisions as a custom named column
# (DATASETA_DECISION selects which decision function is applied)
decisions_ref = {
    "2YCOMPAS": dataseta_decision_compas,
    "RACIST": dataseta_decision_racist,
}
decisions_df[DATASETA_DECISION_COLNAME] = decisions_ref[DATASETA_DECISION](decisions_df)

# [TODO] backup

# Do ref/evl (train/test) split: 'ref' is a seeded random sample of
# DATASETA_SPLIT_RATIO of the rows, 'evl' is every row not sampled into 'ref'
decisions_ref_df = decisions_df.sample(
    n=int(DATASETA_SPLIT_RATIO*len(decisions_df)),
    random_state=DATASETA_SPLIT_SEED,
)
decisions_evl_df = decisions_df[
    ~decisions_df.index.isin(decisions_ref_df.index)
]
# the two parts must partition the filtered dataset (relies on unique index labels)
assert len(decisions_ref_df) + len(decisions_evl_df) == len(decisions_df), "ref/evl split does not partition the dataset"

# Print preview of the dataset
decisions_df.shape
decisions_ref_df.shape
decisions_ref_df[:3]
decisions_evl_df.shape
decisions_evl_df[:3]

In [None]:
# [TODO visualize the dataset itself?]

<a id="anchor_datasetb"></a>
# DatasetB: what explanations could be used?

[Relevant metafields](#anchor_metafields_datasetb)

In [None]:
# returns true iff the intervals [a1, a2] and [b1, b2] overlap
def interval_overlaps(a1, a2, b1, b2):
    """Tell whether the closed intervals [a1, a2] and [b1, b2] share a point.

    Touching endpoints count as overlapping.

    Raises
    ------
    ValueError
        If either interval is given with its lower bound above its upper bound.
    """
    # Reject reversed bounds, checking the first interval before the second
    for lower, upper, message in (
        (a1, a2, "unexpected input bounds: a1>a2"),
        (b1, b2, "unexpected input bounds: b1>b2"),
    ):
        if lower > upper:
            raise ValueError(message)
    # The intervals are disjoint only when A lies strictly on one side of B
    a_entirely_below = a1 < b1 and a2 < b1
    a_entirely_above = a1 > b2 and a2 > b2
    return not (a_entirely_below or a_entirely_above)

In [None]:
from statsmodels.stats import proportion

def datasetb_popsignificant(
    ref_df, evl_df, 
    justif_fields, num_fields, evidence_colname, conf_alpha, 
    id_colname
):
    """Enumerate the population-significant justifications usable on `evl_df`.

    A justification is a combination of `num_fields` columns together with one
    concrete value per column. For every such combination that occurs in
    `evl_df`, the matching subgroup of `ref_df` is looked up and its
    `evidence_colname` rate is compared against the rate of the whole
    `ref_df`: the justification is kept only if the two confidence intervals
    (statsmodels `proportion_confint`, method="beta") do NOT overlap.

    Parameters
    ----------
    ref_df : pd.DataFrame
        Reference split; supplies the statistics.
    evl_df : pd.DataFrame
        Evaluation split; supplies the value combinations and the case ids.
    justif_fields : list of str
        Column names that may be combined into a justification.
    num_fields : int
        Number of columns per justification (values below 1 behave like 1).
    evidence_colname : str
        Column whose values are summed/averaged as a proportion.
    conf_alpha : float
        `alpha` passed to `proportion_confint`.
    id_colname : str
        Column of `evl_df` identifying each case.

    Returns
    -------
    (justif_df, justif_relevant_df)
        justif_df: one row per kept justification, with columns
        "justif_key" (tuple: (field names), (field values)),
        "justif_key_numfields", "justif_premise" (1 if the subgroup rate is
        above the population rate, else 0), "ref_mean", "ref_samplesize",
        "ref_conf_bot", "ref_conf_top", "evl_mean", "evl_samplesize".
        justif_relevant_df: one row per (kept justification, evl case it
        applies to), with columns "justif_key" and "evl_id".
        Both frames carry these columns even when no justification is kept.
    """
    # function-scope import so this cell does not depend on another cell's imports
    from itertools import combinations

    # compute population uncertainty interval
    ref_values = ref_df[evidence_colname]
    ref_average = sum(ref_values) / len(ref_values)
    ref_interval = proportion.proportion_confint(
        sum(ref_values), len(ref_values),
        alpha=conf_alpha, method="beta",
    )
    # every combination of `num_fields` justification fields, in list order
    # (max(..., 1) keeps the historical behaviour of treating <1 as 1)
    justif_field_types = [
        list(combo) for combo in combinations(justif_fields, max(num_fields, 1))
    ]
    justif_columns = [
        "justif_key", "justif_key_numfields", "justif_premise",
        "ref_mean", "ref_samplesize", "ref_conf_bot", "ref_conf_top",
        "evl_mean", "evl_samplesize",
    ]
    # rows are collected in plain lists and turned into frames once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0)
    justif_rows = []
    relevant_keys = []
    relevant_ids = []
    # calculate every usable justification field key (apply types -> evl_df)
    # also log which evl_df IDs are relevant to it
    for jt in justif_field_types:
        # Group by a scalar when there is a single field: grouping by a
        # one-element list yields 1-tuple keys on pandas >= 2.0, which would
        # corrupt both the key and the equality tests below.
        group_by = jt[0] if len(jt) == 1 else jt
        for evl_sg_justifval, evl_sg_df in evl_df.groupby(by=group_by):
            # generate the key...
            field_values = (evl_sg_justifval,) if len(jt) == 1 else tuple(evl_sg_justifval)
            evl_sg_justifkey = (tuple(jt), field_values)
            # pick out relevant ref subgroup (rows matching every field value)
            ref_sg_df = ref_df
            for field_name, field_value in zip(jt, field_values):
                ref_sg_df = ref_sg_df[ref_sg_df[field_name] == field_value]
            if len(ref_sg_df) == 0:
                # combination never seen in the reference split: no statistics
                continue
            # compute subgroup uncertainty interval
            ref_sg_values = ref_sg_df[evidence_colname]
            evl_sg_values = evl_sg_df[evidence_colname]
            ref_sg_average = sum(ref_sg_values) / len(ref_sg_values)
            evl_sg_average = sum(evl_sg_values) / len(evl_sg_values)
            ref_sg_interval = proportion.proportion_confint(
                sum(ref_sg_values), len(ref_sg_values),
                alpha=conf_alpha, method="beta",
            )
            # Skip non-significant comparisons
            if interval_overlaps(
                ref_interval[0], ref_interval[1], 
                ref_sg_interval[0], ref_sg_interval[1]
            ):
                continue
            # Save justification details, applicability details
            justif_rows.append({
                "justif_key": evl_sg_justifkey,
                "justif_key_numfields": len(jt),
                "justif_premise": 1 if (ref_sg_average>ref_average) else 0,
                "ref_mean": ref_sg_average,
                "ref_samplesize": len(ref_sg_df),
                "ref_conf_bot": ref_sg_interval[0],
                "ref_conf_top": ref_sg_interval[1],
                "evl_mean": evl_sg_average,
                "evl_samplesize": len(evl_sg_df),
            })
            # one (key, case id) pairing per evl case in this subgroup
            sg_ids = evl_sg_df[id_colname].tolist()
            relevant_ids.extend(sg_ids)
            relevant_keys.extend([evl_sg_justifkey] * len(sg_ids))
    justif_df = pd.DataFrame(justif_rows, columns=justif_columns)
    justif_relevant_df = pd.DataFrame({
        # explicit object Series keeps each (fields, values) key as one cell
        "justif_key": pd.Series(relevant_keys, dtype=object),
        "evl_id": relevant_ids,
    }, columns=["justif_key", "evl_id"])
    return justif_df, justif_relevant_df

In [None]:
# Sanity check: run the justification generation for one hand-picked case.
# NOTE(review): the case is looked up in the full decisions_df, so it may also
# be part of the reference split -- confirm that this is intended.
temp_case_id = 2680
# use the configured id column, consistent with the id_colname passed below
temp = decisions_df[decisions_df[DATASETA_ID_COLNAME]==temp_case_id]
temp

temp_justif_df = pd.DataFrame()
temp_justif_relevant_df = pd.DataFrame()
# one pass per justification size, from 1 field up to DATASETB_MAX_FIELDS
for i_num_fields in range(1, DATASETB_MAX_FIELDS+1):
    temp_a, temp_b = datasetb_popsignificant(
        decisions_ref_df, temp,
        DATASETB_JUSTIF_FIELDS, i_num_fields, DATASETB_EVIDENCE_COLNAME, DATASETB_CONF_ALPHA,
        DATASETA_ID_COLNAME
    )
    temp_justif_df = pd.concat([temp_justif_df, temp_a], ignore_index=True)
    temp_justif_relevant_df = pd.concat([temp_justif_relevant_df, temp_b], ignore_index=True)

temp_justif_df.shape
temp_justif_df[:5]
temp_justif_relevant_df.shape
temp_justif_relevant_df[:5]

In [None]:
# Generate the justifications for the whole evaluation split, one pass per
# justification size (1 field up to DATASETB_MAX_FIELDS), then stack the passes.
justif_parts = [pd.DataFrame()]
justif_relevant_parts = [pd.DataFrame()]
for i_num_fields in range(1, DATASETB_MAX_FIELDS+1):
    temp_a, temp_b = datasetb_popsignificant(
        decisions_ref_df, decisions_evl_df,
        DATASETB_JUSTIF_FIELDS, i_num_fields, DATASETB_EVIDENCE_COLNAME, DATASETB_CONF_ALPHA,
        DATASETA_ID_COLNAME
    )
    justif_parts.append(temp_a)
    justif_relevant_parts.append(temp_b)
# the leading empty frame keeps the result defined even if the loop never runs
justif_df = pd.concat(justif_parts, ignore_index=True)
justif_relevant_df = pd.concat(justif_relevant_parts, ignore_index=True)

justif_df.shape
justif_df[:5]
justif_relevant_df.shape
justif_relevant_df[:5]

In [None]:
# Report the evidence rate and its confidence interval for each split
# (ref first, then evl, separated by a blank line).
temp_report_plan = [
    ('true recidivism (ref):', 'true recidivism (ref) range:', decisions_ref_df),
    ('true recidivism (evl):', 'true recidivism (evl) range:', decisions_evl_df),
]
for temp_position, (temp_rate_label, temp_range_label, temp_split_df) in enumerate(temp_report_plan):
    if temp_position > 0:
        print()
    temp_breakdown = temp_split_df[DATASETB_EVIDENCE_COLNAME]
    print(temp_rate_label, sum(temp_breakdown)/len(temp_breakdown))
    print(temp_range_label, proportion.proportion_confint(
        sum(temp_breakdown), len(temp_breakdown), alpha=DATASETB_CONF_ALPHA, method='beta'
    ))

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# visualize a sample of the rationalization bounds
def vis_ratbounds(ref_df, evl_df, justif_df, justif_rel_df, num_eachside, max_factors, random_state=None):
    """Plot the justification statistics of randomly sampled evaluation cases.

    `num_eachside` cases with decision 0 are drawn on the left and
    `num_eachside` cases with decision 1 on the right. For each case and each
    field budget i in 1..max_factors:
      * one dot per applicable justification with at most i fields, at the
        justification's reference mean (dot opacity grows with the
        justification's reference sample size);
      * a vertical line spanning the gap between the lowest upper confidence
        bound and the highest lower confidence bound of those justifications,
        drawn only when such a gap exists.
    The reference-population rate, its confidence band and a 0.5 line are
    drawn across the whole plot.

    Parameters
    ----------
    ref_df, evl_df : pd.DataFrame
        Reference and evaluation splits.
    justif_df, justif_rel_df : pd.DataFrame
        Justification table and justification/case pairing table.
    num_eachside : int
        Number of cases sampled per decision value.
    max_factors : int
        Largest number of fields per justification to show.
    random_state : int or None, optional
        Seed for the case sampling. None (default) keeps the previous
        behaviour of drawing from NumPy's global random state.

    Notes
    -----
    Reads the module-level DATASETA_* / DATASETB_* constants. Temporarily sets
    rcParams["figure.figsize"] and restores it even if plotting fails.
    """
    plt_gapsize = 10
    # A single generator shared by all draws: passing the bare seed to every
    # .sample() call would select the same row each time.
    rng = None if random_state is None else np.random.RandomState(random_state)
    usable_justif_df = justif_df[justif_df["justif_key_numfields"] <= max_factors]
    # normaliser for dot opacity; the fallback only matters when nothing is plotted
    max_ref_samplesize = max(justif_df["ref_samplesize"]) if len(justif_df) > 0 else 1

    def _plot_cases(decision_value, x_offset):
        # Draw `num_eachside` sampled cases with this decision, starting at x_offset
        for n in range(num_eachside):
            target = evl_df[evl_df[DATASETA_DECISION_COLNAME] == decision_value].sample(
                n=1, random_state=rng,
            ).iloc[0]
            target_justifs = justif_rel_df[justif_rel_df["evl_id"] == target[DATASETA_ID_COLNAME]]
            target_justifs = target_justifs.merge(
                usable_justif_df,
                how="inner", on=["justif_key"], validate="1:1",
            )
            # mean prediction for justifications: a justification with k fields
            # is repeated in every column i >= k
            target_justifs_grouped = pd.concat([
                target_justifs[target_justifs["justif_key_numfields"] <= i].assign(justif_key_numfields=i)
                for i in range(1, max_factors+1)
            ], ignore_index=True)
            dots = plt.scatter(
                [x_offset+(2*i)+(2*max_factors*n) for i in target_justifs_grouped["justif_key_numfields"]],
                target_justifs_grouped["ref_mean"],
                alpha=[(s/max_ref_samplesize)**0.9 for s in target_justifs_grouped["ref_samplesize"]],
            )
            # confidence interval gap between most extreme justifications on both sides
            for i in range(1, max_factors+1):
                target_justifs_i = target_justifs[target_justifs["justif_key_numfields"] <= i]
                if len(target_justifs_i) == 0:
                    continue
                range_lo = min(target_justifs_i["ref_conf_top"])
                range_hi = max(target_justifs_i["ref_conf_bot"])
                if range_lo <= range_hi:
                    pos_x = x_offset+(2*i)+(2*max_factors*n)+1
                    # reuse the dot colour of this case (RGB without alpha)
                    _ = plt.plot(
                        [pos_x, pos_x],
                        [range_lo, range_hi],
                        color=dots.get_facecolors()[0][:-1],
                        marker="",
                    )

    # change graph size settings
    temp_rcparams_figsize = plt.rcParams["figure.figsize"]
    plt.rcParams["figure.figsize"] = [15, 5]
    try:
        # visualize the 0-pred cases
        _plot_cases(0, 0)
        # visualize the 1-pred cases, shifted right past the 0-pred block
        plt_1predgap = (2*max_factors*num_eachside)+plt_gapsize
        _plot_cases(1, plt_1predgap)
        # population mean line
        breakdown = ref_df[DATASETB_EVIDENCE_COLNAME]
        src_range = proportion.proportion_confint(sum(breakdown), len(breakdown), alpha=DATASETB_CONF_ALPHA, method='beta')
        plt.axline((1, sum(breakdown)/len(breakdown)), slope=0, alpha=0.8)
        plt.fill_between(
            [0, 4*max_factors*num_eachside+plt_gapsize+2], 
            [src_range[1], src_range[1]], 
            [src_range[0], src_range[0]], 
            alpha=0.3
        )
        # 50% line
        plt.axline((1, 0.5), slope=0, alpha=0.4)
        # Modify axis visibility, add population comparison, add bin tags
        ax = plt.gca()
        ax.get_xaxis().set_visible(False)
        group_fontdict = {'size':'large'}
        ax.text(x=0, y=0.9, s='Predicted low-risk', fontdict=group_fontdict)
        ax.text(x=(2*max_factors*num_eachside)+plt_gapsize, y=0.9, s='Predicted high-risk', fontdict=group_fontdict)
        ax.set_ylabel('Recidivism rate (dots), confidence gap (lines)', fontdict=group_fontdict)
        ax.set_ylim(bottom=0, top=1)
    finally:
        # restore graph size settings
        plt.rcParams["figure.figsize"] = temp_rcparams_figsize

# Plot 10 randomly sampled cases per decision value, showing justifications of
# up to DATASETB_MAX_FIELDS fields
vis_ratbounds(decisions_ref_df, decisions_evl_df, justif_df, justif_relevant_df, 10, DATASETB_MAX_FIELDS)

In [None]:
# show frequency of justification coverage (1pred/1anti/2both/0none)
def vis_coverage(evl_df, justif_df, justif_rel_df):
    """Count evaluation cases by how their applicable justifications relate to the decision.

    Returns a dict with the number of cases in each bucket:
      "1supp": every applicable justification has a premise equal to the decision,
      "1anti": every applicable justification has the opposite premise,
      "2both": the applicable justifications disagree with each other,
      "0none": distinct ids of `evl_df` not counted in any bucket above.

    Parameters
    ----------
    evl_df : pd.DataFrame
        Cases to classify (id and decision columns are taken from the
        module-level DATASETA_ID_COLNAME / DATASETA_DECISION_COLNAME).
    justif_df : pd.DataFrame
        Justification table with "justif_key" and "justif_premise".
    justif_rel_df : pd.DataFrame
        Pairing table with "justif_key" and "evl_id".
    """
    results = {
        "1supp": 0,
        "1anti": 0,
        "2both": 0,
        "0none": 0,
    }
    # No pairings at all (possibly a column-less empty frame): nothing to group
    has_pairings = len(justif_rel_df) > 0
    # Group by the scalar column name: a one-element list would make every key
    # a 1-tuple on pandas >= 2.0.
    grouped = justif_rel_df.groupby(by="evl_id") if has_pairings else []
    for evl_id, evl_id_justifs in grouped:
        temp = evl_id_justifs.merge(
            justif_df,
            how="inner", on=["justif_key"], validate="1:1",
        )
        if len(temp) == 0:
            # none of the paired keys exist in justif_df: leave the case in "0none"
            continue
        # calculate the average justification premise for applicable justifications to this case
        temp_support = sum(temp["justif_premise"])/len(temp["justif_premise"])
        # decision of this case (the sum equals the decision when ids are unique)
        evl_decision = sum(evl_df[evl_df[DATASETA_ID_COLNAME]==evl_id][DATASETA_DECISION_COLNAME])
        if temp_support==evl_decision:
            results["1supp"] += 1
        elif temp_support==0 or temp_support==1:
            results["1anti"] += 1
        else:
            results["2both"] += 1
    # count up what cases don't have any usable for them at all
    results["0none"] = len(set(evl_df[DATASETA_ID_COLNAME]))-results["1supp"]-results["1anti"]-results["2both"]
    return results

# Coverage is reported three times: for every evaluation case, then separately
# for the cases with decision 0 and with decision 1. For the per-decision runs
# the pairing table is first restricted to the ids of the selected cases.
# All cases
vis_coverage(decisions_evl_df, justif_df, justif_relevant_df)
# only p=0 cases
temp_decisions_df = decisions_evl_df[decisions_evl_df[DATASETA_DECISION_COLNAME]==0]
temp_relevant_df = justif_relevant_df[justif_relevant_df["evl_id"].isin(temp_decisions_df[DATASETA_ID_COLNAME])]
vis_coverage(temp_decisions_df, justif_df, temp_relevant_df)
# only p=1 cases
temp_decisions_df = decisions_evl_df[decisions_evl_df[DATASETA_DECISION_COLNAME]==1]
temp_relevant_df = justif_relevant_df[justif_relevant_df["evl_id"].isin(temp_decisions_df[DATASETA_ID_COLNAME])]
vis_coverage(temp_decisions_df, justif_df, temp_relevant_df)

<a id="anchor_justify"></a>
# Justify: what explanations were actually used for each case?

[Relevant metafields](#anchor_metafields_justify)

In [None]:
def justify_approach_minrefsize(evl_df, justif_df, justif_ref_df, must_agree=True):
    """Assign to every case the applicable justification with the smallest reference sample.

    For each row of `evl_df` the applicable justifications are looked up in
    `justif_ref_df` (by "evl_id") and joined with `justif_df`. The choice is:
      1. the justification with the smallest "ref_samplesize" among those
         whose "justif_premise" equals the case's decision;
      2. if there is none and `must_agree` is False, the justification with
         the smallest "ref_samplesize" among all applicable ones;
      3. otherwise None.

    Parameters
    ----------
    evl_df : pd.DataFrame
        Cases to justify (id and decision columns are taken from the
        module-level DATASETA_ID_COLNAME / DATASETA_DECISION_COLNAME).
    justif_df : pd.DataFrame
        Justification table.
    justif_ref_df : pd.DataFrame
        Pairing table with "justif_key" and "evl_id".
    must_agree : bool, default True
        Whether step 2 above is skipped.

    Returns
    -------
    pd.DataFrame
        One row per row of `evl_df`, with columns "evl_id" and "justif_key".

    Raises
    ------
    ValueError
        If the number of assignments differs from the number of cases.
    """
    # Rows are collected in a list and converted once: DataFrame.append no
    # longer exists in pandas >= 2.0, and growing a frame row by row is quadratic.
    assigned_rows = []
    for _, evl_row in evl_df.iterrows():
        evl_row_id = evl_row[DATASETA_ID_COLNAME]
        evl_row_premise = evl_row[DATASETA_DECISION_COLNAME]
        usable_je_df = justif_ref_df[justif_ref_df["evl_id"]==evl_row_id]
        usable_je_df = usable_je_df.merge(
            justif_df,
            how="inner", on=["justif_key"], validate="1:1",
        )
        agreed_usable_je_df = usable_je_df[usable_je_df["justif_premise"] == evl_row_premise]
        # smallest reference sample first
        usable_je_df = usable_je_df.sort_values("ref_samplesize")
        agreed_usable_je_df = agreed_usable_je_df.sort_values("ref_samplesize")
        if len(agreed_usable_je_df)>0:
            chosen_key = agreed_usable_je_df.iloc[0]["justif_key"]
        elif (not must_agree) and len(usable_je_df)>0:
            chosen_key = usable_je_df.iloc[0]["justif_key"]
        else:
            chosen_key = None
        assigned_rows.append({
            "evl_id": evl_row_id,
            "justif_key": chosen_key,
        })
    assigned_df = pd.DataFrame(assigned_rows, columns=["evl_id", "justif_key"])
    # Check that all cases were assigned some justification
    if len(evl_df)!=len(assigned_df):
        raise ValueError(f"missing justifications; evl={evl_df.shape}; assigned={assigned_df.shape}")
    return assigned_df

In [None]:
# Generate case justifications
# Dispatch table: JUSTIFY_APPROACH name -> callable taking
# (evaluation cases, justification table, justification/case pairing table).
# Both entries use justify_approach_minrefsize and differ only in `must_agree`.
justify_ref = {
    "MIN_REFSIZE_AGREE": lambda e, j, jr: justify_approach_minrefsize(e, j, jr, must_agree=True),
    "MIN_REFSIZE_ANY": lambda e, j, jr: justify_approach_minrefsize(e, j, jr, must_agree=False),
}
justif_assigned_df = justify_ref[JUSTIFY_APPROACH](decisions_evl_df, justif_df, justif_relevant_df)

# preview the first assignments
justif_assigned_df[:5]

<a id="anchor_evaluate"></a>
# Evaluate: what are the faithfulness metrics for a given set of used justifications?

[Relevant metafields](#anchor_metafields_evaluate)

In [None]:
# Preview the first two rows of each frame produced by the previous sections
decisions_evl_df[:2]
justif_df[:2]
justif_relevant_df[:2]
justif_assigned_df[:2]

In [None]:
# given a set of justifications used for each case in an evl_df,
# evaluate based on the metrics defined in (Dasgupta 2022) paper
# [https://arxiv.org/pdf/2202.00734.pdf]

In [None]:
# Local consistency: 
# based on a single input case, equals the probability that other cases with
# the same explanation assigned have the same decision output as the original
# case

# Global consistency:
# The expected local consistency across the entire input case distribution

# IGNORES ALL CASES WHERE NO JUSTIFICATION WAS USABLE

def evaluate_dasgupta_faithfulness_consistency(evl_df, justifs_used_df):
    """Compute the global consistency of the assigned justifications.

    Cases are grouped by their assigned "justif_key". Within a group, the
    consistency is the sum of squared decision shares (the chance that two
    cases drawn from the group have the same decision). The result is the
    average of the group values weighted by group size.

    Parameters
    ----------
    evl_df : pd.DataFrame
        Evaluation cases (id and decision columns are taken from the
        module-level DATASETA_ID_COLNAME / DATASETA_DECISION_COLNAME).
    justifs_used_df : pd.DataFrame
        Assignment table with "evl_id" and "justif_key"; rows with a null
        "justif_key" are ignored.

    Returns
    -------
    float
        The score, or NaN when no case has a justification (previously this
        raised ZeroDivisionError).
    """
    # Filter out cases where no justification ended up being usable
    justifs_used_df = justifs_used_df[justifs_used_df["justif_key"].notnull()]
    if len(justifs_used_df) == 0:
        return float("nan")
    score = 0
    # group by explanation, weight each group with the number of cases it handles
    # (scalar `by`: a one-element list yields 1-tuple keys on pandas >= 2.0)
    for _, justif_cases_df in justifs_used_df.groupby(by="justif_key", dropna=False):
        # calculate the average local consistency for each group
        direct_counts = justif_cases_df.merge(
            evl_df,
            how="inner", left_on=["evl_id"], right_on=[DATASETA_ID_COLNAME], validate="1:1",
        )[DATASETA_DECISION_COLNAME].value_counts(normalize=True)
        group_consistency = sum([e**2 for e in direct_counts])
        score += group_consistency*len(justif_cases_df)
    score = score / len(justifs_used_df)
    return score

In [None]:
# Local sufficiency:
# based on a single input case, equals the probability that other cases that
# the same explanation is applicable to have the same decision output as the
# original case

# Global sufficiency:
# The expected local sufficiency across the entire input case distribution

# IGNORES ALL CASES WHERE NO JUSTIFICATION WAS USABLE

def evaluate_dasgupta_faithfulness_sufficiency(evl_df, justifs_used_df, justifs_usable_df):
    """Compute the global sufficiency of the assigned justifications.

    Cases are grouped by their assigned "justif_key". For each group two
    decision distributions are formed: over the cases the justification was
    assigned to, and over all cases it is applicable to (from
    `justifs_usable_df`). The group value is the sum over decisions of the
    product of the two shares. The result is the average of the group values
    weighted by the number of assigned cases.

    Parameters
    ----------
    evl_df : pd.DataFrame
        Evaluation cases (id and decision columns are taken from the
        module-level DATASETA_ID_COLNAME / DATASETA_DECISION_COLNAME).
    justifs_used_df : pd.DataFrame
        Assignment table with "evl_id" and "justif_key"; rows with a null
        "justif_key" are ignored.
    justifs_usable_df : pd.DataFrame
        Pairing table with "justif_key" and "evl_id".

    Returns
    -------
    float
        The score, or NaN when no case has a justification (previously this
        raised ZeroDivisionError).
    """
    # Filter out cases where no justification ended up being usable
    justifs_used_df = justifs_used_df[justifs_used_df["justif_key"].notnull()]
    if len(justifs_used_df) == 0:
        return float("nan")
    score = 0
    # group by explanation, weight each group with the number of cases it handles
    # Scalar `by` is essential here: with a one-element list, pandas >= 2.0
    # hands back 1-tuple keys and the equality filter below matches nothing.
    for justif_key, justif_cases_df in justifs_used_df.groupby(by="justif_key", dropna=False):
        # calculate the average local sufficiency for each group
        direct_counts = justif_cases_df.merge(
            evl_df,
            how="inner", left_on=["evl_id"], right_on=[DATASETA_ID_COLNAME], validate="1:1",
        )[DATASETA_DECISION_COLNAME].value_counts(normalize=True)
        related_counts = justifs_usable_df[justifs_usable_df["justif_key"]==justif_key].merge(
            evl_df,
            how="inner", left_on=["evl_id"], right_on=[DATASETA_ID_COLNAME], validate="1:1",
        )[DATASETA_DECISION_COLNAME].value_counts(normalize=True)
        # align the two distributions on the decision value; a decision that
        # is missing on one side contributes a share of 0
        group_sufficiency = direct_counts.mul(related_counts, fill_value=0).sum()
        score += group_sufficiency*len(justif_cases_df)
    score = score / len(justifs_used_df)
    return score

In [None]:
# Uniqueness:
# the fraction of test cases whose explanations were unique

# IGNORES ALL CASES WHERE NO JUSTIFICATION WAS USABLE

def evaluate_dasgupta_faithfulness_uniqueness(evl_df, justifs_used_df):
    """Return the share of justified cases whose justification is used only once.

    Rows of `justifs_used_df` with a null "justif_key" are dropped first; the
    number of keys occurring exactly once is then divided by the number of
    remaining rows. `evl_df` is accepted for signature symmetry with the other
    metrics and is not read.
    """
    # Only cases that actually received a justification take part
    justified_df = justifs_used_df[justifs_used_df["justif_key"].notnull()]
    # How often each justification key was handed out
    key_frequencies = justified_df["justif_key"].value_counts(dropna=True)
    singleton_total = 0
    singleton_total += (key_frequencies == 1).sum()
    return singleton_total / len(justified_df)

In [None]:
# Share of evaluation cases that received a justification at all
len(justif_assigned_df.dropna(subset=["justif_key"]))/len(justif_assigned_df)

# Faithfulness metrics for the assigned justifications
evaluate_dasgupta_faithfulness_consistency(decisions_evl_df, justif_assigned_df)
evaluate_dasgupta_faithfulness_sufficiency(decisions_evl_df, justif_assigned_df, justif_relevant_df)
evaluate_dasgupta_faithfulness_uniqueness(decisions_evl_df, justif_assigned_df)