In [39]:
from hir.util import create_analysis_dataframe, print_row

# Thresholds referenced by the characteristic cells further down.
conf_threshold = 0.3          # lower bound applied (with `>`) to the conf_* columns
IOU_NA_LOWER_THRESHOLD = 0.3  # iou_full cut-off between "similar" and "different" MIR/HIR
R_L_THRESHOLD = 0.08          # r_l_mda at or above this is reported as "no focus points"
IOU_HIGH_THRESHOLD = 0.9      # iou_full at or above this is reported as "same pattern as human"

# One analysis frame per model, each built from its JSON file.
df_glove = create_analysis_dataframe("all_samples_glove.json")
df_roberta = create_analysis_dataframe("all_samples_roberta.json")

In [40]:
from hir.util import get_confidence

# Attach the three confidence columns to each model's frame.
# The flag is the same for every call, so it is set once up front.
ignore_sec_max = False

confidence_columns = [
    # (column to create, probability column, predicted-label column)
    ("conf_hda", "hda_pred_all_probs", "hda_label"),
    ("conf_full", "full_pred_all_probs", "full_pred_label"),
    ("conf_mda", "mda_pred_all_probs", "mda_label"),
]

for df in [df_roberta, df_glove]:
    for conf_column, prob_column, label_column in confidence_columns:
        df[conf_column] = get_confidence(df,
                                         prob_column=prob_column,
                                         prediction_label_column=label_column,
                                         ignore_sec_max=ignore_sec_max)

In [41]:
# Characteristic 1
# Three-stage funnel, reported per model:
#   1. HDA label equals gold, conf_hda above conf_threshold, r_l_mda non-zero
#   2. of those: iou_full above IOU_NA_LOWER_THRESHOLD
#   3. of those: full prediction equals gold with conf_full above conf_threshold

for name, df in (("roberta", df_roberta), ("glove", df_glove)):

    print(f">>>> {name} <<<<<< \n\n")
    n_samples = len(df)

    # Stage 1
    stage_1_filter = " and ".join([
        "hda_label == gold_label",
        f"conf_hda > {conf_threshold}",
        "r_l_mda != 0",
    ])
    correct_pattern_known = df.query(stage_1_filter)
    n_known = len(correct_pattern_known)
    print(
        f"The model knows the correct pattern for: {n_known} samples.",
        f"This are {n_known / n_samples:.2f} of all samples.",
        "",
        sep="\n",
    )

    # Stage 2
    similar_hir_mir = correct_pattern_known.query(f"iou_full > {IOU_NA_LOWER_THRESHOLD}")
    n_similar = len(similar_hir_mir)
    print(
        f"For: {n_similar} samples or {n_similar / n_known:.2f} of correct_pattern_known,",
        "MIR and HIR are also similar.",
        "",
        sep="\n",
    )

    # Stage 3
    case_1 = similar_hir_mir.query(f"full_pred_label == gold_label and conf_full > {conf_threshold}")
    n_case_1 = len(case_1)
    print(
        f"For {n_case_1} or {n_case_1 / n_similar:.2f} of these samples,",
        "The model makes also the correct and confident overall prediction.",
        "",
        sep="\n",
    )

    # Share of stage-3 samples relative to the whole frame (printed unrounded).
    print(
        "--> The Model knows the correct pattern and recognized it in",
        f"Overall {n_case_1 / n_samples} of the cases!",
        sep="\n",
    )

>>>> roberta <<<<<< 


The model knows the correct pattern for: 1321 samples.
This are 0.49 of all samples.

For: 1290 samples or 0.98 of correct_pattern_known,
MIR and HIR are also similar.

For 1205 or 0.93 of these samples,
The model makes also the correct and confident overall prediction.

--> The Model knows the correct pattern and recognized it in
Overall 0.44350386455649615 of the cases!
>>>> glove <<<<<< 


The model knows the correct pattern for: 892 samples.
This are 0.33 of all samples.

For: 861 samples or 0.97 of correct_pattern_known,
MIR and HIR are also similar.

For 750 or 0.87 of these samples,
The model makes also the correct and confident overall prediction.

--> The Model knows the correct pattern and recognized it in
Overall 0.2760397497239602 of the cases!


In [42]:
# Characteristic 2
# Three-stage funnel, reported per model:
#   1. HDA label equals gold with conf_hda above conf_threshold
#   2. of those: iou_full at or below IOU_NA_LOWER_THRESHOLD
#   3. of those: full prediction differs from gold
# Stages 2 and 3 can be very small, so every ratio is guarded against an empty
# denominator (printed as "nan") instead of aborting the cell with ZeroDivisionError.

for df, name in zip([df_roberta, df_glove], ["roberta", "glove"]):

    print(f">>>> {name} <<<<<< \n\n")
    n_total = len(df)

    # Stage 1
    correct_pattern_known = df.query(f"hda_label == gold_label and conf_hda > {conf_threshold}")
    n_known = len(correct_pattern_known)
    share_known = n_known / n_total if n_total else float("nan")

    print(f"The model knows the correct pattern for: {n_known} samples.\n"
          f"This are {share_known:.2f} of all samples.\n")

    # Stage 2
    different_pattern_detected = correct_pattern_known.query(f"iou_full <= {IOU_NA_LOWER_THRESHOLD}")
    n_different = len(different_pattern_detected)
    share_different = n_different / n_known if n_known else float("nan")

    print(
        f"For: {n_different} samples "
        f"or {share_different:.2f} "
        f"of correct_pattern_known,\n"
        f"MIR does not match the HIR, it cannot detect the right pattern\n")

    # Stage 3
    wrong_pattern_recognized = different_pattern_detected.query("full_pred_label != gold_label")
    n_wrong = len(wrong_pattern_recognized)
    share_wrong = n_wrong / n_different if n_different else float("nan")

    print(
        f"For {n_wrong} samples "
        f"or {share_wrong:.2f} of these samples,\n"
        f"The model makes the wrong prediction based on a learned pattern.\n")

    # Stage-3 count relative to the whole frame (printed unrounded).
    overall_share = n_wrong / n_total if n_total else float("nan")
    print("--> The Model knows the correct pattern but recognizes another one \n"
          f"overall {overall_share} of the cases!")

>>>> roberta <<<<<< 


The model knows the correct pattern for: 1972 samples.
This are 0.73 of all samples.

For: 34 samples or 0.02 of correct_pattern_known,
MIR does not match the HIR, it cannot detect the right pattern

For 8 samples or 0.24 of these samples,
The model makes the wrong prediction based on a learned pattern.

--> The Model knows the correct pattern but recognizes another one 
overall 0.002944423997055576 of the cases!
>>>> glove <<<<<< 


The model knows the correct pattern for: 1814 samples.
This are 0.67 of all samples.

For: 33 samples or 0.02 of correct_pattern_known,
MIR does not match the HIR, it cannot detect the right pattern

For 28 samples or 0.85 of these samples,
The model makes the wrong prediction based on a learned pattern.

--> The Model knows the correct pattern but recognizes another one 
overall 0.010305483989694516 of the cases!


In [43]:
# Characteristic 3
# Per model: samples whose HDA label differs from gold, and among those the ones
# with iou_full at or above IOU_HIGH_THRESHOLD.

for name, df in (("roberta", df_roberta), ("glove", df_glove)):

    print(f">>>> {name} <<<<<< \n\n")
    n_samples = len(df)

    correct_pattern_not_known = df.query("hda_label != gold_label")
    n_not_known = len(correct_pattern_not_known)
    print(
        f"The model does not know the correct pattern for: {n_not_known} samples.",
        f"This are {n_not_known / n_samples} of all samples.",
        "",
        sep="\n",
    )

    confused = correct_pattern_not_known.query(f"iou_full >= {IOU_HIGH_THRESHOLD}")
    n_confused = len(confused)
    print(
        f"For {n_confused} or {n_confused / n_not_known:.2f} of these samples,",
        "The model uses the same pattern as human, but makes wrong classification.",
        "",
        sep="\n",
    )

    # Share relative to the whole frame (printed unrounded).
    print(
        "--> The Model does not knows the correct pattern and is confused at labelling",
        f"Overall {n_confused / n_samples} of the cases!",
        sep="\n",
    )

>>>> roberta <<<<<< 


The model does not know the correct pattern for: 667 samples.
This are 0.24549135075450865 of all samples.

For 67 or 0.10 of these samples,
The model uses the same pattern as human, but makes wrong classification.

--> The Model does not knows the correct pattern and is confused at labelling
Overall 0.02465955097534045 of the cases!
>>>> glove <<<<<< 


The model does not know the correct pattern for: 799 samples.
This are 0.29407434670592564 of all samples.

For 68 or 0.09 of these samples,
The model uses the same pattern as human, but makes wrong classification.

--> The Model does not knows the correct pattern and is confused at labelling
Overall 0.025027603974972397 of the cases!


In [44]:
# Characteristic 4

def custom_print(data_frame, prefix="", total=None):
    """Print the size of ``data_frame`` and its share of a reference total.

    Args:
        data_frame: Any sized object (typically a filtered DataFrame); only
            ``len(data_frame)`` is used.
        prefix: Text printed in front of the summary line.
        total: Number of samples to normalise by. When omitted, the length of
            the notebook-level ``df`` (the frame of the enclosing loop) is
            used, which keeps existing ``custom_print(frame, prefix=...)``
            calls working unchanged.

    Prints a line of the form ``"<prefix>Overall <n> -> <share> of the cases!"``
    with the share rounded to two decimals; the share is ``nan`` when the
    reference total is zero.
    """
    if total is None:
        # Backward-compatible fallback to the implicit global used by older calls.
        total = len(df)
    count = len(data_frame)
    share = count / total if total else float("nan")
    print(f"{prefix}Overall {count} -> {share:.2f} of the cases!")

# Per model: samples with r_l_mda == 0 and iou_na == 1, followed by a breakdown
# into correct / wrong full predictions and a confident-and-correct subset.
# The loop variable must stay named `df`: custom_print normalises by len(df).
for name, df in (("roberta", df_roberta), ("glove", df_glove)):

    print(f">>>> {name} <<<<<< \n\n")

    shortcuts = df.query("r_l_mda == 0")
    reasonable_shortcuts = shortcuts.query("iou_na == 1")
    custom_print(reasonable_shortcuts)

    confident_and_correct = " and ".join([
        "hda_label == gold_label",
        f"conf_hda > {conf_threshold}",
        "full_pred_label == gold_label",
        f"conf_full > {conf_threshold}",
    ])
    breakdown = (
        ("Correct: ", "full_pred_label == gold_label"),
        ("Wrong  : ", "full_pred_label != gold_label"),
        ("Test  : ", confident_and_correct),
    )
    for label_prefix, condition in breakdown:
        custom_print(reasonable_shortcuts.query(condition), prefix=label_prefix)

>>>> roberta <<<<<< 


Overall 95 -> 0.03 of the cases!
Correct: Overall 84 -> 0.03 of the cases!
Wrong  : Overall 11 -> 0.00 of the cases!
Test  : Overall 77 -> 0.03 of the cases!
>>>> glove <<<<<< 


Overall 112 -> 0.04 of the cases!
Correct: Overall 94 -> 0.03 of the cases!
Wrong  : Overall 18 -> 0.01 of the cases!
Test  : Overall 89 -> 0.03 of the cases!


In [45]:
# Characteristic 5
# Three-stage funnel, reported per model:
#   1. HDA label differs from gold
#   2. of those: iou_full below IOU_NA_LOWER_THRESHOLD
#   3. of those: conf_mda above conf_threshold and r_l_mda below 0.5
# Stage 2 can be very small, so every ratio is guarded against an empty
# denominator (printed as "nan") instead of aborting the cell with ZeroDivisionError.

for df, name in zip([df_roberta, df_glove], ["roberta", "glove"]):

    print(f">>>> {name} <<<<<< \n\n")
    n_total = len(df)

    # Stage 1
    correct_pattern_not_known = df.query("hda_label != gold_label")
    n_not_known = len(correct_pattern_not_known)
    share_not_known = n_not_known / n_total if n_total else float("nan")

    print(f"The model does not know the correct pattern for: {n_not_known} samples.\n"
          f"This are {share_not_known:.2f} of all samples.\n")

    # Stage 2
    # NOTE(review): strict `<` here, while the Characteristic 2 cell uses `<=`
    # against the same threshold -- confirm the boundary handling is intended.
    different_to_hir = correct_pattern_not_known.query(f"iou_full < {IOU_NA_LOWER_THRESHOLD}")
    n_different = len(different_to_hir)
    share_different = n_different / n_not_known if n_not_known else float("nan")

    print(
        f"For: {n_different} samples or {share_different:.2f} of correct_pattern_not_known,\n"
        f"MIR and HIR are different.\n")

    # Stage 3
    # NOTE(review): the 0.5 bound on r_l_mda is not one of the named thresholds;
    # the filter uses conf_mda although the variable name says "overall" -- confirm.
    high_overall_confidence = different_to_hir.query(f"conf_mda > {conf_threshold} and r_l_mda < 0.5")
    n_confident = len(high_overall_confidence)
    share_confident = n_confident / n_different if n_different else float("nan")

    print(
        f"For {n_confident} or {share_confident:.2f} of these samples,\n"
        f"The model is confident on its prediction based on the wrong pattern.\n")

    # Stage-3 count relative to the whole frame (printed unrounded).
    overall_share = n_confident / n_total if n_total else float("nan")
    print(f"--> The Model does not knows the correct pattern and uses different one for prediction\n"
          f"Overall {overall_share} of the cases!")

>>>> roberta <<<<<< 


The model does not know the correct pattern for: 667 samples.
This are 0.25 of all samples.

For: 39 samples or 0.06 of correct_pattern_not_known,
MIR and HIR are different.

For 11 or 0.28 of these samples,
The model is confident on its prediction based on the wrong pattern.

--> The Model does not knows the correct pattern and uses different one for prediction
Overall 0.004048582995951417 of the cases!
>>>> glove <<<<<< 


The model does not know the correct pattern for: 799 samples.
This are 0.29 of all samples.

For: 40 samples or 0.05 of correct_pattern_not_known,
MIR and HIR are different.

For 8 or 0.20 of these samples,
The model is confident on its prediction based on the wrong pattern.

--> The Model does not knows the correct pattern and uses different one for prediction
Overall 0.002944423997055576 of the cases!


In [46]:
# Characteristic 6
# Per model: samples whose HDA label differs from gold, and among those the ones
# with r_l_mda at or above R_L_THRESHOLD.
for name, df in (("roberta", df_roberta), ("glove", df_glove)):

    print(f">>>> {name} <<<<<< \n\n")
    n_samples = len(df)

    correct_pattern_not_known = df.query("hda_label != gold_label")
    n_not_known = len(correct_pattern_not_known)
    print(
        f"The model does not know the correct pattern for: {n_not_known} samples.",
        f"This are {n_not_known / n_samples:.2f} of all samples.",
        "",
        sep="\n",
    )

    # Interpreted as: the model sets no focus and uses almost the whole sentence as input.
    no_focus = correct_pattern_not_known.query(f"r_l_mda >= {R_L_THRESHOLD}")
    n_no_focus = len(no_focus)
    print(
        f"For {n_no_focus} or {n_no_focus / n_not_known:.2f} of these samples,",
        "The model has no focus points.",
        "",
        sep="\n",
    )

    print(
        "-->The Model does not knows the correct pattern and does not use any other detectable pattern",
        f"Overall {n_no_focus / n_samples:.2f} of the cases!",
        sep="\n",
    )
    print()


>>>> roberta <<<<<< 


The model does not know the correct pattern for: 667 samples.
This are 0.25 of all samples.

For 305 or 0.46 of these samples,
The model has no focus points.

-->The Model does not knows the correct pattern and does not use any other detectable pattern
Overall 0.11 of the cases!

>>>> glove <<<<<< 


The model does not know the correct pattern for: 799 samples.
This are 0.29 of all samples.

For 271 or 0.34 of these samples,
The model has no focus points.

-->The Model does not knows the correct pattern and does not use any other detectable pattern
Overall 0.10 of the cases!



In [47]:
# Characteristic 7
# Per model: samples whose full prediction equals gold although iou_full is
# below IOU_NA_LOWER_THRESHOLD.

for name, df in (("roberta", df_roberta), ("glove", df_glove)):

    print(f">>>> {name} <<<<<< \n\n")

    selection = " and ".join([
        "full_pred_label == gold_label",
        f"iou_full < {IOU_NA_LOWER_THRESHOLD}",
    ])
    correct_with_wrong_pattern = df.query(selection)
    n_hits = len(correct_with_wrong_pattern)

    # Share relative to the whole frame (printed unrounded).
    print(f"Overall {n_hits} -> {n_hits / len(df)} of the cases!")


>>>> roberta <<<<<< 


Overall 42 -> 0.015458225984541774 of the cases!
>>>> glove <<<<<< 


Overall 24 -> 0.008833271991166729 of the cases!


In [48]:
# Characteristic 8
# Per model: samples with r_l_mda == 0 and iou_na below 1, split into correct
# and wrong full predictions.
# The loop variable must stay named `df`: custom_print normalises by len(df).

for name, df in (("roberta", df_roberta), ("glove", df_glove)):

    print(f">>>> {name} <<<<<< \n\n")

    shortcuts = df.query("r_l_mda == 0")
    not_reasonable_shortcuts = shortcuts.query("iou_na < 1")
    custom_print(not_reasonable_shortcuts)

    breakdown = (
        ("Correct: ", "full_pred_label == gold_label"),
        ("Wrong  : ", "full_pred_label != gold_label"),
    )
    for label_prefix, condition in breakdown:
        custom_print(not_reasonable_shortcuts.query(condition), prefix=label_prefix)

>>>> roberta <<<<<< 


Overall 725 -> 0.27 of the cases!
Correct: Overall 619 -> 0.23 of the cases!
Wrong  : Overall 106 -> 0.04 of the cases!
>>>> glove <<<<<< 


Overall 1114 -> 0.41 of the cases!
Correct: Overall 888 -> 0.33 of the cases!
Wrong  : Overall 226 -> 0.08 of the cases!


In [49]:
# F1 Scores
# Micro-averaged F1 of the full and the HDA predictions against the gold labels.
# The label "Other" is left out of the set of scored labels.
from sklearn.metrics import f1_score  # imported once, not on every loop iteration

for df, name in zip([df_roberta, df_glove], ["roberta", "glove"]):

    print(f">>>> {name} <<<<<< \n\n")

    # Order of first appearance keeps the label list deterministic between runs.
    allowed_labels = [label for label in df["gold_label"].unique() if label != "Other"]

    f1_full = f1_score(df["gold_label"], df["full_pred_label"], average="micro", labels=allowed_labels)
    f1_hda = f1_score(df["gold_label"], df["hda_label"], average="micro", labels=allowed_labels)

    print(f"F1_full={f1_full}")
    print(f"F1_hda={f1_hda}")

>>>> roberta <<<<<< 


F1_full=0.8710801393728224
F1_hda=0.7995575221238939
>>>> glove <<<<<< 


F1_full=0.7948324939785416
F1_hda=0.7573620904189133


In [51]:
# KDE Plot
import numpy as np
import plotly
import plotly.figure_factory as ff

# Rebuild both analysis frames and their confidence columns for plotting.
df_glove = create_analysis_dataframe("all_samples_glove.json")
df_roberta = create_analysis_dataframe("all_samples_roberta.json")
dfs = [df_glove, df_roberta]

# Set explicitly so this cell does not rely on a value left over from an earlier cell.
ignore_sec_max = False

confidence_columns = [
    # (column to create, probability column, predicted-label column)
    ("conf_hda", "hda_pred_all_probs", "hda_label"),
    ("conf_full", "full_pred_all_probs", "full_pred_label"),
    ("conf_mda", "mda_pred_all_probs", "mda_label"),
]

for cur_df in dfs:
    for conf_column, prob_column, label_column in confidence_columns:
        cur_df[conf_column] = get_confidence(cur_df,
                                             prob_column=prob_column,
                                             prediction_label_column=label_column,
                                             ignore_sec_max=ignore_sec_max)

def create_kde_plot(df):
    """Display density curves of the three confidence columns of ``df``.

    Reads ``conf_full``, ``conf_hda`` and ``conf_mda`` from ``df``, clips the
    values to [0, 1] and draws one curve per column (labelled "Full", "HIR"
    and "MIR") with histogram and rug disabled. The figure is shown with
    ``fig.show()``; nothing is returned.
    """
    # (legend label, source column, line dash style) -- order fixes the trace/colour order.
    curves = [
        ("Full", "conf_full", "dot"),
        ("HIR", "conf_hda", "dashdot"),
        ("MIR", "conf_mda", "dash"),
    ]
    group_labels = [label for label, _, _ in curves]
    hist_data = [df[column].to_numpy() for _, column, _ in curves]
    hist_data = [np.clip(values, 0, 1) for values in hist_data]

    colors = ['rgb(117,131,255)', 'rgb(65,196,94)', 'rgb(214,69,51)']
    fig = ff.create_distplot(
        hist_data,
        group_labels,
        bin_size=.1,
        show_rug=False,
        show_hist=False,
        colors=colors,
    )

    # Both axes share the same grid and zero-line styling.
    axis_style = dict(gridwidth=1, gridcolor="darkgray", zerolinecolor="black")
    fig.update_xaxes(title="Model Confidence", range=[0, 1], title_standoff=1, **axis_style)
    fig.update_yaxes(title="Density", title_standoff=10, **axis_style)

    fig.update_traces(line=dict(width=3))

    legend_style = dict(
        traceorder="grouped",
        title="Prediction: ",
        bgcolor="White",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="left",
        x=0,
    )
    fig.update_layout(
        width=300,
        height=220,
        template="plotly_white",
        font=dict(family="Times Roman", color="black"),
        legend=legend_style,
        margin=dict(l=0, r=0, b=0, t=0),
    )

    # Give each named curve its own dash pattern.
    for label, _, dash in curves:
        fig.update_traces(
            line=dict(dash=dash, width=3),
            selector=dict(type="scatter", mode="lines", name=label),
        )
    fig.show()
