In [1]:
import pandas as pd
from utils_error_analysis import *

# In-Domain

MAMI - Binary

In [2]:
# Test labels for the in-domain MAMI binary task (`misogynous` column of the test file).
mami_test_df = pd.read_json(
    "../data/MAMI/MAMI_test.json",
    orient="index",
)
y_test = mami_test_df.loc[:, "misogynous"].to_numpy()

# Column that holds each system's binary decision in its prediction CSV.
bin_label = "misogynous_prediction"

# One prediction file per system, in the order the arrays are unpacked below.
prediction_csvs = (
    "../models/output/MAMI/svm_stylometric_emotion_MAMI_test_binary.csv",
    "../models/output/MAMI/roberta_MAMI_test_binary.csv",
    "../models/output/MAMI/multimodal_roberta_swin_MAMI_test_binary.csv",
    "../models/output/MAMI/ensemble_svm_roberta_robertaswin_MAMI_test_binary.csv",
)
y_style_emo, y_roberta, y_multimodal, y_ensemble = [
    pd.read_csv(csv_path)[bin_label].to_numpy() for csv_path in prediction_csvs
]

In [3]:
# Display name -> prediction array for the three individual systems.
mami_in_domain_preds_binary = dict(
    zip(
        ("SVM style emo", "RoBERTa", "RoBERTa+Swin V2"),
        (y_style_emo, y_roberta, y_multimodal),
    )
)
# The pair of systems whose predictions are compared with McNemar's test.
best_mami_binary_indomain = dict(
    zip(("RoBERTa", "Ensemble"), (y_roberta, y_ensemble))
)

# Binary task: an empty label list is passed to both helpers.
print("Pearson r\n")
get_pearson_correlations(mami_in_domain_preds_binary, [])

print("-"*30,"\nMcNemar test\n")
get_mcnemar_significance(y_test, best_mami_binary_indomain, [])

Pearson r

SVM style emo   vs RoBERTa         | Pearson r: 0.52
------------------------------
SVM style emo   vs RoBERTa+Swin V2 | Pearson r: 0.44
------------------------------
RoBERTa         vs RoBERTa+Swin V2 | Pearson r: 0.58
------------------------------
------------------------------ 
McNemar test

Ensemble vs RoBERTa

McNemar Test:
McNemar Statistic: 1.27
p-value: 0.2606
Significant difference: no
------------------------------


In [4]:
# Report FPR and PPV for every system on the in-domain MAMI binary task.
# The display names are kept consistent with the labels used in the correlation cell.
model_names = ["SVM style emo", "RoBERTa", "RoBERTa+Swin V2", "Ensemble"]
predictions = [y_style_emo, y_roberta, y_multimodal, y_ensemble]

# `model_names` and `predictions` are parallel lists: position i of one describes
# position i of the other.
for model, prediction_list in zip(model_names, predictions):
    get_fpr_ppv(y_test, prediction_list, model, [])
    print("-"*30)

SVM stle emo - in-domain
FPR: 0.53
PPV: 0.62
------------------------------
RoBERTa - in-domain
FPR: 0.45
PPV: 0.66
------------------------------
RoBERTa+Swin V2 - in-domain
FPR: 0.57
PPV: 0.62
------------------------------
Ensemble - in-domain
FPR: 0.52
PPV: 0.64
------------------------------


MAMI - Multi-label

In [5]:
# Label columns of the in-domain MAMI multi-label task; the binary label comes first.
mami_label_names = [
    "misogynous",
    "shaming",
    "stereotype",
    "objectification",
    "violence",
]
mami_test_df = pd.read_json(
    "../data/MAMI/MAMI_test.json",
    orient="index",
)
# One column per label, in the order given by `mami_label_names`.
y_test = mami_test_df.loc[:, mami_label_names].to_numpy()

In [7]:
# Prediction columns are named "<label>_prediction" in every output CSV.
pred_labels = [f"{label}_prediction" for label in mami_label_names]

# One hierarchical prediction file per system, in the order the arrays are unpacked below.
prediction_csvs = (
    "../models/output/MAMI/roberta_first_step/svm_stylometric_emotion_w_roberta_MAMI_test_hierarchical.csv",
    "../models/output/MAMI/roberta_first_step/roberta_MAMI_test_hierarchical.csv",
    "../models/output/MAMI/roberta_first_step/multimodal_roberta_swin_MAMI_test_hierarchical.csv",
    "../models/output/MAMI/roberta_first_step/ensemble_svm_roberta_robertaswin_hier_ens_MAMI_test_hierarchical.csv",
)
y_style_emo, y_roberta, y_multimodal, y_ensemble = [
    pd.read_csv(csv_path)[pred_labels].to_numpy() for csv_path in prediction_csvs
]

In [8]:
# Display name -> per-label prediction matrix for the three individual systems.
mami_in_domain_preds_multilabel = dict(
    zip(
        ("SVM style emo", "RoBERTa", "RoBERTa+Swin V2"),
        (y_style_emo, y_roberta, y_multimodal),
    )
)
# The pair of systems whose predictions are compared with McNemar's test.
best_mami_multilabel_indomain = dict(
    zip(("RoBERTa+Swin V2", "Ensemble"), (y_multimodal, y_ensemble))
)

print("Pearson r\n")
get_pearson_correlations(mami_in_domain_preds_multilabel, mami_label_names)

# NOTE(review): the saved output of this cell ends in a division warning from the
# McNemar statistic while printing the first label ("misogynous"). Presumably the two
# systems have no discordant predictions on that label (both files live under
# `roberta_first_step`) -- confirm, and do not read that row as a real difference.
print("-"*30,"\nMcNemar test\n")
get_mcnemar_significance(y_test, best_mami_multilabel_indomain, mami_label_names)

Pearson r

SVM style emo vs RoBERTa
  Label shaming              | Pearson r: 0.37
  Label stereotype           | Pearson r: 0.56
  Label objectification      | Pearson r: 0.58
  Label violence             | Pearson r: 0.59
  Average all               | Pearson r: 0.52

------------------------------
SVM style emo vs RoBERTa+Swin V2
  Label shaming              | Pearson r: 0.30
  Label stereotype           | Pearson r: 0.54
  Label objectification      | Pearson r: 0.52
  Label violence             | Pearson r: 0.51
  Average all               | Pearson r: 0.47

------------------------------
RoBERTa vs RoBERTa+Swin V2
  Label shaming              | Pearson r: 0.53
  Label stereotype           | Pearson r: 0.65
  Label objectification      | Pearson r: 0.70
  Label violence             | Pearson r: 0.68
  Average all               | Pearson r: 0.64

------------------------------
------------------------------ 
McNemar test

Ensemble vs RoBERTa+Swin V2
  Label misogynous           | S

  statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))


EXIST 2024 - Binary

In [8]:
# Test labels for the in-domain EXIST 2024 binary task (`sexist` column of the test file).
exist_test_df = pd.read_json(
    "../data/EXIST2024/EXIST2024_test.json",
    orient="index",
)
y_test = exist_test_df.loc[:, "sexist"].to_numpy()

In [9]:
# Column that holds each system's binary decision in its prediction CSV.
bin_label = "sexist_prediction"

# One prediction file per system, in the order the arrays are unpacked below.
prediction_csvs = (
    "../models/output/EXIST2024/svm_stylometric_emotion_EXIST2024_test_binary.csv",
    "../models/output/EXIST2024/roberta_EXIST2024_test_binary.csv",
    "../models/output/EXIST2024/multimodal_roberta_swin_EXIST2024_test_binary.csv",
    "../models/output/EXIST2024/ensemble_svm_roberta_robertaswin_EXIST2024_test_binary.csv",
)
y_style_emo, y_roberta, y_multimodal, y_ensemble = [
    pd.read_csv(csv_path)[bin_label].to_numpy() for csv_path in prediction_csvs
]

In [10]:
# Display name -> prediction array for the three individual systems
# (in-domain EXIST 2024, binary task).
exist_in_domain_preds_binary = {
    "SVM style emo": y_style_emo,
    "RoBERTa": y_roberta,
    "RoBERTa+Swin V2": y_multimodal,
}
# Binary task: an empty label list is passed to both helpers.
print("Pearson r\n")
get_pearson_correlations(exist_in_domain_preds_binary, [])

# The pair of systems whose predictions are compared with McNemar's test.
best_exist_binary_indomain = {
    "RoBERTa+Swin V2": y_multimodal,
    "Ensemble": y_ensemble,
}
print("-"*30,"\nMcNemar test\n")
get_mcnemar_significance(y_test, best_exist_binary_indomain, [])

Pearson r

SVM stle emo    vs RoBERTa         | Pearson r: 0.40
------------------------------
SVM stle emo    vs RoBERTa+Swin V2 | Pearson r: 0.30
------------------------------
RoBERTa         vs RoBERTa+Swin V2 | Pearson r: 0.68
------------------------------
------------------------------ 
McNemar test

Ensemble vs RoBERTa+Swin V2

McNemar Test:
McNemar Statistic: 0.06
p-value: 0.8137
Significant difference: no
------------------------------


EXIST 2024 - Multi-label

In [11]:
# Label columns of the in-domain EXIST 2024 multi-label task; the binary label comes first.
exist_label_names = [
    "sexist",
    "ideological-inequality",
    "stereotyping-dominance",
    "objectification",
    "sexual-violence",
    "misogyny-non-sexual-violence",
]
exist_test_df = pd.read_json(
    "../data/EXIST2024/EXIST2024_test.json",
    orient="index",
)
# One column per label, in the order given by `exist_label_names`.
y_test = exist_test_df.loc[:, exist_label_names].to_numpy()

In [12]:
# Prediction columns are named "<label>_prediction" in every output CSV.
pred_labels = [f"{label}_prediction" for label in exist_label_names]

# One hierarchical prediction file per system, in the order the arrays are unpacked below.
prediction_csvs = (
    "../models/output/EXIST2024/ensemble_first_step/svm_stylometric_emotion_w_ensemble_EXIST2024_test_hierarchical.csv",
    "../models/output/EXIST2024/ensemble_first_step/roberta_EXIST2024_test_hierarchical.csv",
    "../models/output/EXIST2024/ensemble_first_step/multimodal_roberta_swin_EXIST2024_test_hierarchical.csv",
    "../models/output/EXIST2024/ensemble_first_step/ensemble_svm_roberta_robertaswin_hier_ens_EXIST2024_test_hierarchical.csv",
)
y_style_emo, y_roberta, y_multimodal, y_ensemble = [
    pd.read_csv(csv_path)[pred_labels].to_numpy() for csv_path in prediction_csvs
]


In [13]:
# Display name -> per-label prediction matrix for the three individual systems
# (in-domain EXIST 2024, multi-label task).
exist_in_domain_preds_multilabel = {
    "SVM style emo": y_style_emo,
    "RoBERTa": y_roberta,
    "RoBERTa+Swin V2": y_multimodal,
}
print("Pearson r\n")
get_pearson_correlations(exist_in_domain_preds_multilabel, exist_label_names)

# The pair of systems whose predictions are compared with McNemar's test.
best_exist_multilabel_indomain = {
    "RoBERTa": y_roberta,
    "Ensemble": y_ensemble,
}
# NOTE(review): the saved output of this cell ends in a division warning from the
# McNemar statistic. Presumably the two systems have no discordant predictions on the
# first label ("sexist"), since both files live under `ensemble_first_step` -- confirm,
# and do not read an infinite statistic on that row as a real difference.
print("-"*30,"\nMcNemar test\n")
get_mcnemar_significance(y_test, best_exist_multilabel_indomain, exist_label_names)

Pearson r

SVM stle emo vs RoBERTa
  Label ideological-inequality | Pearson r: 0.73
  Label stereotyping-dominance | Pearson r: 0.42
  Label objectification      | Pearson r: 0.63
  Label sexual-violence      | Pearson r: 0.37
  Label misogyny-non-sexual-violence | Pearson r: 0.32
  Average all               | Pearson r: 0.49

------------------------------
SVM stle emo vs RoBERTa+Swin V2
  Label ideological-inequality | Pearson r: 0.72
  Label stereotyping-dominance | Pearson r: 0.49
  Label objectification      | Pearson r: 0.68
  Label sexual-violence      | Pearson r: 0.27
  Label misogyny-non-sexual-violence | Pearson r: 0.50
  Average all               | Pearson r: 0.53

------------------------------
RoBERTa vs RoBERTa+Swin V2
  Label ideological-inequality | Pearson r: 0.77
  Label stereotyping-dominance | Pearson r: 0.56
  Label objectification      | Pearson r: 0.71
  Label sexual-violence      | Pearson r: 0.65
  Label misogyny-non-sexual-violence | Pearson r: 0.26
  Average

  statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))


# Cross-Dataset

MAMI - Binary

In [14]:
# Test labels for the cross-dataset MAMI binary task, read from the
# `overlapping_classes` copy of the test file.
mami_test_df = pd.read_json(
    "../data/overlapping_classes/MAMI/MAMI_test.json",
    orient="index",
)
y_test = mami_test_df.loc[:, "misogynous"].to_numpy()

In [15]:
# Column that holds each system's binary decision in its prediction CSV.
bin_label = "misogynous_prediction"

# One cross-dataset prediction file per system, in the order the arrays are unpacked below.
prediction_csvs = (
    "../models/output/cross_dataset/MAMI/svm_stylometric_emotion_MAMI_test_binary.csv",
    "../models/output/cross_dataset/MAMI/roberta_MAMI_test_binary.csv",
    "../models/output/cross_dataset/MAMI/multimodal_roberta_swin_MAMI_test_binary.csv",
    "../models/output/cross_dataset/MAMI/ensemble_svm_roberta_robertaswin_MAMI_test_binary.csv",
)
y_style_emo, y_roberta, y_multimodal, y_ensemble = [
    pd.read_csv(csv_path)[bin_label].to_numpy() for csv_path in prediction_csvs
]

In [16]:
# Display name -> prediction array for the three individual systems
# (cross-dataset MAMI, binary task).
mami_crossdataset_preds_binary = {
    "SVM style emo": y_style_emo,
    "RoBERTa": y_roberta,
    "RoBERTa+Swin V2": y_multimodal,
}
# Binary task: an empty label list is passed to both helpers.
print("Pearson r\n")
get_pearson_correlations(mami_crossdataset_preds_binary, [])

# The pair of systems whose predictions are compared with McNemar's test.
best_mami_binary_crossdataset = {
    "RoBERTa+Swin V2": y_multimodal,
    "Ensemble": y_ensemble,
}
print("-"*30,"\nMcNemar test\n")
get_mcnemar_significance(y_test, best_mami_binary_crossdataset, [])

Pearson r

SVM stle emo    vs RoBERTa         | Pearson r: 0.36
------------------------------
SVM stle emo    vs RoBERTa+Swin V2 | Pearson r: 0.34
------------------------------
RoBERTa         vs RoBERTa+Swin V2 | Pearson r: 0.54
------------------------------
------------------------------ 
McNemar test

Ensemble vs RoBERTa+Swin V2

McNemar Test:
McNemar Statistic: 0.04
p-value: 0.8494
Significant difference: no
------------------------------


MAMI - Multi-label

In [17]:
# Label columns of the cross-dataset MAMI multi-label task; the binary label comes first.
mami_label_names = [
    "misogynous",
    "stereotype",
    "objectification",
    "violence",
]
mami_test_df = pd.read_json(
    "../data/overlapping_classes/MAMI/MAMI_test.json",
    orient="index",
)
# One column per label, in the order given by `mami_label_names`.
y_test = mami_test_df.loc[:, mami_label_names].to_numpy()

In [18]:
# Prediction columns are named "<label>_prediction" in every output CSV.
pred_labels = [f"{label}_prediction" for label in mami_label_names]

# One hierarchical prediction file per system, in the order the arrays are unpacked below.
prediction_csvs = (
    "../models/output/cross_dataset/MAMI/ensemble_first_step/svm_stylometric_emotion_w_ensemble_MAMI_test_hierarchical.csv",
    "../models/output/cross_dataset/MAMI/ensemble_first_step/roberta_MAMI_test_hierarchical.csv",
    "../models/output/cross_dataset/MAMI/ensemble_first_step/multimodal_roberta_swin_MAMI_test_hierarchical.csv",
    "../models/output/cross_dataset/MAMI/ensemble_first_step/ensemble_svm_roberta_robertaswin_hier_ens_MAMI_test_hierarchical.csv",
)
y_style_emo, y_roberta, y_multimodal, y_ensemble = [
    pd.read_csv(csv_path)[pred_labels].to_numpy() for csv_path in prediction_csvs
]

In [19]:
# Display name -> per-label prediction matrix for the three individual systems
# (cross-dataset MAMI, multi-label task).
mami_crossdataset_preds_multilabel = {
    "SVM style emo": y_style_emo,
    "RoBERTa": y_roberta,
    "RoBERTa+Swin V2": y_multimodal,
}
print("Pearson r\n")
get_pearson_correlations(mami_crossdataset_preds_multilabel, mami_label_names)

# The pair of systems whose predictions are compared with McNemar's test.
best_mami_multilabel_crossdataset = {
    "RoBERTa+Swin V2": y_multimodal,
    "Ensemble": y_ensemble,
}
# NOTE(review): the saved output reports `Stat: inf`, p-value 0.0000 and
# "Significant difference: yes" for the first label ("misogynous"), together with a
# division warning. Presumably the two systems have no discordant predictions on that
# label (both files live under `ensemble_first_step`) -- confirm, and do not report
# that row as a significant difference.
print("-"*30,"\nMcNemar test\n")
get_mcnemar_significance(y_test, best_mami_multilabel_crossdataset, mami_label_names)

Pearson r

SVM stle emo vs RoBERTa
  Label stereotype           | Pearson r: 0.59
  Label objectification      | Pearson r: 0.61
  Label violence             | Pearson r: 0.21
  Average all               | Pearson r: 0.47

------------------------------
SVM stle emo vs RoBERTa+Swin V2
  Label stereotype           | Pearson r: 0.53
  Label objectification      | Pearson r: 0.59
  Label violence             | Pearson r: 0.33
  Average all               | Pearson r: 0.48

------------------------------
RoBERTa vs RoBERTa+Swin V2
  Label stereotype           | Pearson r: 0.57
  Label objectification      | Pearson r: 0.71
  Label violence             | Pearson r: 0.35
  Average all               | Pearson r: 0.54

------------------------------
------------------------------ 
McNemar test

Ensemble vs RoBERTa+Swin V2
  Label misogynous           | Stat: inf   | p-value: 0.0000 | Significant difference: yes
  Label stereotype           | Stat: 2.19  | p-value: 0.1391 | Significant differenc

  statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))


EXIST 2024 - Binary

In [20]:
# Test labels for the cross-dataset EXIST 2024 binary task, read from the
# `overlapping_classes` copy of the test file.
exist_test_df = pd.read_json(
    "../data/overlapping_classes/EXIST2024/EXIST2024_test.json",
    orient="index",
)
y_test = exist_test_df.loc[:, "sexist"].to_numpy()

In [21]:
# Column that holds each system's binary decision in its prediction CSV.
bin_label = "sexist_prediction"

# One cross-dataset prediction file per system, in the order the arrays are unpacked below.
prediction_csvs = (
    "../models/output/cross_dataset/EXIST2024/svm_stylometric_emotion_EXIST2024_test_binary.csv",
    "../models/output/cross_dataset/EXIST2024/roberta_EXIST2024_test_binary.csv",
    "../models/output/cross_dataset/EXIST2024/multimodal_roberta_swin_EXIST2024_test_binary.csv",
    "../models/output/cross_dataset/EXIST2024/ensemble_svm_roberta_robertaswin_EXIST2024_test_binary.csv",
)
y_style_emo, y_roberta, y_multimodal, y_ensemble = [
    pd.read_csv(csv_path)[bin_label].to_numpy() for csv_path in prediction_csvs
]

In [22]:
# Display name -> prediction array for the three individual systems
# (cross-dataset EXIST 2024, binary task).
exist_crossdataset_preds_binary = {
    "SVM style emo": y_style_emo,
    "RoBERTa": y_roberta,
    "RoBERTa+Swin V2": y_multimodal,
}
# Binary task: an empty label list is passed to both helpers.
print("Pearson r\n")
get_pearson_correlations(exist_crossdataset_preds_binary, [])

# The pair of systems whose predictions are compared with McNemar's test.
best_exist_binary_crossdataset = {
    "RoBERTa": y_roberta,
    "Ensemble": y_ensemble,
}
print("-"*30,"\nMcNemar test\n")
get_mcnemar_significance(y_test, best_exist_binary_crossdataset, [])

Pearson r

SVM stle emo    vs RoBERTa         | Pearson r: 0.39
------------------------------
SVM stle emo    vs RoBERTa+Swin V2 | Pearson r: 0.27
------------------------------
RoBERTa         vs RoBERTa+Swin V2 | Pearson r: 0.61
------------------------------
------------------------------ 
McNemar test

Ensemble vs RoBERTa

McNemar Test:
McNemar Statistic: 0.10
p-value: 0.7518
Significant difference: no
------------------------------


EXIST 2024 - Multi-label

In [23]:
# Label columns of the cross-dataset EXIST 2024 multi-label task; the binary label comes first.
exist_label_names = [
    "sexist",
    "stereotyping-dominance",
    "objectification",
    "sexual-violence",
]
exist_test_df = pd.read_json(
    "../data/overlapping_classes/EXIST2024/EXIST2024_test.json",
    orient="index",
)
# One column per label, in the order given by `exist_label_names`.
y_test = exist_test_df.loc[:, exist_label_names].to_numpy()

In [24]:
# Prediction columns are named "<label>_prediction" in every output CSV.
pred_labels = [f"{label}_prediction" for label in exist_label_names]

# One hierarchical prediction file per system, in the order the arrays are unpacked below.
prediction_csvs = (
    "../models/output/cross_dataset/EXIST2024/ensemble_first_step/svm_stylometric_emotion_w_ensemble_EXIST2024_test_hierarchical.csv",
    "../models/output/cross_dataset/EXIST2024/ensemble_first_step/roberta_EXIST2024_test_hierarchical.csv",
    "../models/output/cross_dataset/EXIST2024/ensemble_first_step/multimodal_roberta_swin_EXIST2024_test_hierarchical.csv",
    "../models/output/cross_dataset/EXIST2024/ensemble_first_step/ensemble_svm_roberta_robertaswin_hier_ens_EXIST2024_test_hierarchical.csv",
)
y_style_emo, y_roberta, y_multimodal, y_ensemble = [
    pd.read_csv(csv_path)[pred_labels].to_numpy() for csv_path in prediction_csvs
]


In [25]:
# Display name -> per-label prediction matrix for the three individual systems
# (cross-dataset EXIST 2024, multi-label task).
exist_crossdataset_preds_multilabel = {
    "SVM style emo": y_style_emo,
    "RoBERTa": y_roberta,
    "RoBERTa+Swin V2": y_multimodal,
}
print("Pearson r\n")
get_pearson_correlations(exist_crossdataset_preds_multilabel, exist_label_names)

# The pair of systems whose predictions are compared with McNemar's test.
best_exist_multilabel_crossdataset = {
    "RoBERTa": y_roberta,
    "Ensemble": y_ensemble,
}
# NOTE(review): the saved output reports `Stat: inf`, p-value 0.0000 and
# "Significant difference: yes" for the first label ("sexist"), together with a
# division warning. Presumably the two systems have no discordant predictions on that
# label (both files live under `ensemble_first_step`) -- confirm, and do not report
# that row as a significant difference.
print("-"*30,"\nMcNemar test\n")
get_mcnemar_significance(y_test, best_exist_multilabel_crossdataset, exist_label_names)

Pearson r

SVM stle emo vs RoBERTa
  Label stereotyping-dominance | Pearson r: 0.69
  Label objectification      | Pearson r: 0.64
  Label sexual-violence      | Pearson r: 0.36
  Average all               | Pearson r: 0.56

------------------------------
SVM stle emo vs RoBERTa+Swin V2
  Label stereotyping-dominance | Pearson r: 0.72
  Label objectification      | Pearson r: 0.41
  Label sexual-violence      | Pearson r: 0.34
  Average all               | Pearson r: 0.49

------------------------------
RoBERTa vs RoBERTa+Swin V2
  Label stereotyping-dominance | Pearson r: 0.84
  Label objectification      | Pearson r: 0.60
  Label sexual-violence      | Pearson r: 0.53
  Average all               | Pearson r: 0.66

------------------------------
------------------------------ 
McNemar test

Ensemble vs RoBERTa
  Label sexist               | Stat: inf   | p-value: 0.0000 | Significant difference: yes
  Label stereotyping-dominance | Stat: 0.57  | p-value: 0.4497 | Significant differenc

  statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))
