In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from ferret import Benchmark
import torch

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Checkpoint identifier handed to both `from_pretrained` calls, so the tokenizer
# and the sequence-classification model come from the same source.
name = "tomh/toxigen_roberta"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)

# NOTE: moving the model to `device` is disabled below, so `device` is not used in this cell.
#model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import pipeline

# Text-classification pipeline wrapping the `model` and `tokenizer` loaded earlier in the
# notebook; truncation is requested with a maximum length of 512.
toxigen_roberta = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    max_length=512,
)

In [7]:
# Baseline input: the original sentence with no token masked.
sentence = "most Asians have a small penis or none at all"

# ferret Benchmark built from the notebook's `model` and `tokenizer`; later cells use it
# to explain and evaluate predictions.
bench = Benchmark(model, tokenizer)

# Last expression of the cell: the classifier pipeline's prediction for `sentence` is displayed.
toxigen_roberta(sentence)

[{'label': 'LABEL_1', 'score': 0.991439938545227}]

In [8]:
# Compute feature attributions for `sentence` with all supported explainers, then score them.
# The same class index (1) is passed to both calls so the attributions are computed for the
# class they are evaluated against, rather than relying on `explain`'s implicit default.
explanations = bench.explain(sentence, target=1)
evaluations = bench.evaluate_explanations(explanations, target=1)

# Pretty-print the per-token attribution scores, one row per explainer.
# `evaluations` is not shown here; it is displayed by the cell calling `bench.show_evaluation_table`.
bench.show_table(explanations)

                                                               

Token,most,ĠAsians,Ġhave,Ġa,Ġsmall,Ġpenis,Ġor,Ġnone,Ġat,Ġall
Partition SHAP,-0.09,0.14,-0.02,0.05,0.02,0.67,0.0,0.0,0.0,0.0
LIME,0.05,0.13,-0.05,-0.01,-0.04,0.63,-0.02,0.04,-0.0,-0.02
Gradient,0.07,0.1,0.05,0.06,0.14,0.27,0.06,0.07,0.04,0.04
Gradient (x Input),-0.17,-0.11,-0.02,0.03,0.14,-0.35,-0.03,-0.01,0.04,-0.06
Integrated Gradient,-0.03,0.01,0.06,-0.03,-0.05,-0.33,-0.06,0.01,-0.13,-0.27
Integrated Gradient (x Input),0.02,0.2,0.06,0.02,0.03,0.42,0.09,-0.03,0.02,0.11


In [3]:
# Display the evaluation metrics for each explainer.
# Depends on `evaluations`, which is assigned in the cell that calls `bench.evaluate_explanations`.
# NOTE(review): this cell's recorded execution count (3) is lower than that cell's (8), so the
# output shown may not correspond to the current `evaluations` - re-run top to bottom to confirm.
bench.show_evaluation_table(evaluations)

Unnamed: 0,aopc_compr,aopc_suff,taucorr_loo
Partition SHAP,0.89,0.13,0.11
LIME,0.83,0.19,0.33
Gradient,0.85,0.1,0.29
Gradient (x Input),0.0,0.98,-0.69
Integrated Gradient,0.08,0.94,0.07
Integrated Gradient (x Input),0.9,0.11,0.47


In [9]:
# Perturbed input: the baseline sentence with the group term replaced by the literal `<mask>`.
# NOTE(review): `<mask>` is presumably the tokenizer's mask token - confirm.
# This rebinds the notebook-global `sentence` that the explanation cells read.
sentence = "most <mask> have a small penis or none at all"
# Last expression of the cell: the pipeline's prediction for the perturbed sentence is displayed.
toxigen_roberta(sentence)

[{'label': 'LABEL_1', 'score': 0.9908005595207214}]

In [12]:
# Pretty-print feature attribution scores by all supported explainers
# for the current value of `sentence`.
# NOTE(review): `sentence` is a notebook global reassigned in several cells, so this table
# depends on which assignment cell ran last; `explanations` is overwritten here as well.
explanations = bench.explain(sentence)
bench.show_table(explanations)

                                                        

Token,most,Unnamed: 2,Ġhave,Ġa,Ġsmall,Ġpenis,Ġor,Ġnone,Ġat,Ġall
Partition SHAP,0.04,0.01,0.01,0.0,-0.01,0.9,-0.02,0.0,0.0,0.0
LIME,0.02,-0.01,0.01,0.0,0.01,0.91,-0.0,-0.02,0.01,0.01
Gradient,0.12,0.14,0.07,0.05,0.1,0.24,0.06,0.06,0.04,0.04
Gradient (x Input),0.07,-0.11,-0.09,-0.03,0.16,-0.25,0.05,-0.01,0.05,-0.09
Integrated Gradient,-0.03,-0.18,-0.03,-0.06,0.1,-0.4,0.0,-0.0,-0.12,0.05
Integrated Gradient (x Input),0.06,0.01,-0.03,0.02,-0.01,0.65,0.03,0.12,0.03,-0.04


In [10]:
# Perturbed input: the baseline sentence with the group term kept and the anatomical term
# replaced by the literal `<mask>`.
# NOTE(review): `<mask>` is presumably the tokenizer's mask token - confirm.
# This rebinds the notebook-global `sentence` that the explanation cells read.
sentence = "most Asians have a small <mask> or none at all"
# Last expression of the cell: the pipeline's prediction for the perturbed sentence is displayed.
toxigen_roberta(sentence)

[{'label': 'LABEL_0', 'score': 0.8354395031929016}]

In [13]:
# Pretty-print feature attribution scores by all supported explainers
# for the current value of `sentence`.
# NOTE(review): `sentence` is a notebook global reassigned in several cells, so this table
# depends on which assignment cell ran last; `explanations` is overwritten here as well.
explanations = bench.explain(sentence)
bench.show_table(explanations)

                                                        

Token,most,ĠAsians,Ġhave,Ġa,Ġsmall,Unnamed: 6,Ġor,Ġnone,Ġat,Ġall
Partition SHAP,-0.21,0.37,-0.07,0.1,0.03,-0.07,-0.04,0.0,0.08,0.03
LIME,0.06,0.28,-0.22,-0.04,-0.1,-0.05,-0.03,0.01,0.12,0.1
Gradient,0.11,0.19,0.06,0.05,0.1,0.14,0.07,0.09,0.05,0.05
Gradient (x Input),-0.45,0.03,-0.11,-0.03,0.18,-0.02,0.0,-0.08,-0.01,-0.04
Integrated Gradient,-0.03,-0.12,0.13,0.08,-0.24,0.11,0.0,-0.04,0.0,0.05
Integrated Gradient (x Input),0.08,0.37,0.02,-0.05,0.14,-0.01,0.2,-0.06,0.04,-0.03
