# Test biaslyze with the toxic comments dataset

Data source: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/home/tobias/Repositories/biaslyze/')

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

## Load and prepare data

In [4]:
df = pd.read_csv("../data/jigsaw-toxic-comment-classification/train.csv"); df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# make the classification problem binary
df["target"] = df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].sum(axis=1) > 0

## Train a BoW-model

In [6]:
clf = make_pipeline(TfidfVectorizer(min_df=10, max_features=10000, stop_words="english"), LogisticRegression())

In [7]:
clf.fit(df.comment_text, df.target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
train_pred = clf.predict(df.comment_text)
print(accuracy_score(df.target, train_pred))

0.9605755431751384


## Test LIME based bias detection with keywords

In [9]:
from biaslyze.evaluators import LimeBiasEvaluator
from biaslyze.bias_detectors import LimeKeywordBiasDetector

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
bias_detector = LimeKeywordBiasDetector(
    bias_evaluator=LimeBiasEvaluator(n_lime_samples=500),
    n_top_keywords=20,
    use_tokenizer=True
)

In [11]:
test_texts = df.comment_text.sample(200)
detection_res = bias_detector.detect(texts=test_texts, predict_func=clf.predict_proba)

2023-04-13 12:04:23.764 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 200 texts...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:04<00:00, 46.79it/s]
2023-04-13 12:04:28.049 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 35 texts with protected concepts.
2023-04-13 12:04:28.050 | INFO     | biaslyze.evaluators:evaluate:42 - Started bias detection on 35 samples...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:13<00:00,  2.67it/s]


In [12]:
detection_res.summary()

Detected 9 samples with potential issues.
    Potentially problematic concepts detected: [('gender', 7), ('religion', 1), ('nationality', 1)]
    Based on keywords: [('his', 4), ('he', 3), ('son', 1), ('christ', 1), ('english', 1), ('her', 1)].


In [13]:
detection_res.details(group_by_concept=True)

Concept: gender
[{'reason': ['his', 'he'],
  'text': '"\n'
          '\n'
          ' Unreliable sources \n'
          '\n'
          'Most of this article is sourced to personal webpages aggrandizing '
          'the subject, like this homage page by one of his cousins, a '
          'fan-made old website for user-submitted biographies and a personal '
          'website by the subject himself. \n'
          '\n'
          'Other than this, one can still find information that goes unsourced '
          'in the article (not even sourced to a self-published blog or the '
          'like), like the statements that the subject ""has authored over 150 '
          'articles"", that he is a ""commercial pilot"" and that he is an '
          '""certified Spanish-English diplomatic translator-interpreter"" '
          '(whatever that may be).\n'
          '\n'
          'I have tagged these problems and I wish they could be solved.  "'},
 {'reason': ['his'],
  'text': '"The ""defamatory conten

In [14]:
test_df = df.iloc[test_texts.index]

In [15]:
test_df[test_df.target].reset_index().comment_text[4]

"I'm ganna give ya a day ta think about weather or not ya wanna deal wit me, and believe you me homie I know all bout harassment and I ain't ganna stop on this mutha fuckin syte, if i'm not unblocked at 3:00 I'm ganna hack your shyt, harras you and basicly just fuck you around, so think about it ok NiggeR?"

In [17]:
from bokeh.io import show, output_notebook

output_notebook()

dashboard = detection_res.dashboard()
show(dashboard)

In [None]:
show(dashboard)

## Testing a sentiment analysis model from huggingface

In [18]:
from transformers import pipeline
from torch.utils.data import Dataset


classifier = pipeline(
    model="distilbert-base-uncased-finetuned-sst-2-english",
    top_k=None,
    padding=True,
    truncation=True
)

In [19]:
class MyDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]


def predict_sentiment(texts):
    data = MyDataset(texts)
    proba = []
    for res in classifier(data):
        proba_array = []
        for p in sorted(res, key=lambda d: d['label'], reverse=True):
            proba_array.append(p.get("score"))
        proba.append(np.array(proba_array))
    return np.array(proba) / np.array(proba).sum(axis=1)[:,None]

In [20]:
bias_detector = LimeKeywordBiasDetector(
    bias_evaluator=LimeBiasEvaluator(n_lime_samples=500),
    n_top_keywords=10,
    use_tokenizer=True
)

In [21]:
test_texts = df.comment_text.sample(50)
detection_res = bias_detector.detect(texts=test_texts, predict_func=predict_sentiment)

2023-04-13 12:04:44.419 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 50 texts...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 41.77it/s]
2023-04-13 12:04:45.620 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 10 texts with protected concepts.
2023-04-13 12:04:45.621 | INFO     | biaslyze.evaluators:evaluate:42 - Started bias detection on 10 samples...
  0%|                                                                                                                                                                                              | 0/10 [01:02<?, ?it/s]


KeyboardInterrupt: 

In [None]:
detection_res.summary()

In [None]:
detection_res.details(group_by_concept=True)

## !! Very Experimental !!: Test masked language model based bias detection with keywords

In [None]:
from biaslyze.bias_detectors import MaskedKeywordBiasDetector
from biaslyze.concept_detectors import KeywordConceptDetector

In [None]:
bias_detector = MaskedKeywordBiasDetector(n_resample_keywords=15, use_tokenizer=True)

In [None]:
detection_res = bias_detector.detect(
    texts=df.comment_text.sample(1000),
    predict_func=predict_sentiment
)

In [None]:
detection_res.summary()

In [None]:
detection_res.details()