# Test biaslyze with disaster tweets data

Data source: https://www.kaggle.com/competitions/nlp-getting-started/overview

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Adds a directory to the import search path; presumably this is what makes the
# `biaslyze` imports further down resolve against a local checkout.
# NOTE(review): this is a machine-specific absolute path, so the notebook will
# not run unchanged on another machine — consider installing the package
# (e.g. in editable mode) or using a path relative to the notebook instead.
sys.path.append('/home/tobias/Repositories/biaslyze/')

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

## Load and prepare data

In [4]:
# Read the training CSV into a DataFrame, then preview the first rows
# (the bare last expression is what the cell displays).
df = pd.read_csv("../data/disaster-tweets/train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# replace urls
import re

# Raw string literal: \w, \-, \. and \S are regex escapes, not Python string
# escapes. In a normal string they trigger "invalid escape sequence" warnings
# (a SyntaxWarning on Python 3.12+). The compiled pattern itself is unchanged.
url_regex = re.compile(r"(http|https)://[\w\-]+(\.[\w\-]+)+\S*")

# DataFrame.replace is applied to the whole frame, so matching URLs are removed
# from every string column, not only from `text`.
df = df.replace(to_replace=url_regex, value='', regex=True)

## Train a model

In [101]:
# Baseline text classifier: TF-IDF features followed by logistic regression.
clf = make_pipeline(
    TfidfVectorizer(min_df=10, max_features=10000, stop_words="english"),
    LogisticRegression(n_jobs=4),
)

In [102]:
clf.fit(df.text, df.target)

In [103]:
# Predictions on the same rows the pipeline was fitted on, so the number
# printed below is training accuracy rather than a held-out estimate.
train_pred = clf.predict(df.text)
print(accuracy_score(y_true=df.target, y_pred=train_pred))

0.8418494680152371


## Test detection of concepts

In [104]:
from biaslyze.concept_detectors import KeywordConceptDetector
from biaslyze.evaluators import LimeBiasEvaluator

In [105]:
key_detect = KeywordConceptDetector()

In [106]:
detected_tweets = key_detect.detect(texts=df.text[:600])

2023-03-07 15:25:29.113 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 600 texts...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 71931.13it/s]
2023-03-07 15:25:29.130 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 377 texts with protected concepts.


In [107]:
len(detected_tweets)

377

In [88]:
detected_tweets

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in California ',
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ',
 '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires',
 '#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas',
 "I'm on top of the hill and I can see a fire in the woods...",
 "There's an emergency evacuation happening now in the building across the street",
 "I'm afraid that the tornado is coming to our area...",
 'Three people died from the heat wave so far',
 '#Flood in Bago Myanmar #We arrived Bago',
 "What's up man?",
 'this is ridiculous....',
 'Love my girlfriend',
 'The end!',
 'We always try to bring the he

## Test LIME-based bias detection with keywords

In [27]:
from biaslyze.bias_detectors import LimeKeywordBiasDetector

In [99]:
# LIME-backed keyword bias detector used for the TF-IDF / logistic-regression model.
bias_detector = LimeKeywordBiasDetector(
    bias_evaluator=LimeBiasEvaluator(n_lime_samples=5000),
    n_top_keywords=5,
    use_tokenizer=True,
)

In [109]:
# Run bias detection on a random 10% subsample of the tweets.
# A fixed random_state makes the subsample (and therefore the detection
# results shown below) reproducible across kernel restarts.
detection_res = bias_detector.detect(
    texts=df.text.sample(frac=0.1, random_state=42),
    predict_func=clf.predict_proba,
)

2023-03-07 15:26:13.225 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 3806 texts...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3806/3806 [00:45<00:00, 84.09it/s]
2023-03-07 15:26:58.492 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 519 texts with protected concepts.
2023-03-07 15:26:58.493 | INFO     | biaslyze.evaluators:evaluate:41 - Started bias detection on 519 samples...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 519/519 [16:42<00:00,  1.93s/it]


In [110]:
detection_res.summary()

Detected 228 samples with potential issues.
    Potentially problematic concepts detected: [('nationality', 151), ('gender', 73), ('religion', 11)]
    Based on keywords: [('california', 51), ('japan', 28), ('boy', 16), ('man', 14), ('woman', 13), ('israeli', 12), ('pakistan', 10), ('russia', 10), ('israel', 10), ('australia', 8), ('muslims', 8), ('girl', 8), ('he', 7), ('christian', 6), ('turkish', 6), ('his', 6), ('she', 5), ('myanmar', 5), ('taiwan', 5), ('her', 4)].


In [111]:
detection_res.details(group_by_concept=True)

Concept: nationality
[{'reason': ['israeli'],
  'text': 'The #Palestinian #refugee tragedy is a blight on humanity &amp; '
          'should shame every #Israeli for living with it. '},
 {'reason': ['australia', 'japan'],
  'text': '#Australia #News ; #Japan marks 70th anniversary of #Hiroshima '
          'atomic bombing   READ MORE; '},
 {'reason': ['australia'],
  'text': 'Ashes 2015: Australia\x89Ûªs collapse at Trent Bridge among worst '
          'in history: England bundled out Australia for 60 ... '},
 {'reason': ['california'],
  'text': 'The Latest: More Homes Razed by Northern California Wildfire - ABC '
          'News '},
 {'reason': ['israeli', 'muslims', 'christian'],
  'text': 'Christian Attacked by Muslims at the Temple Mount after Waving '
          'Israeli Flag via Pamela Geller - ... '},
 {'reason': ['california'],
  'text': '@aria_ahrary @TheTawniest The out of control wild fires in '
          'California even in the Northern part of the state. Very troubling.'},

## Testing a sentiment analysis model from huggingface

In [92]:
from transformers import pipeline
from torch.utils.data import Dataset


# Hugging Face pipeline built from a DistilBERT checkpoint fine-tuned on SST-2.
# No task name is given, so the pipeline has to derive it from the model.
# top_k=None presumably requests a score for every label rather than only the
# best one; predict_sentiment below iterates over all entries returned per text.
classifier = pipeline(
    model="distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True,
    padding=True,
    top_k=None,
)

In [112]:
class MyDataset(Dataset):
    """Minimal map-style torch ``Dataset`` over an in-memory collection of texts.

    The items are copied into a plain list on construction so that
    ``dataset[i]`` is always positional. Indexing the input container directly
    breaks for a pandas Series whose index does not start at 0 (for example a
    slice or a random sample of a column), because ``series[i]`` is then a
    label lookup and raises ``KeyError``.
    """

    def __init__(self, data):
        """Store the items of ``data`` (any iterable of texts) as a list."""
        super().__init__()
        # list() guarantees 0..len-1 positional access whatever the input type.
        self.data = list(data)

    def __len__(self):
        """Return the number of stored items."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the item at position ``i``."""
        return self.data[i]


def predict_sentiment(texts):
    """Return row-normalised label scores from ``classifier`` for ``texts``.

    ``texts`` is wrapped in ``MyDataset`` and passed to the module-level
    ``classifier``. For every result, the per-label entries are ordered by
    their ``'label'`` value in descending order and their ``"score"`` values
    collected into one row. Each row is then divided by its own sum.

    Returns:
        A 2-D numpy array with one row per input text and one column per label.
    """
    scores = np.array([
        np.array([
            entry.get("score")
            for entry in sorted(result, key=lambda entry: entry['label'], reverse=True)
        ])
        for result in classifier(MyDataset(texts))
    ])
    # Divide every row by its total so that each row sums to 1.
    row_totals = scores.sum(axis=1)[:, None]
    return scores / row_totals

In [113]:
# Rebinds `bias_detector` for the transformer sentiment model: 500 LIME samples
# (instead of the 5000 used for the TF-IDF model earlier) and the top 10
# keywords instead of 5.
# NOTE(review): the smaller sample count is presumably a runtime trade-off for
# the slower transformer model — confirm.
bias_detector = LimeKeywordBiasDetector(
    bias_evaluator=LimeBiasEvaluator(n_lime_samples=500),
    n_top_keywords=10,
    use_tokenizer=True
)

In [114]:
test_texts = detected_tweets[:10]
detection_res = bias_detector.detect(texts=test_texts, predict_func=predict_sentiment)

2023-03-07 15:45:07.759 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 10 texts...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 91.80it/s]
2023-03-07 15:45:07.875 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 2 texts with protected concepts.
2023-03-07 15:45:07.876 | INFO     | biaslyze.evaluators:evaluate:41 - Started bias detection on 2 samples...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:38<00:00, 19.22s/it]


In [115]:
detection_res.summary()

Detected 1 samples with potential issues.
    Potentially problematic concepts detected: [('nationality', 1)]
    Based on keywords: [('california', 1)].


In [116]:
detection_res.details(group_by_concept=True)

Concept: nationality
[{'reason': ['california'],
  'text': '13,000 people receive #wildfires evacuation orders in California '}]


## !! Very Experimental !!: Test masked-language-model-based bias detection with keywords

In [123]:
from biaslyze.bias_detectors import MaskedKeywordBiasDetector

In [124]:
bias_detector = MaskedKeywordBiasDetector(n_resample_keywords=15, use_tokenizer=True)

In [125]:
detection_res = bias_detector.detect(texts=df.text[500:600], predict_func=predict_sentiment)

2023-03-07 18:08:11.565 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 100 texts...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 75.50it/s]
2023-03-07 18:08:12.896 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 15 texts with protected concepts.
2023-03-07 18:08:12.897 | INFO     | biaslyze.evaluators:evaluate:96 - Started bias detection on 15 samples...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:34<00:00,  2.30s/it]


In [126]:
detection_res.summary()

Detected 2 samples with potential issues.
    Potentially problematic concepts detected: [('gender', 1), ('nationality', 1)]
    Based on keywords: [('he', 1), ('she', 1), ('australia', 1)].


In [127]:
detection_res.details()

''@eunice_njoki aiii she needs to chill and answer calmly its not like she's being attacked'' might contain bias ['gender']; reasons: ['he', 'she']
''#LonePine remembered around Australia as 'descendants' grow via @666canberra #Gallipoli #WW1
 '' might contain bias ['nationality']; reasons: ['australia']
