# Test biaslyze with disaster tweets data

Data source: https://www.kaggle.com/competitions/nlp-getting-started/overview

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Make a local biaslyze checkout importable. The location can be overridden
# with the BIASLYZE_PATH environment variable; it defaults to the original
# developer checkout so existing setups keep working unchanged.
BIASLYZE_PATH = os.environ.get("BIASLYZE_PATH", "/home/tobias/Repositories/biaslyze/")
# Guard the append so re-running this cell does not add duplicate entries.
if BIASLYZE_PATH not in sys.path:
    sys.path.append(BIASLYZE_PATH)

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

## Load and prepare data

In [4]:
# Load the disaster-tweets training CSV and preview its first rows.
df = pd.read_csv("../data/disaster-tweets/train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(7613, 5)

In [6]:
# Strip URLs from the data before training.
import re

# Raw string: "\w", "\-", "\." and "\S" are regex escapes, not Python string
# escapes; without the r-prefix Python warns about invalid escape sequences.
url_regex = re.compile(r"(http|https)://[\w\-]+(\.[\w\-]+)+\S*")

# The replacement is applied to the whole frame, not only to the text column.
df = df.replace(to_replace=url_regex, value='', regex=True)

## Train a model

In [7]:
# Text classifier: TF-IDF features followed by logistic regression.
clf = make_pipeline(
    TfidfVectorizer(
        min_df=10,
        max_features=10000,
        stop_words="english",
    ),
    LogisticRegression(n_jobs=4),
)

In [8]:
# Fit the pipeline on the tweet texts against their target labels.
clf.fit(df["text"], df["target"])

In [9]:
# Accuracy is measured on the same rows the model was fitted on, so this is a
# training score rather than a held-out estimate.
train_pred = clf.predict(df["text"])
print(accuracy_score(df["target"], train_pred))

0.8418494680152371


## Test detection of concepts

In [10]:
from biaslyze.concept_detectors import KeywordConceptDetector
from biaslyze.evaluators import LimeBiasEvaluator

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Keyword-based concept detector, created with its default settings.
key_detect = KeywordConceptDetector()

In [12]:
# Run the detector on the first 600 tweets only.
detected_tweets = key_detect.detect(texts=df.text[:600])

2023-04-14 16:12:53.023 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 600 texts...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 58310.91it/s]
2023-04-14 16:12:53.040 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 412 texts with protected concepts.


In [13]:
# Number of texts returned by the keyword detector.
len(detected_tweets)

412

In [14]:
# Preview only the first few detected texts; showing the full list floods the
# notebook output with hundreds of entries.
detected_tweets[:15]

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in California ',
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ',
 '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires',
 '#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas',
 "I'm on top of the hill and I can see a fire in the woods...",
 "There's an emergency evacuation happening now in the building across the street",
 "I'm afraid that the tornado is coming to our area...",
 'Three people died from the heat wave so far',
 '#Flood in Bago Myanmar #We arrived Bago',
 "What's up man?",
 'this is ridiculous....',
 'Love my girlfriend',
 'The end!',
 'We always try to bring the he

## Test LIME-based bias detection with keywords

In [15]:
from biaslyze.bias_detectors import LimeKeywordBiasDetector

In [16]:
# LIME-backed keyword bias detector (500 LIME samples per text, top 30 keywords).
bias_detector = LimeKeywordBiasDetector(
    bias_evaluator=LimeBiasEvaluator(n_lime_samples=500),
    n_top_keywords=30,
    use_tokenizer=True,
)

In [17]:
# Analyse a 30% sample of the tweets. A fixed random_state keeps the drawn
# sample identical across re-runs of the notebook.
# NOTE(review): the detector itself may still use its own randomness — confirm.
detection_res = bias_detector.detect(
    texts=df.text.sample(frac=0.3, random_state=42),
    predict_func=clf.predict_proba,
)

2023-04-14 16:12:57.971 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 2284 texts...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2284/2284 [00:22<00:00, 101.61it/s]
2023-04-14 16:13:20.454 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 397 texts with protected concepts.
2023-04-14 16:13:20.454 | INFO     | biaslyze.evaluators:evaluate:44 - Started bias detection on 397 samples...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 397/397 [01:05<00:00,  6.05it/s]


In [18]:
# Summary of flagged samples, concepts and keywords for the scikit-learn model.
detection_res.summary()

Detected 362 samples with potential issues.
    Potentially problematic concepts detected: [('gender', 205), ('nationality', 148), ('religion', 20)]
    Based on keywords: [('he', 46), ('california', 41), ('man', 29), ('her', 25), ('his', 24), ('japan', 22), ('she', 20), ('woman', 12), ('pakistan', 9), ('saudi', 9), ('mom', 9), ('china', 9), ('women', 9), ('israeli', 8), ('men', 8), ('girl', 8), ('ladies', 7), ('lady', 7), ('israel', 7), ('muslims', 6)].


In [19]:
# Per-sample details, grouped by detected concept.
detection_res.details(group_by_concept=True)

Concept: nationality
[{'reason': ['german'],
  'text': 'Gut Deutsch musik! The old and rotten the monarchy has collapsed. '
          'The new may live. Long live the German Republic! '},
 {'reason': ['california'],
  'text': "'CALIFORNIA IS BURNING:' Gov. Jerry Brown told reporters at a press "
          'conference that California is experiencing... '},
 {'reason': ['japan'],
  'text': 'Japan FUSO Class Battleship YAMASHIRO Naval Cover 1999 PHOTO Cachet '
          'SUNK WWII  '},
 {'reason': ['myanmar'],
  'text': '69 Dead Due to Floods in Myanmar: Naypyidaw Aug 5 (Prensa Latina) '
          'The death toll rose today to 69 in Myanmar... '},
 {'reason': ['japan'],
  'text': "Earthquake and tsunami that occurred in Japan 'free speech' is also "
          'swallowed. '},
 {'reason': ['california'],
  'text': 'DEEP crew to help with California wild fires  via @thedayct'},
 {'reason': ['myanmar'],
  'text': 'Myanmar floods: Childfund \n'
          ' and International Needs  and CARE Aus

In [21]:
from bokeh.io import show, output_notebook

# Configure bokeh to render its output into the notebook.
output_notebook()

# Build the dashboard for the current detection results and display it.
dashboard = detection_res.dashboard()
show(dashboard)

In [23]:
# Second dashboard from the same results, this time built with use_position=True.
pos_dashboard = detection_res.dashboard(use_position=True)
show(pos_dashboard)

## Testing a sentiment analysis model from huggingface

In [25]:
from transformers import pipeline
from torch.utils.data import Dataset


# Hugging Face pipeline for the SST-2 fine-tuned DistilBERT sentiment checkpoint.
# NOTE(review): no task is passed explicitly; presumably it is inferred from
# the model — confirm.
# predict_sentiment() relies on each result being a list of dicts that carry
# 'label' and 'score' keys.
classifier = pipeline(
    model="distilbert-base-uncased-finetuned-sst-2-english",
    top_k=None,
    padding=True,
    truncation=True
)

In [26]:
class MyDataset(Dataset):
    """Minimal ``Dataset`` wrapper around an indexable collection.

    Lets a plain collection of texts be handed to code that expects a
    ``torch.utils.data.Dataset``.
    """

    def __init__(self, data):
        """Store ``data``; it must support ``len()`` and indexing."""
        super().__init__()
        self.data = data

    def __len__(self):
        """Return the number of wrapped items."""
        item_count = len(self.data)
        return item_count

    def __getitem__(self, i):
        """Return the item stored under index ``i``."""
        item = self.data[i]
        return item


def predict_sentiment(texts):
    """Return row-normalised label scores from the sentiment ``classifier``.

    Every text is scored by the module-level ``classifier`` pipeline. For each
    text the scores are ordered by label name in descending order, and each
    row is then divided by its own sum.

    Args:
        texts: Indexable collection of texts; it is wrapped in ``MyDataset``
            before being passed to the pipeline.

    Returns:
        2-D ``np.ndarray`` with one row per text and one column per label.
    """
    score_rows = []
    for label_scores in classifier(MyDataset(texts)):
        # Sort on the label so the column order is the same for every row.
        ordered = sorted(label_scores, key=lambda entry: entry['label'], reverse=True)
        score_rows.append(np.array([entry.get("score") for entry in ordered]))
    scores = np.array(score_rows)
    # [:, None] turns the per-row sums into a column for row-wise division.
    return scores / scores.sum(axis=1)[:, None]

In [27]:
# New LIME keyword detector for the sentiment model. This rebinds
# `bias_detector`, now with n_top_keywords=10.
bias_detector = LimeKeywordBiasDetector(
    bias_evaluator=LimeBiasEvaluator(n_lime_samples=500),
    n_top_keywords=10,
    use_tokenizer=True
)

In [28]:
# Only the first 10 detected tweets are analysed with the sentiment model.
# Note: this overwrites `detection_res` from the scikit-learn run.
test_texts = detected_tweets[:10]
detection_res = bias_detector.detect(texts=test_texts, predict_func=predict_sentiment)

2023-04-13 10:09:41.105 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 10 texts...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 109.41it/s]
2023-04-13 10:09:41.200 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 2 texts with protected concepts.
2023-04-13 10:09:41.201 | INFO     | biaslyze.evaluators:evaluate:42 - Started bias detection on 2 samples...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:38<00:00, 19.30s/it]


In [29]:
# Summary of the detection results for the sentiment model.
detection_res.summary()

Detected 1 samples with potential issues.
    Potentially problematic concepts detected: [('nationality', 1)]
    Based on keywords: [('california', 1)].


In [30]:
# Per-sample details for the sentiment model, grouped by detected concept.
detection_res.details(group_by_concept=True)

Concept: nationality
[{'reason': ['california'],
  'text': '13,000 people receive #wildfires evacuation orders in California '}]


## !! Very experimental !!: Test masked-language-model-based bias detection with keywords

In [31]:
from biaslyze.bias_detectors import MaskedKeywordBiasDetector

In [466]:
# Rebinds `bias_detector` to the masked-keyword detector.
bias_detector = MaskedKeywordBiasDetector(n_resample_keywords=20, use_tokenizer=True)

In [467]:
# Analyse 1000 randomly drawn tweets with the scikit-learn model. A fixed
# random_state keeps the drawn sample identical across re-runs.
# NOTE(review): the detector itself may still use its own randomness — confirm.
masked_detection_res = bias_detector.detect(
    texts=df.sample(1000, random_state=42).text,
    predict_func=clf.predict_proba,
)

2023-04-14 11:59:35.431 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 1000 texts...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:10<00:00, 96.34it/s]
2023-04-14 11:59:45.817 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 170 texts with protected concepts.
2023-04-14 11:59:45.817 | INFO     | biaslyze.evaluators:evaluate:109 - Started bias detection on 170 samples...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 170/170 [00:10<00:00, 17.00it/s]


In [468]:
# Summary of the masked-keyword detection results.
masked_detection_res.summary()

Detected 91 samples with potential issues.
    Potentially problematic concepts detected: [('gender', 49), ('nationality', 42), ('religion', 1)]
    Based on keywords: [('he', 12), ('man', 10), ('california', 10), ('his', 8), ('japan', 7), ('she', 6), ('her', 5), ('boy', 4), ('women', 3), ('turkish', 3), ('china', 3), ('woman', 3), ('german', 3), ('guy', 2), ('ladies', 2), ('daughter', 2), ('myanmar', 2), ('men', 2), ('russia', 2), ('palestine', 2)].


In [469]:
# Per-sample details of the masked-keyword detection (not grouped by concept).
masked_detection_res.details()

''Another white guy trying to mass murder people for no apparent reason just because let me guess he's mentally ill blah blah blah #Antioch'' might contain bias ['gender']; reasons: ['he', 'guy']
''2015 new fashion ladies gold watch waterproof WeiQin famous brand michel quartz de lujo caÛ_  '' might contain bias ['gender']; reasons: ['ladies']
''@O_Magazine satan's daughter shadow warrior in 50ft women aka transgender mode ps nyc is about to fold extra extra center of bioterrorism'' might contain bias ['gender']; reasons: ['women', 'daughter']
''Turkish troops killed in Kurdish militant 'suicide attack' '' might contain bias ['nationality']; reasons: ['turkish']
''Chinese rescue team arrives in Myanmar to help flood victims  Sittway '' might contain bias ['nationality']; reasons: ['myanmar', 'chinese']
''@APANO55 @JamesMelville 99% of Scientists donÛªt believe in Catastrophic Man-Made Global Warming only the deluded do.'' might contain bias ['gender']; reasons: ['man', 'man']
''ÛÏ@B

In [471]:
from bokeh.io import show, output_notebook

# Configure bokeh to render its output into the notebook.
output_notebook()

# Build the dashboard for the masked-keyword results and display it.
dashboard = masked_detection_res.dashboard()
show(dashboard)