In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running code), so edits to locally imported source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Print the kernel's working directory; the relative data path used further
# down is resolved against it.
!pwd

/home/stinal/Documents/BIASLYZE/repositories/biaslyze/notebooks


In [3]:
import sys
from pathlib import Path

# Make the in-development `biaslyze` package importable without a
# machine-specific absolute path. The notebook is run from `<repo>/notebooks`
# (the relative data path used later relies on the same assumption), so the
# repository root is the parent of the current working directory.
REPO_ROOT = Path.cwd().parent

# Guard against duplicate entries when the cell is re-run in the same kernel.
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Load and prepare data

In [5]:
# Read the movie-review CSV (path is relative to the notebook's working
# directory) and preview the first rows as the cell output.
df = pd.read_csv("../data/movie-reviews/movie.csv")
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [6]:
# Text-classification pipeline: TF-IDF features (English stop words removed,
# terms must occur in at least 10 documents, vocabulary capped at 10,000)
# feeding a logistic regression with default settings.
clf = make_pipeline(
    TfidfVectorizer(min_df=10, max_features=10_000, stop_words="english"),
    LogisticRegression(),
)

In [7]:
# Fit the pipeline on every review and its label (no hold-out split is made here).
clf.fit(df["text"], df["label"])

In [8]:
# Accuracy on the same rows the model was fitted on, so this is a
# training-set score rather than an estimate of generalisation.
train_pred = clf.predict(df["text"])
print(accuracy_score(y_true=df["label"], y_pred=train_pred))

0.920625


# Test concept detection

In [9]:
from biaslyze.concept_detectors import KeywordConceptDetector
from biaslyze.evaluators import LimeBiasEvaluator
from biaslyze.bias_detectors import LimeKeywordBiasDetector

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Instantiate the keyword-based concept detector with its default arguments.
key_detect = KeywordConceptDetector()

In [11]:
# Run keyword-based concept detection on the first 600 reviews only.
detected_reviews = key_detect.detect(texts=df["text"].iloc[:600])

2023-04-14 17:19:44.297 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 600 texts...
100%|████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 7693.24it/s]
2023-04-14 17:19:44.378 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 599 texts with protected concepts.


In [12]:
# Number of entries returned by the detector (the detector's log reports this
# as the count of texts containing protected concepts).
len(detected_reviews)

599

# Test LIME bias detection via keywords

In [13]:
# Configure the LIME-based keyword bias detector: the evaluator is built with
# n_lime_samples=500, and the detector with n_top_keywords=30 and its
# tokenizer option enabled.
bias_detector = LimeKeywordBiasDetector(
    bias_evaluator=LimeBiasEvaluator(n_lime_samples=500),
    n_top_keywords=30,
    use_tokenizer=True,
)

In [14]:
# Run bias detection on a random subset of 500 reviews, scoring them with the
# classifier's class probabilities. The sampling seed is fixed so the same
# reviews are analysed on every run; without it the subset (and therefore the
# counts reported below) changes each time the cell is executed.
# NOTE(review): the LIME evaluator may add randomness of its own — confirm
# whether it also needs a seed for fully reproducible results.
detection_res = bias_detector.detect(
    texts=df.text.sample(500, random_state=42),
    predict_func=clf.predict_proba,
)

2023-04-14 17:19:44.452 | INFO     | biaslyze.concept_detectors:detect:33 - Started keyword-based concept detection on 500 texts...
100%|██████████████████████████████████████████████████████████████████████████| 500/500 [00:12<00:00, 40.35it/s]
2023-04-14 17:19:56.847 | INFO     | biaslyze.concept_detectors:detect:49 - Done. Found 380 texts with protected concepts.
2023-04-14 17:19:56.847 | INFO     | biaslyze.evaluators:evaluate:44 - Started bias detection on 380 samples...
100%|██████████████████████████████████████████████████████████████████████████| 380/380 [01:57<00:00,  3.23it/s]


In [15]:
# High-level overview of the detection run: how many samples were flagged,
# which concepts were involved, and the keywords behind them.
detection_res.summary()

Detected 142 samples with potential issues.
    Potentially problematic concepts detected: [('gender', 128), ('nationality', 14), ('religion', 4)]
    Based on keywords: [('his', 18), ('he', 16), ('man', 14), ('her', 11), ('woman', 10), ('guy', 9), ('women', 9), ('men', 7), ('she', 7), ('mother', 6), ('father', 6), ('guys', 6), ('boys', 5), ('girl', 4), ('girls', 4), ('wife', 3), ('female', 3), ('christian', 3), ('boy', 3), ('england', 3)].


In [16]:
# Per-sample details grouped by concept; each entry lists the triggering
# keyword(s) under 'reason' together with the review 'text'.
detection_res.details(group_by_concept=True)

Concept: gender
[{'reason': ['men'],
  'text': 'Warner Brothers tampered considerably with American history in "Big '
          'Trail" director Raoul Walsh\'s first-rate western "They Died with '
          'Their Boots On," a somewhat inaccurate but wholly exhilarating '
          'biography of cavalry officer George Armstrong Custer. The film '
          'chronicles Custer from the moment that he arrives at West Point '
          'Academy until the Indians massacre him at the Little Big Horn. This '
          "is one of Errol Flynn's signature roles and one of Raoul Walsh's "
          'greatest epics. Walsh and Flynn teamed in quite often afterward, '
          'and "They Died with Their Boots On" reunited Olivia de Havilland as '
          "Flynn's romantic interest for the last time. They appeared as a "
          'couple in seven previous films. This 140-minute, black & white '
          'oater is nothing short of brilliant with dynamic action sequences, '
          'humorous rom

In [17]:
from bokeh.io import show, output_notebook

# Configure Bokeh to render its output inline in the notebook; this must run
# before the first call to show().
output_notebook()

# Build the dashboard from the detection results and display it.
dashboard = detection_res.dashboard()
show(dashboard)

In [18]:
# Second dashboard variant built with use_position=True.
# NOTE(review): presumably this takes keyword position into account — confirm
# against the biaslyze documentation.
pos_dashboard = detection_res.dashboard(use_position=True)
show(pos_dashboard)