In [27]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [28]:
import os
import sys

# Location of the local biaslyze checkout that is put on the import path.
# Set the BIASLYZE_REPO environment variable to point elsewhere; the default
# keeps the path this notebook was originally written against.
BIASLYZE_REPO = os.environ.get("BIASLYZE_REPO", "/home/tobias/Repositories/biaslyze/")

# Only append once: re-running this cell must not grow sys.path with duplicates.
if BIASLYZE_REPO not in sys.path:
    sys.path.append(BIASLYZE_REPO)

In [29]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

## Load and prepare data

In [43]:
# Read the disaster-tweets training CSV, then show the first rows as the
# cell's output.
df = pd.read_csv("../data/disaster-tweets/train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [44]:
# replace urls
import re

# Raw string literal: "\w", "\-", "\." and "\S" are regex escapes, not Python
# string escapes. Without the r-prefix Python reports invalid escape sequences
# (a SyntaxWarning on 3.12+). The compiled pattern itself is unchanged:
# scheme "http"/"https", a dotted host name, then any trailing non-space chars.
url_regex = re.compile(r"(http|https)://[\w\-]+(\.[\w\-]+)+\S*")

# Blank out every URL match across the frame (regex replacement is applied to
# the whole DataFrame, not only the text column) and rebind `df` to the result.
df = df.replace(url_regex, "", regex=True)

## Train a model

In [67]:
# Two-step pipeline: TF-IDF features (terms in at least 10 documents, at most
# 10000 features, English stop words removed) feeding a logistic regression
# with default settings.
clf = make_pipeline(
    TfidfVectorizer(min_df=10, max_features=10000, stop_words="english"),
    LogisticRegression(),
)

In [68]:
# Fit the pipeline on the tweet text against the binary target column.
clf.fit(df["text"], df["target"])

In [69]:
# Predict on the same rows the pipeline was fitted on, so the printed number
# is a training-set accuracy, not a held-out estimate.
train_pred = clf.predict(df["text"])
print(accuracy_score(df["target"], train_pred))

0.8418494680152371


## Test detection of concepts

In [74]:
from biaslyze.concept_detectors import KeywordConceptDetector
from biaslyze.evaluators import LimeBiasEvaluator

from biaslyze.bias_detectors import KeywordBiasDetector

In [63]:
# Build a KeywordConceptDetector with no arguments (library defaults).
key_detect = KeywordConceptDetector()

In [64]:
# Run the concept detector over all tweets; it hands back the selected texts
# together with their labels as a pair.
detected_tweets, detected_labels = key_detect.detect(
    texts=df["text"],
    labels=df["target"],
)

In [65]:
# Sanity check: the detected texts and their labels should have equal length.
tuple(len(items) for items in (detected_tweets, detected_labels))

(505, 505)

In [101]:
# Give the bias detector the pipeline's predict_proba as its prediction
# function, so it works with class probabilities rather than hard labels.
bias_detector = KeywordBiasDetector(predict_func=clf.predict_proba)

In [102]:
# Run the bias detector on the full set of tweets and labels; a progress bar
# is printed while it runs.
detection_res = bias_detector.detect(
    texts=df["text"],
    labels=df["target"],
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 505/505 [00:23<00:00, 21.41it/s]


In [96]:
# Preview only the first few results instead of rendering the entire result
# list, which floods the notebook output and bloats the saved file.
detection_res[:5]

[('13,000 people receive #wildfires evacuation orders in California ',
  ['california',
   'evacuation',
   '000 people',
   'in california',
   'receive wildfires',
   'receive',
   '13',
   'orders',
   '13 000',
   'wildfires']),
 ('#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires',
  ['california',
   'update',
   'hwy',
   'wildfires',
   'in both',
   'california hwy',
   'directions due',
   'closed',
   '20 closed',
   'cafire wildfires']),
 ('#Flood in Bago Myanmar #We arrived Bago',
  ['myanmar',
   'flood',
   'arrived bago',
   'bago',
   'in',
   'arrived',
   'bago myanmar',
   'myanmar we',
   'we arrived',
   'in bago']),
 ('How the West was burned: Thousands of wildfires ablaze in California alone ',
  ['california',
   'thousands',
   'wildfires',
   'wildfires ablaze',
   'burned',
   'of wildfires',
   'was burned',
   'of',
   'how',
   'ablaze']),
 ('How the West was burned: Thousands of wildfires ablaz

In [93]:
# Spot-check: class probabilities the pipeline assigns to a single
# hand-written sentence (one row, one column per class).
clf.predict_proba(["Nice house in france"])

array([[0.70641419, 0.29358581]])