# Race and Gender and Location Callouts

Are race and gender and location callouts more likely to be misunderstood?

### Load Data

In [1]:
from asr_dataset.police import BpcETL, AmbiguityStrategy
from asr_dataset.constants import Cluster
import pandas as pd
import numpy as np
import librosa

In [3]:
etl = BpcETL(Cluster.RCC, 
    filter_inaudible=False, 
    filter_numeric=False, 
    filter_uncertain=False,
    ambiguity='ALL')

In [4]:
data = etl.extract()

In [55]:
# This is meant for a quick check and a quick minimum estimate,
# not an exhaustive search for gender or racial callouts. 
GENDER_WORDS = ['man','woman','boy','girl','male','female']
RACE_WORDS = ['black','white','latino','asian']
LOCATION_WORDS = ['street','alley','park','building','road','highway','ave','avenue']

In [56]:
# Check data for callouts
data = data.assign(location=data['text'].str.contains(' | '.join(LOCATION_WORDS), regex=True, case=False),
                gender=data['text'].str.contains(' | '.join(GENDER_WORDS), regex=True, case=False),
                race=data['text'].str.contains(' | '.join(RACE_WORDS), regex=True, case=False))

In [57]:
# Check data for uncertainty
data = data.assign(uncertain=data['text'].str.contains(' | '.join(BpcETL.BAD_WORDS), regex=True, case=False) |
                            data['text'].str.contains("\[.+\]", regex=True))

In [59]:
def confusion(col, data):
    yy = sum(data[col].astype(bool) & data['uncertain'].astype(bool))
    yn = sum(data[col].astype(bool) & ~data['uncertain'].astype(bool))
    ny = sum(~data[col].astype(bool) & data['uncertain'].astype(bool))
    nn = sum(~data[col].astype(bool) & ~data['uncertain'].astype(bool))
    print(f"{col} + Uncertain = {yy}")
    print(f"{col} + Not Uncertain = {yn}")
    print(f"Not {col} + Uncertain = {ny}")
    print(f"Not {col} + Not Uncertain = {nn}")

confusion('gender', data)
confusion('race', data)
confusion('location', data)

gender + Uncertain = 698
gender + Not Uncertain = 1431
Not gender + Uncertain = 13057
Not gender + Not Uncertain = 46611
race + Uncertain = 396
race + Not Uncertain = 745
Not race + Uncertain = 13359
Not race + Not Uncertain = 47297
location + Uncertain = 430
location + Not Uncertain = 821
Not location + Uncertain = 13325
Not location + Not Uncertain = 47221


In [60]:
data = data.assign(robert=data['text'].str.contains("robert", case=False))

In [61]:
dat = data[(~data['robert'].astype(bool)) & (data['duration']>1)]
confusion('gender', dat)
confusion('race', dat)
confusion('location', dat)

gender + Uncertain = 636
gender + Not Uncertain = 1283
Not gender + Uncertain = 8888
Not gender + Not Uncertain = 24521
race + Uncertain = 361
race + Not Uncertain = 636
Not race + Uncertain = 9163
Not race + Not Uncertain = 25168
location + Uncertain = 407
location + Not Uncertain = 731
Not location + Uncertain = 9117
Not location + Not Uncertain = 25073
