# CrowS-Pairs

In [11]:
from metric import read_data

In [12]:
from nltk.corpus import stopwords
from pororo import Pororo

In [13]:
crows_data = read_data("data/crows_pairs_anonymized.csv")

### Datatypes
- crows_data : (df) pandas dataframe of crows pairs data
- crows_sent : (dict) dictionary of sentences with keys of bias type
- crows_word : (dict) dictionary of words by its count
- crows_word_list : (list) list of words ordered by its count

### Bias types
1. race-color
2. gender
3. sexual-orientation
4. religion
5. age
6. nationality
7. disability
8. physical-appearance
9. socioeconomic

### TODO
1. Names
    - name total
    - name gender
    - name race
   
2. Occupations
3. Frequently used adjectives, nouns, and verbs

In [14]:
crows_sent = {}
for _ , data in crows_data.iterrows():
    bias = data['bias_type']
    if bias not in crows_sent.keys():
        crows_sent[bias] = list()
#         print(f"{bias}")
    crows_sent[bias].append(data['sent1'])
    crows_sent[bias].append(data['sent2'])
len(crows_sent['gender'])

524

In [14]:
# param : (str) bias type
# return : (list) of words in certain bais type in descending order
def get_word_list(df_data, bias_type):
    counts = dict()
    for index, data in df_data.iterrows():
        if data['bias_type'] == bias_type:
            sent1 = data['sent1']
            sent2 = data['sent2']
            sent1 = sent1.strip().split()
            sent2 = sent2.strip().split()
            word_set = set()
            for word1, word2 in zip(sent1, sent2):
                word_set.add(word1)
                word_set.add(word2)
            for word in word_set:
                counts[word] = counts.get(word,0)+1
    word_list = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    print(f"{bias_type} word count : {len(word_list)}")
    return word_list

In [15]:
# param : (list) word list to remove stopwords
# return : (list) word list with stopwords removed
def remove_stopwords(word_list):
    sw = stopwords.words('english')
    sw.append("The")
    sw.append("I")
    sw.append("A")
    no_stopwords = []
    for word in word_list:
        if word[0] not in sw:
            no_stopwords.append(word)
    return no_stopwords

In [17]:
import nltk
nltk.download('stopwords')
crows_word = {}
crows_word = dict()
for index, data in crows_data.iterrows():
    sent1 = data['sent1']
    sent2 = data['sent2']
    sent1 = sent1.strip().split()
    sent2 = sent2.strip().split()
    word_set = set()
    for word1, word2 in zip(sent1, sent2):
        word_set.add(word1)
        word_set.add(word2)
    for word in word_set:
        crows_word[word] = crows_word.get(word,0)+1
crows_word = sorted(crows_word.items(), key=lambda x: x[1], reverse=True)
crows_word = remove_stopwords(crows_word)

print(crows_word[:20])
crows_word_list = [word[0].lower() for word in crows_word]
print(f"Word count : {len(crows_word_list)}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/boychaboy/nltk_data...


[('white', 207), ('black', 170), ('people', 154), ('He', 150), ('man', 118), ('like', 96), ('would', 95), ('She', 93), ('White', 77), ('American', 77), ('always', 71), ('poor', 67), ('rich', 47), ('could', 47), ('Americans', 47), ('get', 45), ('men', 44), ('young', 41), ('got', 40), ('old', 40)]
Word count : 5043


[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
gender_words = remove_stopwords(get_word_list(crows_data, 'gender'))

## 1. Names
### 1.1 Names in all category

In [None]:
# param : (list) of words
# return : (list) of names in descending order of frequency
# def get_names(word_list):
ner = Pororo(task="ner", lang="en")

In [None]:
def get_name_list(sent_list):
    name_list = []
    for sent in sent_list:
        for token in ner(sent):
            if token[1] == 'PERSON':
                name_list.append(token[0])
    return name_list

In [None]:
name_dict = {}
for bias in crows_sent.keys():
    for sent in crows_sent[bias]:
        for token in ner(sent):
            if token[1] == 'PERSON':
                if token[0] not in name_dict.keys():
                    name_dict[token[0]] = 1
                else:
                    name_dict[token[0]] += 1

In [None]:
# sort in descending order
name_tuple = sorted(name_dict.items(), key=lambda x: x[1], reverse=True)
name_list = [name[0] for name in name_tuple]
name_list

In [None]:
name_dict

In [None]:
import pandas as pd

def save_dict_to_csv(data, filename):
    dict_sorted = sorted(data.items(), key=lambda x:x[1], reverse=True)
    df = pd.DataFrame(dict_sorted)
    df.to_csv(f"../data/{filename}.csv", header=False, index=False)
    print(f"file saved in ../data/{filename}.csv")
    return

save_dict_to_csv(name_dict, "crows_name")
# df = pd.DataFrame(name_list)
# df.to_csv("../data/crows_name.csv", header=False, index=False)

### 1.2 Names in each bias categories

In [None]:
name_data = {}
for _, data in crows_data.iterrows():
    bias = data['bias_type']
    name1 = None; name2 = None
    if bias not in name_data.keys():
        name_data[bias] = list()
    for token in ner(data['sent1']):
        if token[1] == 'PERSON':
            name1 = token[0]
    for token in ner(data['sent2']):
        if token[1] == 'PERSON':
            name2 = token[0]
    if name1 or name2:
        name_data[bias].append((name1, name2))

In [None]:
# number of pairs in each bias categories
for bias in name_data.keys():
    print(f"{bias} : {len(name_data[bias])} pairs")

In [None]:
name_data['gender']

## 2. Occupations

### 2.1 Winogender Occupation
- crows_wino_occupation : (list)

In [1]:
import pandas as pd
wino_dir = "../data_analysis/winogender-schemas/data/occupations-stats.tsv"
wino_occupation = pd.read_csv(wino_dir, delimiter='\t')

FileNotFoundError: [Errno 2] No such file or directory: '../data/winogender-schemas/data/occupations-stats.tsv'

In [None]:
wino_occupation_list = list(wino_occupation['occupation'])
wino_occupation_list

In [None]:
occupation_dict = {}
for bias in crows_sent.keys():
    for sent in crows_sent[bias]:
        for word in sent.strip():
            if word in wino_occupation_list:
                if token[0] not in name_dict.keys():
                    occupation_dict[token[0]] = 1
                else:
                    occupation_dict[token[0]] += 1

In [None]:
crows_occupation = {}
for occupation in wino_occupation_list:
    if occupation in crows_word_list:
#         crows_occupation.append(occupation)
        crows_occupation[occupation] = crows_occupation.get(occupation,0)+1

In [None]:
crows_wino_occupation = [word for word in crows_occupation]

### 2.1 Occupation Total

In [3]:
occupation_dir = "../data_analysis/occupations.csv"
f = open(occupation_dir, 'r')
occupation_df = pd.read_csv(f)

In [4]:
occupation_list = [occupation.lower() for occupation in list(occupation_df['Occupations'])]

In [6]:
len(occupation_list)

1155

In [7]:
crows_occupation = {}
for occupation in occupation_list:
    if occupation in crows_word_list:
#         crows_occupation.append(occupation)
        crows_occupation[occupation] = crows_occupation.get(occupation,0)+1

NameError: name 'crows_word_list' is not defined

In [None]:
print(len(crows_occupation))
crows_occupation = [occ for occ in crows_occupation.keys()]
crows_occupation[:10]

In [15]:
crows_occupation2 = {}
for bias in crows_sent.keys():
    for sent in crows_sent[bias]:
        for occupation in occupation_list:
            if occupation in sent:
                crows_occupation2[occupation] = crows_occupation2.get(occupation,0)+1

In [16]:
crows_occupation2 = sorted(crows_occupation2.items(), key=lambda x: x[1], reverse=True)
crows_occupation_total = [occ[0] for occ in crows_occupation2]

In [18]:
crows_occupation2

[('doctor', 42),
 ('student', 20),
 ('teacher', 19),
 ('engineer', 15),
 ('driver', 14),
 ('actor', 13),
 ('pilot', 12),
 ('lawyer', 12),
 ('police officer', 8),
 ('scientist', 7),
 ('maid', 6),
 ('janitor', 6),
 ('soldier', 6),
 ('employee', 5),
 ('barber', 4),
 ('cashier', 4),
 ('farmer', 4),
 ('sheriff', 4),
 ('chef', 4),
 ('artist', 4),
 ('mechanic', 3),
 ('landlord', 3),
 ('nurse', 3),
 ('executive', 3),
 ('waiter', 3),
 ('publican', 2),
 ('housekeeper', 2),
 ('astronomer', 2),
 ('arts', 2),
 ('magician', 2),
 ('grocer', 2),
 ('medical student', 2),
 ('airman', 2),
 ('radiologist', 2),
 ('foster parent', 2),
 ('employment', 2),
 ('salesman', 2),
 ('builder', 2),
 ('physicist', 2),
 ('inspector', 2),
 ('lawn mower', 2),
 ('agent', 2),
 ('aeronautical engineer', 2),
 ('software engineer', 2),
 ('supervisor', 2),
 ('fireman', 2),
 ('security guard', 2),
 ('groom', 2),
 ('butcher', 2),
 ('dealer', 2),
 ('plumber', 2),
 ('postman', 2),
 ('lifeguard', 2),
 ('priest', 2),
 ('instructor',

In [17]:
len(crows_occupation_total)

65

In [None]:
df = pd.DataFrame(crows_occupation_total)
df.to_csv("../data/crows_occupation.csv", header=False, index=False)

In [None]:
crows_occupation_total

## 3. Frequently used adjectives, nouns, and verbs

### 3.1 Total

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
tagger = SequenceTagger.load("flair/pos-english")

In [None]:
sentence = Sentence(crows_sent['gender'][0])
tagger.predict(sentence)
sentence.to_tagged_string()

In [None]:
# for entity in sentence.get_spans('pos'):
#     print(entity)
sentence.get_spans('pos')[1].tokens[0].text

In [None]:
NN = dict() #noun
VB = dict() #verb
JJ = dict() #adjective

for bias in crows_sent.keys():
    for sent in crows_sent[bias]:
        sentence = Sentence(sent)
        tagger.predict(sentence)
        for word in sentence.get_spans('pos'):
            tok = word.tokens[0].text
            pos = word.tag
            if pos.startswith('NN'):
                print(f"{tok}, {pos}")
                NN[tok] = NN.get(tok,0)+1
            elif pos.startswith('VB'):
                print(f"{tok}, {pos}")
                VB[tok] = VB.get(tok,0)+1
            elif pos.startswith('JJ'):
                print(f"{tok}, {pos}")
                JJ[tok] = JJ.get(tok,0)+1

In [None]:
save_dict_to_csv(NN, "crows_noun")
save_dict_to_csv(VB, "crows_verb")
save_dict_to_csv(JJ, "crows_adj")

In [None]:
save_to_json(NN, "crows_noun")
save_to_json(VB, "crows_noun")
save_to_json(JJ, "crows_noun")

In [None]:
import json

def save_to_json(data, filename):
    json.dump(data, open(f"../data/json/{filename}.json", 'w'))
    print(f"file saved in ../data/json/{filename}.json")
    return 

In [None]:
save_to_json(crows_sent, "crows_sent")
save_to_json(crows_word, "crows_word")