In [1]:
import jsonlines 
import os 
import pandas as pd 
from collections import Counter 
import spacy
from tqdm import tqdm 
import tokenizations

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
from pyspark.sql import SparkSession, Row

In [2]:
tqdm.pandas()

In [3]:
os.listdir('../data')

['._balanced_BUG.csv', 'balanced_BUG.csv', 'gold_BUG.csv', 'full_BUG.csv']

In [4]:
# Load the three BUG dataset files from ../data.
# Each frame comes back with an "Unnamed: 0" column (see the table outputs below);
# presumably this is a row index that was saved with the CSV — TODO confirm.
# The row labels used with .loc/.at further down are the default positional ones.
full = pd.read_csv('../data/full_BUG.csv')
gold = pd.read_csv('../data/gold_BUG.csv')
balanced = pd.read_csv('../data/balanced_BUG.csv')

## Check sentences in BUG gold

In [7]:
gold[gold['sentence_text'].isna()]

Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,data_index,distance,num_of_pronouns


In [110]:
full[(full["g_first_index"] == 20) & (full["profession_first_index"] == 15) & (full['profession'] == "officer") & (full["data_index"] == 3)]

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index
94343,27232,German journalists Ansgar Graw and Frank Herma...,officer,himself,15,20,male,1,5,1,wikipedia,3


In [6]:
# Overwrite the sentence text of gold row 645 with the text of row 94343 in `full`,
# the single row returned by the lookup in the previous cell (officer / himself).
# NOTE(review): presumably gold row 645 had a missing or broken sentence_text — confirm.
gold.at[645, "sentence_text"] = full.loc[94343].sentence_text

In [21]:
# Whitespace-split every sentence and record its token count.
gold['split'] = gold['sentence_text'].str.split()
gold['length'] = gold['split'].map(len)

# Display the rows whose pronoun index lies beyond the last token position.
last_token_position = gold['length'] - 1
gold[gold['g_first_index'] > last_token_position]

Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,data_index,distance,num_of_pronouns,split,length


In [16]:
full[(full['profession'] == "patient") & (full['g_first_index'] == 21) & 
(full["profession_first_index"] == 2) & (full["g"] == "he")
& (full["data_index"] == 11)]
# full.loc[320]['sentence_text']

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index
320,1600,"Only one patient had a QTc > 500 ms , which is...",patient,he,2,21,male,0,19,1,covid19,11
5403,6094,"The second patient , a 66-year-old male , deve...",patient,he,2,21,male,0,19,1,pubmed,11
5923,12312,In one patient fatal ventricular fibrillation ...,patient,he,2,21,male,0,19,2,pubmed,11


In [14]:
# Overwrite the sentence text of gold row 20 with the text of row 320 in `full`.
# The lookup in the previous cell returned three candidates (320, 5403, 5923).
# NOTE(review): why 320 is the right one is not recorded here — confirm against the gold sentence.
gold.at[20, "sentence_text"] = full.loc[320].sentence_text

In [19]:
full[(full['profession'] == "patient") & (full['g_first_index'] == 22) & 
(full["profession_first_index"] == 1) & (full["g"] == "she")
& (full["data_index"] == 11)]

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index


In [36]:
# Rows removed from the gold set before tokenization.
# 57 is an incomplete sentence.
# NOTE(review): the reasons for dropping 1228 and 1643 are not recorded here — confirm.
ROWS_TO_DROP = [1228, 1643, 57]

# errors='ignore' keeps this cell re-runnable: once the rows are gone, running it
# again is a no-op instead of raising KeyError.
gold.drop(index=ROWS_TO_DROP, inplace=True, errors='ignore')

In [119]:
full.loc[38]['sentence_text']

'So yes , that was humbling . Addressing the effects of stigma against HCWs My healthcare assistant was told by her landlord to pack up and leave on a Friday afternoon , after he found out that she worked in our hospital .'

In [64]:
full.loc[38]

Unnamed: 0                                                             2760
sentence_text             So yes , that was humbling . " Addressing the ...
profession                                                        assistant
g                                                                       her
profession_first_index                                                   17
g_first_index                                                            21
predicted gender                                                     female
stereotype                                                                1
distance                                                                  4
num_of_pronouns                                                           3
corpus                                                              covid19
data_index                                                                1
Name: 38, dtype: object

# Tokenize data

In [37]:
# Select which dataset the rest of the notebook processes.
# This binds `df` to the same DataFrame object (no copy is made), so every column
# added to `df` below is also added to the selected frame.
df = gold # full, balanced or gold
df.shape 

(1717, 19)

In [38]:
df.head()

Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,data_index,distance,num_of_pronouns,split,length,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession
0,0,Male,1,"My friend , who grew up in Africa , explained ...",friend,he,16,1,0,covid19,4,15,2,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",29,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",friend,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",friend
1,1,Male,1,"â€¢ Lastly , for the patient that did not need...",patient,his,17,5,0,covid19,6,12,1,"[â€¢, Lastly, ,, for, the, patient, that, did,...",29,"[â€¢, Lastly, ,, for, the, patient, that, did,...",patient,"[â€¢, Lastly, ,, for, the, patient, that, did,...",patient
2,2,Female,1,Her early years as a resident doctor in the No...,doctor,her,12,6,-1,covid19,17,6,2,"[Her, early, years, as, a, resident, doctor, i...",18,"[Her, early, years, as, a, resident, doctor, i...",doctor,"[Her, early, years, as, a, resident, doctor, i...",doctor
3,3,Female,1,"Another participant stated , "" Without network...",teacher,she,67,59,1,covid19,4,8,3,"[Another, participant, stated, ,, "", Without, ...",83,"[Another, participant, stated, ,, "", Without, ...",teacher,"[Another, participant, stated, ,, "", Without, ...",and
4,4,Male,1,The patient followed up in the nephrology clin...,patient,his,17,1,0,covid19,17,16,1,"[The, patient, followed, up, in, the, nephrolo...",31,"[The, patient, followed, up, in, the, nephrolo...",patient,"[The, patient, followed, up, in, the, nephrolo...",patient


In [24]:
# Load the small English spaCy pipeline with the listed components disabled.
# The cells below only read `token.text` from the documents it produces.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner', 'lemmatizer', 'textcat'])

In [25]:
def safe_list_get(l, idx, default=''):
    """Return ``l[idx]``, or ``default`` when the lookup raises ``IndexError``.

    Args:
        l: Any indexable sequence (the notebook passes token lists).
        idx: Position to read. Negative positions follow normal Python
            indexing and therefore count from the end.
        default: Value returned when ``idx`` is out of range.

    Returns:
        The element at ``idx`` or ``default``.
    """
    try:
        item = l[idx]
    except IndexError:
        item = default
    return item

In [39]:
# Whitespace tokenization: split each sentence, then read the token that sits at
# the stored profession index.

def _whitespace_profession(row):
    """Token at `profession_first_index` in the whitespace tokens ('' when out of range)."""
    return safe_list_get(row['whitespace_token'], row['profession_first_index'])


df["whitespace_token"] = df['sentence_text'].progress_apply(str.split)
df['whitespace_profession'] = df.progress_apply(_whitespace_profession, axis=1)

# Shape of the subset whose stored profession differs from the token found at that index.
df[df['whitespace_profession'] != df['profession']].shape

100%|██████████| 1717/1717 [00:00<00:00, 73405.02it/s]
100%|██████████| 1717/1717 [00:00<00:00, 96458.88it/s]


(82, 19)

In [40]:
# tokenize with spacy

# Cap the worker count at the number of available cores: a fixed n_process=32
# oversubscribes smaller machines without making tokenization any faster.
n_workers = min(32, os.cpu_count() or 1)
df['spacy_tokens'] = [
    [token.text for token in doc]
    for doc in tqdm(nlp.pipe(df['sentence_text'], n_process=n_workers), total=len(df))
]
# Token found at the stored profession index ('' when the index is out of range).
df['spacy_profession'] = df.progress_apply(lambda row: safe_list_get(row['spacy_tokens'], row['profession_first_index']), axis=1)
# Shape of the subset where that token is not the stored profession.
df[df['profession'] != df['spacy_profession']].shape

100%|██████████| 1717/1717 [00:01<00:00, 1278.99it/s]
100%|██████████| 1717/1717 [00:00<00:00, 74899.06it/s]


(175, 19)

In [41]:
# tokenize with Spark 

# Round-trip the frame through Spark: convert `df` to a Spark DataFrame, let the
# Spark ML Tokenizer add a `spark_tokens` column computed from `sentence_text`,
# and collect the result back into pandas as `data`.
# The resulting tokens are lowercased (visible in the table output of the next cell).
spark = SparkSession.builder.appName("Tokenizer").getOrCreate()
sentenceData = spark.createDataFrame(df)
tokenizer = Tokenizer(inputCol="sentence_text", outputCol="spark_tokens")
wordsData = tokenizer.transform(sentenceData)
data = wordsData.toPandas()

In [42]:
data.head()

Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,data_index,distance,num_of_pronouns,split,length,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession,spark_tokens
0,0,Male,1,"My friend , who grew up in Africa , explained ...",friend,he,16,1,0,covid19,4,15,2,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",29,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",friend,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",friend,"[my, friend, ,, who, grew, up, in, africa, ,, ..."
1,1,Male,1,"â€¢ Lastly , for the patient that did not need...",patient,his,17,5,0,covid19,6,12,1,"[â€¢, Lastly, ,, for, the, patient, that, did,...",29,"[â€¢, Lastly, ,, for, the, patient, that, did,...",patient,"[â€¢, Lastly, ,, for, the, patient, that, did,...",patient,"[â€¢, lastly, ,, for, the, patient, that, did,..."
2,2,Female,1,Her early years as a resident doctor in the No...,doctor,her,12,6,-1,covid19,17,6,2,"[Her, early, years, as, a, resident, doctor, i...",18,"[Her, early, years, as, a, resident, doctor, i...",doctor,"[Her, early, years, as, a, resident, doctor, i...",doctor,"[her, early, years, as, a, resident, doctor, i..."
3,3,Female,1,"Another participant stated , "" Without network...",teacher,she,67,59,1,covid19,4,8,3,"[Another, participant, stated, ,, "", Without, ...",83,"[Another, participant, stated, ,, "", Without, ...",teacher,"[Another, participant, stated, ,, "", Without, ...",and,"[another, participant, stated, ,, "", without, ..."
4,4,Male,1,The patient followed up in the nephrology clin...,patient,his,17,1,0,covid19,17,16,1,"[The, patient, followed, up, in, the, nephrolo...",31,"[The, patient, followed, up, in, the, nephrolo...",patient,"[The, patient, followed, up, in, the, nephrolo...",patient,"[the, patient, followed, up, in, the, nephrolo..."


In [46]:
def _spark_token_at(row, index_column):
    """Return the Spark token located at the position stored in `index_column`."""
    return row['spark_tokens'][row[index_column]]


# Read the Spark token sitting at each stored index (pronoun lowercased).
data['spark_profession'] = data.apply(_spark_token_at, axis=1, index_column='profession_first_index')
data['spark_pronoun'] = data.apply(lambda row: _spark_token_at(row, 'g_first_index').lower(), axis=1)

# Shape of the subset where either token disagrees with the annotation.
profession_differs = data['profession'] != data['spark_profession']
pronoun_differs = data['g'].str.lower() != data['spark_pronoun']
data[profession_differs | pronoun_differs].shape

(26, 22)

In [47]:
data[(data['profession'] != data['spark_profession']) | (data['g'].str.lower() != data['spark_pronoun'])]

Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,...,num_of_pronouns,split,length,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession,spark_tokens,spark_profession,spark_pronoun
42,42,Male,1,"Farrell , army agent , had his place fired bef...",agent,his,7,4,-1,perseus,...,3,"[Farrell, ,, army, agent, ,, had, his, place, ...",20,"[Farrell, ,, army, agent, ,, had, his, place, ...",",","[Farrell, ,, army, agent, ,, had, his, place, ...",",","[farrell, ,, army, agent, ,, had, his, place, ...",",",place
43,43,Male,1,"Ingraham , formerly a useful officer of the Na...",officer,his,19,6,1,perseus,...,2,"[Ingraham, ,, formerly, a, useful, officer, of...",24,"[Ingraham, ,, formerly, a, useful, officer, of...",of,"[Ingraham, ,, formerly, a, useful, officer, of...",of,"[ingraham, ,, formerly, a, useful, officer, of...",of,services
46,46,Male,1,"Wright , the young professor of a Western coll...",professor,his,50,5,1,perseus,...,4,"[Wright, ,, the, young, professor, of, a, West...",60,"[Wright, ,, the, young, professor, of, a, West...",of,"[Wright, ,, the, young, professor, of, a, West...",of,"[wright, ,, the, young, professor, of, a, west...",of,mouth
49,49,Male,1,He was compelled to drop his prisoners and cap...,officer,his,41,29,1,perseus,...,3,"[He, was, compelled, to, drop, his, prisoners,...",57,"[He, was, compelled, to, drop, his, prisoners,...",officer,"[He, was, compelled, to, drop, his, prisoners,...",officer,"[he, was, compelled, to, drop, his, prisoners,...",officer,skirmishers
52,52,Male,1,Child and his gifted wife .,child,his,3,0,1,perseus,...,1,"[Child, and, his, gifted, wife, .]",6,"[Child, and, his, gifted, wife, .]",Child,"[Child, and, his, gifted, wife, .]",Child,"[child, and, his, gifted, wife, .]",child,gifted
54,54,Male,1,"When visiting a friend , he was invited to tak...",friend,he,6,26,0,perseus,...,3,"[When, visiting, a, friend, ,, he, was, invite...",33,"[When, visiting, a, friend, ,, he, was, invite...",suggested,"[When, visiting, a, friend, ,, he, was, invite...",suggested,"[when, visiting, a, friend, ,, he, was, invite...",suggested,was
56,56,Female,1,"Lytle , the Confederate photographer , who had...",photographer,her,27,5,-1,perseus,...,3,"[Lytle, ,, the, Confederate, photographer, ,, ...",35,"[Lytle, ,, the, Confederate, photographer, ,, ...",",","[Lytle, ,, the, Confederate, photographer, ,, ...",",","[lytle, ,, the, confederate, photographer, ,, ...",",",when
57,58,Male,1,"He fell at that post , flag in hand , on the 1...",person,his,44,42,-1,perseus,...,4,"[He, fell, at, that, post, ,, flag, in, hand, ...",47,"[He, fell, at, that, post, ,, flag, in, hand, ...",flag,"[He, fell, at, that, post, ,, flag, in, hand, ...",flag,"[he, fell, at, that, post, ,, flag, in, hand, ...",flag,the
63,64,Male,1,"( d. 1837 ) , a New England physician , who mo...",physician,himself,19,10,1,perseus,...,1,"[(, d., 1837, ), ,, a, New, England, physician...",21,"[(, d., 1837, ), ,, a, New, England, physician...",who,"[(, d., 1837, ), ,, a, New, England, physician...",who,"[(, d., 1837, ), ,, a, new, england, physician...",who,bookseller
64,65,Male,1,In both his fields of scholarship â€” Chaucer ...,child,he,39,13,1,perseus,...,2,"[In, both, his, fields, of, scholarship, â€”, ...",67,"[In, both, his, fields, of, scholarship, â€”, ...",left,"[In, both, his, fields, of, scholarship, â€, ”...",”,"[in, both, his, fields, of, scholarship, â€”, ...",child,left


In [48]:
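# heuristic to fix the rows where the stored profession/pronoun index does not point at the expected token (26 rows for gold in the check above; an earlier note counted 114 cases)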

def _closest_token_index(tokens, start, target, step=-1):
    """Return the position of the token equal to `target` that is closest to `start`.

    The comparison is case-insensitive. The search first walks from `start` in
    the direction given by `step` (-1: towards the beginning of the sentence,
    +1: towards the end); if nothing matches there it walks the other way.
    The returned position is always a valid, non-negative index into `tokens`.

    Args:
        tokens: Sequence of token strings.
        start: Position where the token was expected.
        target: Token text to look for.
        step: Preferred search direction, -1 or +1.

    Raises:
        ValueError: if `step` is neither -1 nor +1.
        IndexError: if no token in `tokens` equals `target`.
    """
    if step not in (-1, 1):
        raise ValueError(f"step must be -1 or +1, got {step!r}")

    wanted = target.lower()
    n_tokens = len(tokens)
    candidates = []
    if n_tokens:
        # Clamp the start so an index past the end of the token list (or a
        # negative one) cannot raise or silently wrap around to the other end.
        start = min(max(int(start), 0), n_tokens - 1)
        backward = range(start, -1, -1)
        forward = range(start, n_tokens)
        candidates = [*backward, *forward] if step < 0 else [*forward, *backward]

    for position in candidates:
        if tokens[position].lower() == wanted:
            return position
    raise IndexError(f"token {wanted!r} not found in the sentence tokens")


def get_new_profession_index(row, step=-1):
    """Return the corrected position of `row['profession']` in `row['spark_tokens']`.

    Starts at `row['profession_first_index']` and prefers the direction given
    by `step` (-1 by default; pass `step=1` for the balanced and full datasets).
    """
    return _closest_token_index(
        row['spark_tokens'], row['profession_first_index'], row['profession'], step
    )


def get_new_pronoun_index(row, step=-1):
    """Return the corrected position of `row['g']` in `row['spark_tokens']`.

    Starts at `row['g_first_index']` and prefers the direction given by `step`
    (-1 by default; pass `step=1` for the balanced and full datasets).
    """
    return _closest_token_index(
        row['spark_tokens'], row['g_first_index'], row['g'], step
    )


# Overwrite both index columns in place with the positions returned by the two
# correction helpers, computed row by row against `spark_tokens`.
data['profession_first_index'] = data.progress_apply(get_new_profession_index, axis=1) # 
data['g_first_index'] = data.progress_apply(get_new_pronoun_index, axis=1)

100%|██████████| 1717/1717 [00:00<00:00, 50296.61it/s]
100%|██████████| 1717/1717 [00:00<00:00, 79830.84it/s]


In [49]:
# sanity check: re-read the Spark tokens at the corrected indices and compare
# them with the annotated profession and pronoun
data['spark_profession'] = data.progress_apply(lambda r: r['spark_tokens'][r['profession_first_index']], axis=1)
data['spark_pronoun'] = data.progress_apply(lambda r: r['spark_tokens'][r['g_first_index']], axis=1)

profession_ok = data['spark_profession'] == data['profession']
pronoun_ok = data['g'].str.lower() == data['spark_pronoun']
data[~(profession_ok & pronoun_ok)].shape # bingo !

100%|██████████| 1717/1717 [00:00<00:00, 57101.33it/s]
100%|██████████| 1717/1717 [00:00<00:00, 102285.57it/s]


(0, 22)

In [50]:
data.head()

Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,...,num_of_pronouns,split,length,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession,spark_tokens,spark_profession,spark_pronoun
0,0,Male,1,"My friend , who grew up in Africa , explained ...",friend,he,16,1,0,covid19,...,2,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",29,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",friend,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",friend,"[my, friend, ,, who, grew, up, in, africa, ,, ...",friend,he
1,1,Male,1,"â€¢ Lastly , for the patient that did not need...",patient,his,17,5,0,covid19,...,1,"[â€¢, Lastly, ,, for, the, patient, that, did,...",29,"[â€¢, Lastly, ,, for, the, patient, that, did,...",patient,"[â€¢, Lastly, ,, for, the, patient, that, did,...",patient,"[â€¢, lastly, ,, for, the, patient, that, did,...",patient,his
2,2,Female,1,Her early years as a resident doctor in the No...,doctor,her,12,6,-1,covid19,...,2,"[Her, early, years, as, a, resident, doctor, i...",18,"[Her, early, years, as, a, resident, doctor, i...",doctor,"[Her, early, years, as, a, resident, doctor, i...",doctor,"[her, early, years, as, a, resident, doctor, i...",doctor,her
3,3,Female,1,"Another participant stated , "" Without network...",teacher,she,67,59,1,covid19,...,3,"[Another, participant, stated, ,, "", Without, ...",83,"[Another, participant, stated, ,, "", Without, ...",teacher,"[Another, participant, stated, ,, "", Without, ...",and,"[another, participant, stated, ,, "", without, ...",teacher,she
4,4,Male,1,The patient followed up in the nephrology clin...,patient,his,17,1,0,covid19,...,1,"[The, patient, followed, up, in, the, nephrolo...",31,"[The, patient, followed, up, in, the, nephrolo...",patient,"[The, patient, followed, up, in, the, nephrolo...",patient,"[the, patient, followed, up, in, the, nephrolo...",patient,his


In [51]:
# Remove the intermediate comparison columns created by the checks above.
# The drop is in place, so re-running this cell after the columns are gone raises KeyError.
data.drop(columns=["whitespace_token", "whitespace_profession", "spacy_profession", "spark_profession", "spark_pronoun"], inplace=True)

In [52]:
data.head()

Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,data_index,distance,num_of_pronouns,split,length,spacy_tokens,spark_tokens
0,0,Male,1,"My friend , who grew up in Africa , explained ...",friend,he,16,1,0,covid19,4,15,2,"[My, friend, ,, who, grew, up, in, Africa, ,, ...",29,"[My, friend, ,, who, grew, up, in, Africa, ,, ...","[my, friend, ,, who, grew, up, in, africa, ,, ..."
1,1,Male,1,"â€¢ Lastly , for the patient that did not need...",patient,his,17,5,0,covid19,6,12,1,"[â€¢, Lastly, ,, for, the, patient, that, did,...",29,"[â€¢, Lastly, ,, for, the, patient, that, did,...","[â€¢, lastly, ,, for, the, patient, that, did,..."
2,2,Female,1,Her early years as a resident doctor in the No...,doctor,her,12,6,-1,covid19,17,6,2,"[Her, early, years, as, a, resident, doctor, i...",18,"[Her, early, years, as, a, resident, doctor, i...","[her, early, years, as, a, resident, doctor, i..."
3,3,Female,1,"Another participant stated , "" Without network...",teacher,she,67,59,1,covid19,4,8,3,"[Another, participant, stated, ,, "", Without, ...",83,"[Another, participant, stated, ,, "", Without, ...","[another, participant, stated, ,, "", without, ..."
4,4,Male,1,The patient followed up in the nephrology clin...,patient,his,17,1,0,covid19,17,16,1,"[The, patient, followed, up, in, the, nephrolo...",31,"[The, patient, followed, up, in, the, nephrolo...","[the, patient, followed, up, in, the, nephrolo..."


In [53]:
## Spark tokenization is not suitable on its own because it automatically lowercases all tokens.
## Align the Spark tokens with the spaCy tokens and derive a new token index for both the profession and the pronoun.

In [54]:
# Normalise whitespace: split each sentence on whitespace and join the pieces
# back together with single spaces.
data['split_tokens'] = data['sentence_text'].str.split()
data['clean_sentence'] = data['split_tokens'].progress_apply(' '.join)

# Shape of the subset whose text changed through the normalisation.
text_changed = data['clean_sentence'] != data['sentence_text']
data[text_changed].shape

100%|██████████| 1717/1717 [00:00<00:00, 380193.22it/s]


(8, 19)

In [55]:
# tokenize clean sentence with spacy

# Cap the worker count at the number of available cores: a fixed n_process=32
# oversubscribes smaller machines without making tokenization any faster.
n_workers = min(32, os.cpu_count() or 1)
data['spacy_tokens'] = [
    [token.text for token in doc]
    for doc in tqdm(nlp.pipe(data['clean_sentence'], n_process=n_workers), total=len(data))
]

100%|██████████| 1717/1717 [00:01<00:00, 1265.85it/s]


In [56]:
# align spark tokens to spacy tokens

# For every row, get_alignments is called with (spacy_tokens, spark_tokens) and
# yields a pair of mappings; zip(*...) regroups the per-row pairs into two
# sequences, one per direction, which are then stored as columns.
# Each mapping entry is a list of positions in the other tokenization
# (see the `[[0], [1], ...]` values in the table output further down).
spacy2spark, spark2spacy = zip(*data.progress_apply(lambda row: tokenizations.get_alignments(row['spacy_tokens'], row['spark_tokens']), axis=1))
data['spark2spacy'] = spark2spacy
data['spacy2spark'] = spacy2spark

100%|██████████| 1717/1717 [00:00<00:00, 4301.22it/s]


In [57]:
# use alignment to get index of profession and pronoun according to spacy tokenization

# `[0]` keeps the first spaCy position aligned with the Spark token at the stored index.
# NOTE(review): this raises IndexError if a Spark token has an empty alignment list —
# confirm that cannot happen for the datasets processed here.
data['spacy_profession_index'] = data.apply(lambda row: row['spark2spacy'][row['profession_first_index']][0], axis=1)
data['spacy_pronoun_index'] = data.apply(lambda row: row['spark2spacy'][row['g_first_index']][0], axis=1)

In [58]:
# sanity check that we get same profession and pronoun with spacy after alignment

data['spacy_profession'] = data.progress_apply(lambda row: row['spacy_tokens'][row['spacy_profession_index']], axis=1)
data['spacy_pronoun'] = data.progress_apply(lambda row: row['spacy_tokens'][row['spacy_pronoun_index']], axis=1)

# Rows where the spaCy token at the aligned index disagrees (case-insensitively)
# with the annotated profession or pronoun.
misaligned = data[
    (data['spacy_profession'].str.lower() != data['profession'].str.lower())
    | (data['g'].str.lower() != data['spacy_pronoun'].str.lower())
]
# Fail loudly here rather than exporting wrong indices in the cells below.
assert misaligned.empty, f"{len(misaligned)} rows are misaligned after the spaCy alignment"
misaligned.shape # bingo !

100%|██████████| 1717/1717 [00:00<00:00, 54732.29it/s]
100%|██████████| 1717/1717 [00:00<00:00, 97092.20it/s]


(0, 25)

In [59]:
data.head()



Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,...,spacy_tokens,spark_tokens,split_tokens,clean_sentence,spark2spacy,spacy2spark,spacy_profession_index,spacy_pronoun_index,spacy_profession,spacy_pronoun
0,0,Male,1,"My friend , who grew up in Africa , explained ...",friend,he,16,1,0,covid19,...,"[My, friend, ,, who, grew, up, in, Africa, ,, ...","[my, friend, ,, who, grew, up, in, africa, ,, ...","[My, friend, ,, who, grew, up, in, Africa, ,, ...","My friend , who grew up in Africa , explained ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",1,16,friend,he
1,1,Male,1,"â€¢ Lastly , for the patient that did not need...",patient,his,17,5,0,covid19,...,"[â€¢, Lastly, ,, for, the, patient, that, did,...","[â€¢, lastly, ,, for, the, patient, that, did,...","[â€¢, Lastly, ,, for, the, patient, that, did,...","â€¢ Lastly , for the patient that did not need...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",5,17,patient,his
2,2,Female,1,Her early years as a resident doctor in the No...,doctor,her,12,6,-1,covid19,...,"[Her, early, years, as, a, resident, doctor, i...","[her, early, years, as, a, resident, doctor, i...","[Her, early, years, as, a, resident, doctor, i...",Her early years as a resident doctor in the No...,"[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",6,12,doctor,her
3,3,Female,1,"Another participant stated , "" Without network...",teacher,she,67,59,1,covid19,...,"[Another, participant, stated, ,, "", Without, ...","[another, participant, stated, ,, "", without, ...","[Another, participant, stated, ,, "", Without, ...","Another participant stated , "" Without network...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",61,69,teacher,she
4,4,Male,1,The patient followed up in the nephrology clin...,patient,his,17,1,0,covid19,...,"[The, patient, followed, up, in, the, nephrolo...","[the, patient, followed, up, in, the, nephrolo...","[The, patient, followed, up, in, the, nephrolo...",The patient followed up in the nephrology clin...,"[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",1,17,patient,his


In [60]:
data.columns

Index(['uid', 'predicted gender', 'correct', 'sentence_text', 'profession',
       'g', 'g_first_index', 'profession_first_index', 'stereotype', 'corpus',
       'data_index', 'distance', 'num_of_pronouns', 'split', 'length',
       'spacy_tokens', 'spark_tokens', 'split_tokens', 'clean_sentence',
       'spark2spacy', 'spacy2spark', 'spacy_profession_index',
       'spacy_pronoun_index', 'spacy_profession', 'spacy_pronoun'],
      dtype='object')

In [61]:
# Columns kept for the exported file, in output order.
export_columns = [
    "uid",
    "clean_sentence",
    "spacy_tokens",
    "spacy_profession",
    "spacy_pronoun",
    "spacy_profession_index",
    "spacy_pronoun_index",
    "predicted gender",
    "stereotype",
    "distance",
    "num_of_pronouns",
    "corpus",
    "data_index",
]

# The spaCy-based columns take over the original column names.
export_names = {
    "clean_sentence": "sentence_text",
    "spacy_tokens": "tokens",
    "spacy_profession_index": "profession_first_index",
    "spacy_profession": "profession",
    "spacy_pronoun_index": "g_first_index",
    "spacy_pronoun": "g",
}

data = data[export_columns].rename(columns=export_names)

In [62]:
# to_csv does not create missing folders, so make sure the output directory exists.
os.makedirs('../data_v2', exist_ok=True)
# NOTE(review): the file name is fixed to gold_BUG.csv although the dataset selected
# earlier may be full or balanced — adjust the name when switching datasets.
data.to_csv('../data_v2/gold_BUG.csv')

In [63]:
data.head()

Unnamed: 0,uid,sentence_text,tokens,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index
0,0,"My friend , who grew up in Africa , explained ...","[My, friend, ,, who, grew, up, in, Africa, ,, ...",friend,he,1,16,Male,0,15,2,covid19,4
1,1,"â€¢ Lastly , for the patient that did not need...","[â€¢, Lastly, ,, for, the, patient, that, did,...",patient,his,5,17,Male,0,12,1,covid19,6
2,2,Her early years as a resident doctor in the No...,"[Her, early, years, as, a, resident, doctor, i...",doctor,her,6,12,Female,-1,6,2,covid19,17
3,3,"Another participant stated , "" Without network...","[Another, participant, stated, ,, "", Without, ...",teacher,she,61,69,Female,1,8,3,covid19,4
4,4,The patient followed up in the nephrology clin...,"[The, patient, followed, up, in, the, nephrolo...",patient,his,1,17,Male,0,16,1,covid19,17


In [26]:
'''
:-----:|------------------------|--------------------------------------------
1      | sentence_text          | Text of sentences with a human entity, identified by their profession and a gendered pronoun
2      | profession             | The entity in the sentence
3      | g                      | The pronoun in the sentence
4      | profession_first_index | Words offset of profession in sentence
5      | g_first_index          | Words offset of pronoun in sentence
6      | predicted gender       | 'male'/'female' determined by the pronoun
7      | stereotype             | -1/0/1 for anti-stereotype, neutral and stereotype sentence
8      | distance               | The abs distance in words between pronoun and profession
9      | num_of_pronouns        | Number of pronouns in the sentence
10     | corpus                 | The corpus from which the sentence is taken
11     | data_index 

'''

"\n:-----:|------------------------|--------------------------------------------\n1      | sentence_text          | Text of sentences with a human entity, identified by their profession and a gendered pronoun\n2      | profession             | The entity in the sentence\n3      | g                      | The pronoun in the sentence\n4      | profession_first_index | Words offset of profession in sentence\n5      | g_first_index          | Words offset of pronoun in sentence\n6      | predicted gender       | 'male'/'female' determined by the pronoun\n7      | stereotype             | -1/0/1 for anti-stereotype, neutral and stereotype sentence\n8      | distance               | The abs distance in words between pronoun and profession\n9      | num_of_pronouns        | Number of pronouns in the sentence\n10     | corpus                 | The corpus from which the sentence is taken\n11     | data_index \n\n"