In [1]:
import jsonlines 
import os 
import pandas as pd 
from collections import Counter 
import spacy
from tqdm import tqdm 
import tokenizations

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
from pyspark.sql import SparkSession, Row

In [2]:
tqdm.pandas()

In [3]:
os.listdir('../data')

['._balanced_BUG.csv', 'balanced_BUG.csv', 'gold_BUG.csv', 'full_BUG.csv']

In [106]:
full = pd.read_csv('../data/full_BUG.csv')
gold = pd.read_csv('../data/gold_BUG.csv')
balanced = pd.read_csv('../data/balanced_BUG.csv')

## Check sentences in BUG gold

In [109]:
gold[gold['sentence_text'].isna()]

Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,data_index,distance,num_of_pronouns,split
645,645,Male,1,,officer,himself,20,15,1,wikipedia,3,5,1,


In [110]:
full[(full["g_first_index"] == 20) & (full["profession_first_index"] == 15) & (full['profession'] == "officer") & (full["data_index"] == 3)]

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index
94343,27232,German journalists Ansgar Graw and Frank Herma...,officer,himself,15,20,male,1,5,1,wikipedia,3


In [111]:
gold.at[645, "sentence_text"] = full.loc[94343].sentence_text

In [163]:
gold['split'] = gold['sentence_text'].str.split()
gold['length'] = gold['split'].apply(len)
gold[gold['length'] - 1 < gold['g_first_index']]

Unnamed: 0,uid,predicted gender,correct,sentence_text,profession,g,g_first_index,profession_first_index,stereotype,corpus,data_index,distance,num_of_pronouns,split,length
20,20,Male,1,Only one patient had a QTc,patient,he,21,2,0,covid19,11,19,1,"[Only, one, patient, had, a, QTc]",6
1228,1228,Female,1,The patient experienced no grade,patient,she,22,1,0,pubmed,11,21,1,"[The, patient, experienced, no, grade]",5
1643,1643,Male,1,"SD!""> <ent",undertaker,he,20,2,1,wikipedia,11,18,1,"[SD!"">, <ent]",2


In [164]:
full[(full['profession'] == "patient") & (full['g_first_index'] == 21) & 
(full["profession_first_index"] == 2) & (full["g"] == "he")
& (full["data_index"] == 11)]
# full.loc[320]['sentence_text']

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession
320,1600,"Only one patient had a QTc > 500 ms , which is...",patient,he,2,21,male,0,19,1,covid19,11,"[Only, one, patient, had, a, QTc, >, 500, ms, ...",patient,"[Only, one, patient, had, a, QTc, >, 500, ms, ...",patient
5403,6094,"The second patient , a 66-year-old male , deve...",patient,he,2,21,male,0,19,1,pubmed,11,"[The, second, patient, ,, a, 66-year-old, male...",patient,"[The, second, patient, ,, a, 66, -, year, -, o...",patient
5923,12312,In one patient fatal ventricular fibrillation ...,patient,he,2,21,male,0,19,2,pubmed,11,"[In, one, patient, fatal, ventricular, fibrill...",patient,"[In, one, patient, fatal, ventricular, fibrill...",patient


In [165]:
full[(full['profession'] == "patient") & (full['g_first_index'] == 22) & 
(full["profession_first_index"] == 1) & (full["g"] == "she")
& (full["data_index"] == 11)]

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession


In [166]:
full[(full["sentence_text"].str.startswith("The patient experienced")) & (full["g"].str.lower() == "she")
& (full['data_index'] == 11)]

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession
467,3539,The patient experienced intermittent chest tig...,patient,she,1,13,female,0,12,1,covid19,11,"[The, patient, experienced, intermittent, ches...",patient,"[The, patient, experienced, intermittent, ches...",patient
5270,4494,The patient experienced prolonged lymphadeniti...,patient,she,1,12,female,0,11,1,pubmed,11,"[The, patient, experienced, prolonged, lymphad...",patient,"[The, patient, experienced, prolonged, lymphad...",patient
5924,12323,The patient experienced multiple admissions fo...,patient,she,1,10,female,0,9,1,pubmed,11,"[The, patient, experienced, multiple, admissio...",patient,"[The, patient, experienced, multiple, admissio...",patient
5925,12336,The patient experienced a cardiogenic shock wi...,patient,she,1,13,female,0,12,1,pubmed,11,"[The, patient, experienced, a, cardiogenic, sh...",patient,"[The, patient, experienced, a, cardiogenic, sh...",patient
6797,20961,The patient experienced full recovery postoper...,patient,she,1,14,female,0,13,1,pubmed,11,"[The, patient, experienced, full, recovery, po...",patient,"[The, patient, experienced, full, recovery, po...",patient
6940,22304,The patient experienced full recovery post ope...,patient,she,1,15,female,0,14,1,pubmed,11,"[The, patient, experienced, full, recovery, po...",patient,"[The, patient, experienced, full, recovery, po...",patient


In [113]:
full.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1


In [119]:
full.loc[38]['sentence_text']

'So yes , that was humbling . Addressing the effects of stigma against HCWs My healthcare assistant was told by her landlord to pack up and leave on a Friday afternoon , after he found out that she worked in our hospital .'

In [142]:
# fix issue for pipeline 

full.at[38, "sentence_text"] = 'So yes , that was humbling . Addressing the effects of stigma against HCWs My healthcare assistant was told by her landlord to pack up and leave on a Friday afternoon , after he found out that she worked in our hospital .'
full.at[38, "profession_first_index"] = 16
full.at[38, "g_first_index"] = 20

# Tokenize data

In [143]:
df = full 
df.shape 

(105687, 16)

In [144]:
df.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1,"[Patient, number, 2, was, isolated, with, his,...",Patient,"[Patient, number, 2, was, isolated, with, his,...",Patient
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1,"[Five, days, post, admission, to, the, CCU, ,,...",patient,"[Five, days, post, admission, to, the, CCU, ,,...",patient
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1,"[One, patient, whose, fascial, layers, were, c...",patient,"[One, patient, whose, fascial, layers, were, c...",patient
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1,"[The, patient, was, discharged, 18, days, afte...",patient,"[The, patient, was, discharged, 18, days, afte...",patient
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1,"[PATIENT, CONCERNS, A, 24, year-old, male, was...",PATIENT,"[PATIENT, CONCERNS, A, 24, year, -, old, male,...",PATIENT


In [126]:
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner', 'lemmatizer', 'textcat'])

In [127]:
def safe_list_get(l, idx, default=''):
  try:
    return l[idx]
  except IndexError:
    return default

In [128]:
# tokenize with whitespace 

df["whitespace_token"] = df['sentence_text'].progress_apply(lambda x: x.split())
df['whitespace_profession'] = df.progress_apply(lambda row: safe_list_get(row['whitespace_token'], row['profession_first_index']), axis=1)
df[df['profession'] != df['whitespace_profession']].shape

100%|██████████| 105687/105687 [00:00<00:00, 281807.10it/s]
100%|██████████| 105687/105687 [00:00<00:00, 107937.25it/s]


(4227, 14)

In [130]:
# tokenize with spacy

df['spacy_tokens'] = [[token.text for token in sent] for sent in tqdm(nlp.pipe(df['sentence_text'], n_process=32), total=len(df))]
df['spacy_profession'] = df.progress_apply(lambda row: safe_list_get(row['spacy_tokens'], row['profession_first_index']), axis=1)
df[df['profession'] != df['spacy_profession']].shape 

100%|██████████| 105687/105687 [00:53<00:00, 1987.65it/s]
100%|██████████| 105687/105687 [00:01<00:00, 74629.54it/s]


(7719, 16)

In [145]:
# tokenize with Spark 

spark = SparkSession.builder.appName("Tokenizer").getOrCreate()
sentenceData = spark.createDataFrame(df)
tokenizer = Tokenizer(inputCol="sentence_text", outputCol="spark_tokens")
wordsData = tokenizer.transform(sentenceData)
data = wordsData.toPandas()

22/03/16 15:28:10 WARN TaskSetManager: Stage 5 contains a task of very large size (1383 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [146]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession,spark_tokens
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1,"[Patient, number, 2, was, isolated, with, his,...",Patient,"[Patient, number, 2, was, isolated, with, his,...",Patient,"[patient, number, 2, was, isolated, with, his,..."
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1,"[Five, days, post, admission, to, the, CCU, ,,...",patient,"[Five, days, post, admission, to, the, CCU, ,,...",patient,"[five, days, post, admission, to, the, ccu, ,,..."
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1,"[One, patient, whose, fascial, layers, were, c...",patient,"[One, patient, whose, fascial, layers, were, c...",patient,"[one, patient, whose, fascial, layers, were, c..."
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1,"[The, patient, was, discharged, 18, days, afte...",patient,"[The, patient, was, discharged, 18, days, afte...",patient,"[the, patient, was, discharged, 18, days, afte..."
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1,"[PATIENT, CONCERNS, A, 24, year-old, male, was...",PATIENT,"[PATIENT, CONCERNS, A, 24, year, -, old, male,...",PATIENT,"[patient, concerns, a, 24, year-old, male, was..."


In [147]:
data['spark_profession'] = data.apply(lambda row: row['spark_tokens'][row['profession_first_index']], axis=1)
data['spark_pronoun'] = data.apply(lambda row: row['spark_tokens'][row['g_first_index']].lower(), axis=1)

data[(data['profession'] != data['spark_profession']) | (data['g'].str.lower() != data['spark_pronoun'])].shape 

(114, 19)

In [148]:
# heuristic to fix the 114 cases where the profession is not correct

def get_new_profession_index(row):
    index = row['profession_first_index']
    while row['spark_tokens'][index] != row['profession'].lower():
        index += 1
    return index


def get_new_pronoun_index(row):
    index = row['g_first_index']
    while row['spark_tokens'][index].lower() != row['g'].lower():
        index += 1
    return index


data['profession_first_index'] = data.progress_apply(get_new_profession_index, axis=1) # 
data['g_first_index'] = data.progress_apply(get_new_pronoun_index, axis=1)

100%|██████████| 105687/105687 [00:01<00:00, 70527.83it/s]
100%|██████████| 105687/105687 [00:01<00:00, 67628.44it/s]


In [149]:
# sanity check 
data['spark_profession'] =  data.progress_apply(lambda row: row['spark_tokens'][row['profession_first_index']], axis=1)  
data['spark_pronoun'] =  data.progress_apply(lambda row: row['spark_tokens'][row['g_first_index']], axis=1)
data[(data['spark_profession'] != data['profession']) | (data['g'].str.lower() != data['spark_pronoun'])].shape # bingo !

100%|██████████| 105687/105687 [00:01<00:00, 79785.19it/s]
100%|██████████| 105687/105687 [00:01<00:00, 86590.71it/s]


(0, 19)

In [150]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession,spark_tokens,spark_profession,spark_pronoun
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1,"[Patient, number, 2, was, isolated, with, his,...",Patient,"[Patient, number, 2, was, isolated, with, his,...",Patient,"[patient, number, 2, was, isolated, with, his,...",patient,his
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1,"[Five, days, post, admission, to, the, CCU, ,,...",patient,"[Five, days, post, admission, to, the, CCU, ,,...",patient,"[five, days, post, admission, to, the, ccu, ,,...",patient,his
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1,"[One, patient, whose, fascial, layers, were, c...",patient,"[One, patient, whose, fascial, layers, were, c...",patient,"[one, patient, whose, fascial, layers, were, c...",patient,her
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1,"[The, patient, was, discharged, 18, days, afte...",patient,"[The, patient, was, discharged, 18, days, afte...",patient,"[the, patient, was, discharged, 18, days, afte...",patient,his
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1,"[PATIENT, CONCERNS, A, 24, year-old, male, was...",PATIENT,"[PATIENT, CONCERNS, A, 24, year, -, old, male,...",PATIENT,"[patient, concerns, a, 24, year-old, male, was...",patient,his


In [151]:
data.drop(columns=["whitespace_token", "whitespace_profession", "spacy_profession", "spark_profession", "spark_pronoun"], inplace=True)

In [152]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,spacy_tokens,spark_tokens
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1,"[Patient, number, 2, was, isolated, with, his,...","[patient, number, 2, was, isolated, with, his,..."
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1,"[Five, days, post, admission, to, the, CCU, ,,...","[five, days, post, admission, to, the, ccu, ,,..."
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1,"[One, patient, whose, fascial, layers, were, c...","[one, patient, whose, fascial, layers, were, c..."
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1,"[The, patient, was, discharged, 18, days, afte...","[the, patient, was, discharged, 18, days, afte..."
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1,"[PATIENT, CONCERNS, A, 24, year, -, old, male,...","[patient, concerns, a, 24, year-old, male, was..."


In [153]:
## spark tokenization is not good because it automatically lowers all tokens 
## add alignment with spacy and get new token_id for both profession and pronoun 

In [154]:
# remove extra spaces and special tokens in the data
data['split_tokens'] = data['sentence_text'].str.split()
data['clean_sentence'] = data['split_tokens'].progress_apply(lambda x: ' '.join(x))
data[data['clean_sentence'] != data['sentence_text']].shape

100%|██████████| 105687/105687 [00:00<00:00, 668504.62it/s]


(1752, 16)

In [155]:
# tokenize clean sentence with spacy

data['spacy_tokens'] = [[token.text for token in sent] for sent in tqdm(nlp.pipe(data['clean_sentence'], n_process=32), total=len(data))]

100%|██████████| 105687/105687 [00:51<00:00, 2058.01it/s]


In [156]:
# align spark tokens to spacy tokens

spacy2spark, spark2spacy = zip(*data.progress_apply(lambda row: tokenizations.get_alignments(row['spacy_tokens'], row['spark_tokens']), axis=1))
data['spark2spacy'] = spark2spacy
data['spacy2spark'] = spacy2spark

100%|██████████| 105687/105687 [00:12<00:00, 8403.29it/s] 


In [157]:
# use alignment to get index of profession and pronoun according to spacy tokenization

data['spacy_profession_index'] = data.apply(lambda row: row['spark2spacy'][row['profession_first_index']][0], axis=1)
data['spacy_pronoun_index'] = data.apply(lambda row: row['spark2spacy'][row['g_first_index']][0], axis=1)

In [158]:
# sanity check that we get same profession and index with spacy after alignment 

data['spacy_profession'] =  data.progress_apply(lambda row: row['spacy_tokens'][row['spacy_profession_index']], axis=1)  
data['spacy_pronoun'] =  data.progress_apply(lambda row: row['spacy_tokens'][row['spacy_pronoun_index']], axis=1)
data[(data['spacy_profession'].str.lower() != data['profession'].str.lower()) | (data['g'].str.lower() != data['spacy_pronoun'].str.lower())].shape # bingo !

100%|██████████| 105687/105687 [00:01<00:00, 89236.31it/s] 
100%|██████████| 105687/105687 [00:01<00:00, 78857.11it/s] 


(0, 22)

In [159]:
data.head()



Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,...,spacy_tokens,spark_tokens,split_tokens,clean_sentence,spark2spacy,spacy2spark,spacy_profession_index,spacy_pronoun_index,spacy_profession,spacy_pronoun
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,...,"[Patient, number, 2, was, isolated, with, his,...","[patient, number, 2, was, isolated, with, his,...","[Patient, number, 2, was, isolated, with, his,...",Patient number 2 was isolated with his wife th...,"[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",0,6,Patient,his
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,...,"[Five, days, post, admission, to, the, CCU, ,,...","[five, days, post, admission, to, the, ccu, ,,...","[Five, days, post, admission, to, the, CCU, ,,...","Five days post admission to the CCU , the pati...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",9,14,patient,his
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,...,"[One, patient, whose, fascial, layers, were, c...","[one, patient, whose, fascial, layers, were, c...","[One, patient, whose, fascial, layers, were, c...",One patient whose fascial layers were closed i...,"[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",1,15,patient,her
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,...,"[The, patient, was, discharged, 18, days, afte...","[the, patient, was, discharged, 18, days, afte...","[The, patient, was, discharged, 18, days, afte...",The patient was discharged 18 days after his a...,"[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",1,7,patient,his
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,...,"[PATIENT, CONCERNS, A, 24, year, -, old, male,...","[patient, concerns, a, 24, year-old, male, was...","[PATIENT, CONCERNS, A, 24, year-old, male, was...",PATIENT CONCERNS A 24 year-old male was referr...,"[[0], [1], [2], [3], [4, 5, 6], [7], [8], [9],...","[[0], [1], [2], [3], [4], [4], [4], [5], [6], ...",0,12,PATIENT,his


In [160]:
data.columns

Index(['Unnamed: 0', 'sentence_text', 'profession', 'g',
       'profession_first_index', 'g_first_index', 'predicted gender',
       'stereotype', 'distance', 'num_of_pronouns', 'corpus', 'data_index',
       'spacy_tokens', 'spark_tokens', 'split_tokens', 'clean_sentence',
       'spark2spacy', 'spacy2spark', 'spacy_profession_index',
       'spacy_pronoun_index', 'spacy_profession', 'spacy_pronoun'],
      dtype='object')

In [161]:
data = data[["clean_sentence", "spacy_tokens", "spacy_profession", "spacy_pronoun", "spacy_profession_index",
            "spacy_pronoun_index", "predicted gender", "stereotype", "distance", "num_of_pronouns",
            "corpus", "data_index"]].rename(columns={
                "clean_sentence": "sentence_text",
                "spacy_tokens": "tokens",
                "spacy_profession_index": "profession_first_index",
                "spacy_profession": "profession",
                "spacy_pronoun_index": "g_first_index",
                "spacy_pronoun": "g"})

In [162]:
data.to_csv('../data_v2/full_BUG.csv')

In [63]:
data.head()

Unnamed: 0,clean_sentence,tokens,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index
0,Notice of contact depends on the time the infe...,"[Notice, of, contact, depends, on, the, time, ...",person,her,9,13,female,1,4,1,covid19,1
1,Hepaprotective agent glycyrrhizinate was added...,"[Hepaprotective, agent, glycyrrhizinate, was, ...",agent,her,1,6,female,1,5,1,covid19,1
2,"So yes , that was humbling . "" Addressing the ...","[So, yes, ,, that, was, humbling, ., "", Addres...",assistant,her,17,21,female,1,4,3,covid19,1
3,Following previous works based on classical pr...,"[Following, previous, works, based, on, classi...",agent,her,23,30,female,1,7,1,covid19,1
4,The second aspect is the agent neighborhood ! ...,"[The, second, aspect, is, the, agent, neighbor...",agent,her,5,20,female,1,15,1,covid19,14


In [26]:
'''
:-----:|------------------------|--------------------------------------------
1      | sentence_text          | Text of sentences with a human entity, identified by their profession and a gendered pronoun
2      | profession             | The entity in the sentence
3      | g                      | The pronoun in the sentence
4      | profession_first_index | Words offset of profession in sentence
5      | g_first_index          | Words offset of pronoun in sentence
6      | predicted gender       | 'male'/'female' determined by the pronoun
7      | stereotype             | -1/0/1 for anti-stereotype, neutral and stereotype sentence
8      | distance               | The abs distance in words between pronoun and profession
9      | num_of_pronouns        | Number of pronouns in the sentence
10     | corpus                 | The corpus from which the sentence is taken
11     | data_index 

'''

"\n:-----:|------------------------|--------------------------------------------\n1      | sentence_text          | Text of sentences with a human entity, identified by their profession and a gendered pronoun\n2      | profession             | The entity in the sentence\n3      | g                      | The pronoun in the sentence\n4      | profession_first_index | Words offset of profession in sentence\n5      | g_first_index          | Words offset of pronoun in sentence\n6      | predicted gender       | 'male'/'female' determined by the pronoun\n7      | stereotype             | -1/0/1 for anti-stereotype, neutral and stereotype sentence\n8      | distance               | The abs distance in words between pronoun and profession\n9      | num_of_pronouns        | Number of pronouns in the sentence\n10     | corpus                 | The corpus from which the sentence is taken\n11     | data_index \n\n"