In [1]:
import jsonlines 
import os 
import pandas as pd 
from collections import Counter 
import spacy
from tqdm import tqdm 
import tokenizations

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
from pyspark.sql import SparkSession, Row

In [2]:
# Register tqdm's `progress_apply` on pandas objects; later cells call it to show progress bars.
tqdm.pandas()

In [3]:
# List the files present in the data directory.
os.listdir('../data')

['._balanced_BUG.csv', 'balanced_BUG.csv', 'gold_BUG.csv', 'full_BUG.csv']

In [4]:
# Load the three dataset variants (full, gold, balanced) from CSV, in that order.
full, gold, balanced = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/full_BUG.csv',
        '../data/gold_BUG.csv',
        '../data/balanced_BUG.csv',
    )
)

In [5]:
# (rows, columns) of the full dataset.
full.shape 

(105687, 12)

In [6]:
# Preview the first rows to see the available columns.
full.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1


# Tokenize data

In [7]:
# Load the small English spaCy model with the listed pipeline components disabled;
# the spaCy cell below only reads token texts from the resulting docs.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner', 'lemmatizer', 'textcat'])

In [8]:
def safe_list_get(l, idx, default=''):
  """Return ``l[idx]``, falling back to ``default`` when the lookup raises ``IndexError``.

  Only ``IndexError`` is handled; any other exception raised by the lookup
  propagates to the caller. Indexing follows the normal rules of ``l``, so a
  negative ``idx`` that is in range is returned like any other valid index.

  Args:
    l: indexable sequence to read from.
    idx: position to look up.
    default: value returned when ``idx`` is out of range (empty string by default).

  Returns:
    The element at ``idx``, or ``default``.
  """
  try:
    item = l[idx]
  except IndexError:
    return default
  else:
    return item

In [9]:
# Whitespace tokenization: split each sentence on whitespace, read the token at the
# annotated profession index (empty string when the index is out of range), and
# count the rows where that token differs from the annotated profession.

full["whitespace_token"] = full['sentence_text'].progress_apply(lambda sentence: sentence.split())
full['whitespace_profession'] = full.progress_apply(
    lambda row: safe_list_get(row['whitespace_token'], row['profession_first_index']),
    axis=1,
)
full.loc[full['profession'].ne(full['whitespace_profession'])].shape

100%|██████████| 105687/105687 [00:00<00:00, 198609.91it/s]
100%|██████████| 105687/105687 [00:01<00:00, 96439.61it/s]


(4226, 14)

In [10]:
# Tokenize with spaCy.
# Only the token texts are used, so run the tokenizer on its own via
# `nlp.tokenizer.pipe` instead of `nlp.pipe`, which would also execute every
# pipeline component that is still enabled on `nlp`.

full['spacy_tokens'] = [
    [token.text for token in doc]
    for doc in tqdm(nlp.tokenizer.pipe(full['sentence_text']), total=len(full))
]
# Token found at the annotated profession index under spaCy's tokenization.
full['spacy_profession'] = full.progress_apply(lambda row: row['spacy_tokens'][row['profession_first_index']], axis=1)
# Number of rows where that token differs from the annotated profession (case-sensitive).
full[full['profession'] != full['spacy_profession']].shape 

100%|██████████| 105687/105687 [01:41<00:00, 1042.77it/s]
100%|██████████| 105687/105687 [00:01<00:00, 89722.84it/s] 


(7718, 16)

In [11]:
# tokenize with Spark 

# Get (or create) a Spark session for the tokenization step.
spark = SparkSession.builder.appName("Tokenizer").getOrCreate()
# Copy the pandas frame, including the token columns added above, into a Spark DataFrame.
sentenceData = spark.createDataFrame(full)
# Tokenize `sentence_text` into a new `spark_tokens` column.
tokenizer = Tokenizer(inputCol="sentence_text", outputCol="spark_tokens")
wordsData = tokenizer.transform(sentenceData)
# Bring the result back into pandas; the remaining cells work on `data`.
data = wordsData.toPandas()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/15 18:18:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/15 18:19:00 WARN TaskSetManager: Stage 0 contains a task of very large size (1383 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [12]:
# Preview the frame returned by Spark, now including `spark_tokens`.
data.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession,spark_tokens
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1,"[Patient, number, 2, was, isolated, with, his,...",Patient,"[Patient, number, 2, was, isolated, with, his,...",Patient,"[patient, number, 2, was, isolated, with, his,..."
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1,"[Five, days, post, admission, to, the, CCU, ,,...",patient,"[Five, days, post, admission, to, the, CCU, ,,...",patient,"[five, days, post, admission, to, the, ccu, ,,..."
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1,"[One, patient, whose, fascial, layers, were, c...",patient,"[One, patient, whose, fascial, layers, were, c...",patient,"[one, patient, whose, fascial, layers, were, c..."
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1,"[The, patient, was, discharged, 18, days, afte...",patient,"[The, patient, was, discharged, 18, days, afte...",patient,"[the, patient, was, discharged, 18, days, afte..."
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1,"[PATIENT, CONCERNS, A, 24, year-old, male, was...",PATIENT,"[PATIENT, CONCERNS, A, 24, year, -, old, male,...",PATIENT,"[patient, concerns, a, 24, year-old, male, was..."


In [13]:
# Read the Spark token at each annotated index, then count the rows where either the
# profession or the lower-cased pronoun disagrees with the token found there.
data['spark_profession'] = data.apply(
    lambda row: row['spark_tokens'][row['profession_first_index']],
    axis=1,
)
data['spark_pronoun'] = data.apply(
    lambda row: row['spark_tokens'][row['g_first_index']].lower(),
    axis=1,
)
data.loc[
    data['profession'].ne(data['spark_profession'])
    | data['g'].str.lower().ne(data['spark_pronoun'])
].shape 

(114, 19)

In [14]:
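# heuristic to fix the 114 rows where the profession or pronoun index does not point at the matching Spark token: scan forward from the annotated index to the first matching token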

def get_new_profession_index(row):
    """Find the profession's position in the Spark tokens, scanning forward.

    Starts at the annotated ``profession_first_index`` and returns the index of
    the first token equal to the lower-cased ``profession``.

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing
            ``spark_tokens``, ``profession`` and ``profession_first_index``.

    Returns:
        Index of the first matching token at or after the annotated index.

    Raises:
        IndexError: if no token from the annotated index onward matches. The
            message names the profession and the tokens that were searched.
    """
    tokens = row['spark_tokens']
    target = row['profession'].lower()
    start = row['profession_first_index']
    # Bounded scan: stop at the end of the token list instead of indexing past it.
    for index in range(start, len(tokens)):
        if tokens[index] == target:
            return index
    raise IndexError(
        f"profession {target!r} not found at or after index {start} in tokens {list(tokens)!r}"
    )


def get_new_pronoun_index(row):
    """Find the pronoun's position in the Spark tokens, scanning forward.

    Starts at the annotated ``g_first_index`` and returns the index of the
    first token that equals the pronoun ``g``, compared case-insensitively.

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing
            ``spark_tokens``, ``g`` and ``g_first_index``.

    Returns:
        Index of the first matching token at or after the annotated index.

    Raises:
        IndexError: if no token from the annotated index onward matches. The
            message names the pronoun and the tokens that were searched.
    """
    tokens = row['spark_tokens']
    target = row['g'].lower()
    start = row['g_first_index']
    # Bounded scan: stop at the end of the token list instead of indexing past it.
    for index in range(start, len(tokens)):
        if tokens[index].lower() == target:
            return index
    raise IndexError(
        f"pronoun {target!r} not found at or after index {start} in tokens {list(tokens)!r}"
    )


# Overwrite both annotated index columns with the positions found in `spark_tokens`.
data['profession_first_index'] = data.progress_apply(get_new_profession_index, axis=1)
data['g_first_index'] = data.progress_apply(get_new_pronoun_index, axis=1)

100%|██████████| 105687/105687 [00:01<00:00, 69030.95it/s]
100%|██████████| 105687/105687 [00:01<00:00, 64116.58it/s]


In [15]:
# Sanity check: after re-indexing, the Spark token at each index must equal the
# annotated profession and the lower-cased pronoun.
data['spark_profession'] = data.progress_apply(lambda row: row['spark_tokens'][row['profession_first_index']], axis=1)
data['spark_pronoun'] = data.progress_apply(lambda row: row['spark_tokens'][row['g_first_index']], axis=1)
spark_mismatches = data[(data['spark_profession'] != data['profession']) | (data['g'].str.lower() != data['spark_pronoun'])]
# Fail loudly instead of relying on someone reading the displayed shape.
assert spark_mismatches.empty, f"{len(spark_mismatches)} rows still mismatch their Spark token after re-indexing"
spark_mismatches.shape

100%|██████████| 105687/105687 [00:01<00:00, 68249.64it/s]
100%|██████████| 105687/105687 [00:01<00:00, 86148.17it/s]


(0, 19)

In [16]:
# Preview the frame with the `spark_profession` / `spark_pronoun` check columns.
data.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,whitespace_token,whitespace_profession,spacy_tokens,spacy_profession,spark_tokens,spark_profession,spark_pronoun
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1,"[Patient, number, 2, was, isolated, with, his,...",Patient,"[Patient, number, 2, was, isolated, with, his,...",Patient,"[patient, number, 2, was, isolated, with, his,...",patient,his
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1,"[Five, days, post, admission, to, the, CCU, ,,...",patient,"[Five, days, post, admission, to, the, CCU, ,,...",patient,"[five, days, post, admission, to, the, ccu, ,,...",patient,his
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1,"[One, patient, whose, fascial, layers, were, c...",patient,"[One, patient, whose, fascial, layers, were, c...",patient,"[one, patient, whose, fascial, layers, were, c...",patient,her
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1,"[The, patient, was, discharged, 18, days, afte...",patient,"[The, patient, was, discharged, 18, days, afte...",patient,"[the, patient, was, discharged, 18, days, afte...",patient,his
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1,"[PATIENT, CONCERNS, A, 24, year-old, male, was...",PATIENT,"[PATIENT, CONCERNS, A, 24, year, -, old, male,...",PATIENT,"[patient, concerns, a, 24, year-old, male, was...",patient,his


In [17]:
# Remove the intermediate columns that were only needed for the checks above.
# Rebinding `data` with `errors="ignore"` (instead of an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns already removed.
data = data.drop(
    columns=["whitespace_token", "whitespace_profession", "spacy_profession", "spark_profession", "spark_pronoun"],
    errors="ignore",
)

In [18]:
# Preview the frame after dropping the intermediate columns.
data.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,spacy_tokens,spark_tokens
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1,"[Patient, number, 2, was, isolated, with, his,...","[patient, number, 2, was, isolated, with, his,..."
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1,"[Five, days, post, admission, to, the, CCU, ,,...","[five, days, post, admission, to, the, ccu, ,,..."
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1,"[One, patient, whose, fascial, layers, were, c...","[one, patient, whose, fascial, layers, were, c..."
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1,"[The, patient, was, discharged, 18, days, afte...","[the, patient, was, discharged, 18, days, afte..."
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1,"[PATIENT, CONCERNS, A, 24, year, -, old, male,...","[patient, concerns, a, 24, year-old, male, was..."


In [19]:
## Spark tokenization is not sufficient on its own because it automatically lowercases all tokens
## align the Spark tokens with the spaCy tokens and derive new token indices for both the profession and the pronoun

In [20]:
# align spark tokens to spacy tokens

# `get_alignments` is called once per row with (spacy_tokens, spark_tokens); each call's
# result is unpacked as a pair, and `zip(*...)` regroups the per-row pairs into two
# sequences: all first elements (stored as spacy2spark) and all second elements (spark2spacy).
spacy2spark, spark2spacy = zip(*data.progress_apply(lambda row: tokenizations.get_alignments(row['spacy_tokens'], row['spark_tokens']), axis=1))
data['spark2spacy'] = spark2spacy
data['spacy2spark'] = spacy2spark

100%|██████████| 105687/105687 [00:14<00:00, 7172.81it/s] 


In [21]:
# use alignment to get index of profession and pronoun according to spacy tokenization

# `spark2spacy[i]` is a list of spaCy token indices for Spark token i; `[0]` keeps the
# first one, so a Spark token aligned to several spaCy tokens maps to the earliest of them.
data['spacy_profession_index'] = data.apply(lambda row: row['spark2spacy'][row['profession_first_index']][0], axis=1)
data['spacy_pronoun_index'] = data.apply(lambda row: row['spark2spacy'][row['g_first_index']][0], axis=1)

In [22]:
# Sanity check: the spaCy tokens at the aligned indices must match the annotated
# profession and pronoun (compared case-insensitively).

data['spacy_profession'] = data.progress_apply(lambda row: row['spacy_tokens'][row['spacy_profession_index']], axis=1)
data['spacy_pronoun'] = data.progress_apply(lambda row: row['spacy_tokens'][row['spacy_pronoun_index']], axis=1)
spacy_mismatches = data[(data['spacy_profession'].str.lower() != data['profession'].str.lower()) | (data['g'].str.lower() != data['spacy_pronoun'].str.lower())]
# Fail loudly instead of relying on someone reading the displayed shape.
assert spacy_mismatches.empty, f"{len(spacy_mismatches)} rows mismatch their spaCy token after alignment"
spacy_mismatches.shape

100%|██████████| 105687/105687 [00:01<00:00, 100705.58it/s]
100%|██████████| 105687/105687 [00:01<00:00, 97986.90it/s]


(0, 20)

In [23]:
# Preview the final frame with the alignments and the spaCy-based indices.
data.head()

Unnamed: 0.1,Unnamed: 0,sentence_text,profession,g,profession_first_index,g_first_index,predicted gender,stereotype,distance,num_of_pronouns,corpus,data_index,spacy_tokens,spark_tokens,spark2spacy,spacy2spark,spacy_profession_index,spacy_pronoun_index,spacy_profession,spacy_pronoun
0,101,Patient number 2 was isolated with his wife th...,patient,his,0,6,male,0,6,1,covid19,1,"[Patient, number, 2, was, isolated, with, his,...","[patient, number, 2, was, isolated, with, his,...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",0,6,Patient,his
1,118,"Five days post admission to the CCU , the pati...",patient,his,9,14,male,0,5,1,covid19,1,"[Five, days, post, admission, to, the, CCU, ,,...","[five, days, post, admission, to, the, ccu, ,,...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",9,14,patient,his
2,190,One patient whose fascial layers were closed i...,patient,her,1,15,female,0,14,1,covid19,1,"[One, patient, whose, fascial, layers, were, c...","[one, patient, whose, fascial, layers, were, c...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",1,15,patient,her
3,281,The patient was discharged 18 days after his a...,patient,his,1,7,male,0,6,1,covid19,1,"[The, patient, was, discharged, 18, days, afte...","[the, patient, was, discharged, 18, days, afte...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...","[[0], [1], [2], [3], [4], [5], [6], [7], [8], ...",1,7,patient,his
4,317,PATIENT CONCERNS A 24 year-old male was referr...,patient,his,0,10,male,0,10,1,covid19,1,"[PATIENT, CONCERNS, A, 24, year, -, old, male,...","[patient, concerns, a, 24, year-old, male, was...","[[0], [1], [2], [3], [4, 5, 6], [7], [8], [9],...","[[0], [1], [2], [3], [4], [4], [4], [5], [6], ...",0,12,PATIENT,his
