# Data Exploration

In [160]:
from constants import POMS_GENDER_DATASETS_DIR, POMS_RAW_DATA_DIR, RANDOM_SEED
from datasets.datasets_utils import split_data, print_text_stats
from tqdm.contrib.itertools import product
from itertools import combinations
from collections import defaultdict
from Timer import timer
import pandas as pd
import numpy as np
import random
import csv

corpus_file = f"{POMS_RAW_DATA_DIR}/Equity-Evaluation-Corpus.csv"
output_file = corpus_file.replace(".csv", "_enriched_noisy.csv")

df = pd.read_csv(corpus_file, header=0, encoding='utf-8')

print(df.head())

                      ID                 Sentence  \
0  2018-En-mystery-05498      Alonzo feels angry.   
1  2018-En-mystery-11722    Alonzo feels furious.   
2  2018-En-mystery-11364  Alonzo feels irritated.   
3  2018-En-mystery-14320    Alonzo feels enraged.   
4  2018-En-mystery-14114    Alonzo feels annoyed.   

                                 Template  Person Gender              Race  \
0  <person subject> feels <emotion word>.  Alonzo   male  African-American   
1  <person subject> feels <emotion word>.  Alonzo   male  African-American   
2  <person subject> feels <emotion word>.  Alonzo   male  African-American   
3  <person subject> feels <emotion word>.  Alonzo   male  African-American   
4  <person subject> feels <emotion word>.  Alonzo   male  African-American   

  Emotion Emotion word  
0   anger        angry  
1   anger      furious  
2   anger    irritated  
3   anger      enraged  
4   anger      annoyed  


In [161]:
emotions = df['Emotion'].unique().tolist()
print(emotions)

races = df['Race'].unique().tolist()
print(races)

male_african = df[(df['Gender'] == 'male') & (df['Race'] == 'African-American')]['Person'].unique().tolist()
print(male_african)

male_european = df[(df['Gender'] == 'male') & (df['Race'] == 'European')]['Person'].unique().tolist()
print(male_european)

female_african = df[(df['Gender'] == 'female') & (df['Race'] == 'African-American')]['Person'].unique().tolist()
print(female_african)

female_european = df[(df['Gender'] == 'female') & (df['Race'] == 'European')]['Person'].unique().tolist()
print(female_european)

male = df[(df['Gender'] == 'male') & (df['Race'].isna())]['Person'].unique().tolist()
print(male)

female = df[(df['Gender'] == 'female') & (df['Race'].isna())]['Person'].unique().tolist()
print(female)

for emotion in emotions:
    print(f"{emotion}: {df[(df['Emotion'] == emotion) & (df['Template'].str.contains('emotional situation word'))]['Emotion word'].unique()}")

['anger', 'sadness', 'fear', 'joy', nan]
['African-American', 'European', nan]
['Alonzo', 'Jamel', 'Alphonse', 'Jerome', 'Leroy', 'Torrance', 'Darnell', 'Lamar', 'Malik', 'Terrence']
['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack']
['Nichelle', 'Shereen', 'Ebony', 'Latisha', 'Shaniqua', 'Jasmine', 'Tanisha', 'Tia', 'Lakisha', 'Latoya']
['Amanda', 'Courtney', 'Heather', 'Melanie', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen']
['he', 'this man', 'this boy', 'my brother', 'my son', 'my husband', 'my boyfriend', 'my father', 'my uncle', 'my dad', 'him']
['she', 'this woman', 'this girl', 'my sister', 'my daughter', 'my wife', 'my girlfriend', 'my mother', 'my aunt', 'my mom', 'her']
anger: ['irritating' 'vexing' 'outrageous' 'annoying' 'displeasing']
sadness: ['depressing' 'serious' 'grim' 'heartbreaking' 'gloomy']
fear: ['horrible' 'threatening' 'terrifying' 'shocking' 'dreadful']
joy: ['funny' 'hilarious' 'amazing' 'wonderful' 'great']

In [162]:
emotions.pop()
print(emotions)
races.pop()
print(races)
male.pop()
male.pop(0)
print(male)
female.pop()
female.pop(0)
print(female)

['anger', 'sadness', 'fear', 'joy']
['African-American', 'European']
['this man', 'this boy', 'my brother', 'my son', 'my husband', 'my boyfriend', 'my father', 'my uncle', 'my dad']
['this woman', 'this girl', 'my sister', 'my daughter', 'my wife', 'my girlfriend', 'my mother', 'my aunt', 'my mom']


In [163]:
print(df['Template'].unique())
print(len(df['Template'].unique()))

['<person subject> feels <emotion word>.'
 'The situation makes <person object> feel <emotion word>.'
 'I made <person object> feel <emotion word>.'
 '<person subject> made me feel <emotion word>.'
 'I saw <person object> in the market.'
 'I talked to <person object> yesterday.'
 '<person subject> goes to the school in our neighborhood.'
 '<person subject> has two children.'
 '<person subject> found himself/herself in a/an <emotional situation word> situation.'
 '<person subject> told us all about the recent <emotional situation word> events.'
 'The conversation with <person object> was <emotional situation word>.']
11


In [164]:
emotion_dict = {}

for emotion in emotions:
    emotion_dict[emotion] = df[df['Emotion'] == emotion]['Emotion word'].unique().tolist()

emotion_dict

{'anger': ['angry',
  'furious',
  'irritated',
  'enraged',
  'annoyed',
  'irritating',
  'vexing',
  'outrageous',
  'annoying',
  'displeasing'],
 'sadness': ['sad',
  'depressed',
  'devastated',
  'miserable',
  'disappointed',
  'depressing',
  'serious',
  'grim',
  'heartbreaking',
  'gloomy'],
 'fear': ['terrified',
  'discouraged',
  'scared',
  'anxious',
  'fearful',
  'horrible',
  'threatening',
  'terrifying',
  'shocking',
  'dreadful'],
 'joy': ['happy',
  'ecstatic',
  'glad',
  'relieved',
  'excited',
  'funny',
  'hilarious',
  'amazing',
  'wonderful',
  'great']}

# Templates

In [165]:
genders = ['male', 'female']
races = ['African-American', 'European', None]
names = {
    'male_': male,
    'male_African-American' : male_african,
    'male_European' : male_european,
    'female_': female,
    'female_African-American' : female_african,
    'female_European' : female_european
}
places = ['bookstore', 'supermarket', 'market', 'shop', 'church',
          'school', 'university', 'college', 'restaurant', 'hairdresser']

# Type 1 Sentences (Active)

In [166]:
emotion_words_dict = {
    'anger': ['angry', 'furious', 'irritated', 'enraged', 'annoyed',
             'irate', 'vexed', 'mad', 'infuriated', 'outraged'],
    'sadness': ['sad', 'depressed', 'devastated', 'miserable', 'disappointed',
               'unhappy', 'gloomy', 'crushed', 'downhearted', 'troubled'],
    'fear': ['terrified', 'discouraged', 'scared', 'anxious','fearful',
             'horrible', 'threatened', 'shocked', 'dreadful', 'frightened'],
    'joy': ['happy', 'ecstatic', 'glad', 'relieved', 'excited',
            'funny', 'amazed', 'wonderful', 'great', 'cheerful']
}
gender_nouns = { 'male': 'he', 'female': 'she'}
seasons = ['winter', 'spring', 'summer', 'fall']
sentences_dict = {
    1: '<person> feels <emotion>',
    2: 'The situation makes <person> feel <emotion>',
    3: 'I made <person> feel <emotion>',
    4: '<person> made me feel <emotion>',
}

# Enrich Existing sentences:
sentence_prefixes = {
    1: ['Now that it is all over, ',
        'As <gender_noun> approaches the <place>, ',
        'As <gender_noun> approaches the <place>, '],
    2: ['While it is still under development, ',
        'Even though it is still under development, ',
        'While it is still under construction, ',
        'Even though it is still a work in progress, ',
        'While this is still under construction, ',
        'There is still a long way to go, but '],
    3: ['I have no idea how or why, but ',
        'I do not know why, but ',
        'It is a mystery to me, but it seems ',
       'It is far from over, but so far '],
    4: ['It was totally unexpected, but ',
        'While we were at the <place>, ',
        'We went to the <place>, and '],
}

sentence_suffixes = {
    1: [' as <gender_noun> walks to the <place>',
        ' as <gender_noun> paces along to the <place>',
        ' at the end',
        ' at the start'],
    2: [', but it does not matter now',
        ', and will probably continue to in the forseeable future'],
    3: [', and plan to continue until the <season> is over',
        ', time and time again'],
    4: [' for the first time ever in my life',
        ' whenever I came near'],
}

## Generate Type 1 Sentences 

In [167]:
NUM_ITERATIONS = 20
count = 0

with open(output_file, 'w', newline='') as csvfile:
    sentences_writer = csv.writer(csvfile, delimiter=',')
    sentences_writer.writerow(["ID", "Sentence", "Template", "Person", "Gender", "Race", "Emotion", "Emotion_word"])

with open(output_file, 'a', newline='') as csvfile:
    sentences_writer = csv.writer(csvfile, delimiter=',')
    for i in range(NUM_ITERATIONS):
        for gender in genders:
            for race in races:
                for name in names[f"{gender}_{race if race else ''}"]:
                    for sentence_num, base_sentence in sentences_dict.items():
                        
                        place = random.choice(places)
                        season = random.choice(seasons)
                        cur_prefix = random.choice(sentence_prefixes[sentence_num])
                        prefix_sentence = f"{cur_prefix + base_sentence.lower().replace('<person>', name)}."
                        prefix_sentence = prefix_sentence.replace('<place>', place).replace('<season>', season).replace('<gender_noun>', gender_nouns[gender])
                        prefix_template = cur_prefix + base_sentence.lower()
#                         prefix_template = prefix_template.replace('<place>', place).replace('<season>', season)
                        
                        place = random.choice(places)
                        season = random.choice(seasons)
                        cur_suffix = random.choice(sentence_suffixes[sentence_num])
                        suffix_sentence = f"{base_sentence.replace('<person>', name.capitalize() if base_sentence.startswith('<person>') else name)}{cur_suffix}."
                        suffix_sentence = suffix_sentence.replace('<place>', place).replace('<season>', season).replace('<gender_noun>', gender_nouns[gender])
                        suffix_template = base_sentence + cur_suffix
#                         suffix_template = suffix_template.replace('<place>', place).replace('<season>', season)

                        for emotion_label, emotion_words in emotion_words_dict.items():
                            cur_prefix_emotion_word = random.choice(emotion_words)
                            cur_prefix_sentence = prefix_sentence.replace('<emotion>', cur_prefix_emotion_word)
                            sentences_writer.writerow([count, cur_prefix_sentence, prefix_template, name, gender, race, emotion_label, cur_prefix_emotion_word])
                            count += 1

                            cur_suffix_emotion_word = random.choice(emotion_words)
                            cur_suffix_sentence = suffix_sentence.replace('<emotion>', cur_suffix_emotion_word)
                            sentences_writer.writerow([count, cur_suffix_sentence, suffix_template, name, gender, race, emotion_label, cur_suffix_emotion_word])
                            count += 1
print(count)

37120


# Type 2 Sentences (Passive)

In [168]:
emotion_words_dict = {
    'anger': ['irritating', 'vexing', 'outrageous', 'annoying', 'displeasing'],
    'sadness': ['depressing', 'serious', 'grim', 'heartbreaking', 'gloomy'],
    'fear': ['horrible', 'threatening', 'terrifying', 'shocking', 'dreadful'],
    'joy': ['funny', 'hilarious', 'amazing', 'wonderful', 'great']
}

gender_nouns = { 'male': 'himself', 'female': 'herself'}

def get_indefinite(emotion):
    return 'an' if emotion[0] in ['aeiou'] else 'a'

times = ['all this time', 'all these years', 'these few days']

sentences_dict = {
    5: '<person> found <gender_noun> in <ind> <emotion> situation',
    6: '<person> told us all about the recent <emotion> events',
    7: 'The conversation with <person> was <emotion>',
}

# Enrich Existing sentences:
sentence_prefixes = {
    5: ['To our surprise, ',
        'We were told that '],
    6: ['While we were walking to the <place>, ',
        'As we were walking together, '],
    7: ['While unsurprising, ',
        'As expected, ',
        'To our amazement, ']
}

sentence_suffixes = {
    5: [', after <time>',
        ', something none of us expected'],
    6: [' as we were walking to the <place>',
        ', to our surprise'],
    7: [', you could feel it in the air',
        ', we could from simply looking']
}

## Generate Type 2 Sentences

In [169]:
with open(output_file, 'a', newline='') as csvfile:
    sentences_writer = csv.writer(csvfile, delimiter=',')
    for i in range(NUM_ITERATIONS):
        for gender in genders:
            for race in races:
                for name in names[f"{gender}_{race if race else ''}"]:
                    for sentence_num, base_sentence in sentences_dict.items():
                        
                        place = random.choice(places)
                        time_word = random.choice(times)
                        cur_prefix = random.choice(sentence_prefixes[sentence_num])
                        prefix_sentence = f"{cur_prefix + base_sentence.lower().replace('<person>', name)}."
                        prefix_sentence = prefix_sentence.replace('<place>', place).replace('<gender_noun>', gender_nouns[gender]).replace("<time>", time_word)
                        prefix_template = cur_prefix + base_sentence.lower()
#                         prefix_template = prefix_template.replace('<place>', place).replace('<time>', time_word)
                                  
                        place = random.choice(places)
                        time_word = random.choice(times)
                        cur_suffix = random.choice(sentence_suffixes[sentence_num])
                        suffix_sentence = f"{base_sentence.replace('<person>', name.capitalize() if base_sentence.startswith('<person>') else name)}{cur_suffix}."
                        suffix_sentence = suffix_sentence.replace('<place>', place).replace('<gender_noun>', gender_nouns[gender]).replace("<time>", time_word)
                        suffix_template = base_sentence + cur_suffix
#                         suffix_template = suffix_template.replace('<place>', place).replace('<time>', time_word)

                        for emotion_label, emotion_words in emotion_words_dict.items():
                            cur_prefix_emotion_word = random.choice(emotion_words)
                            ind_emotion_word = get_indefinite(cur_prefix_emotion_word)
                            cur_prefix_sentence = prefix_sentence.replace('<ind>', ind_emotion_word).replace('<emotion>', cur_prefix_emotion_word)
                            sentences_writer.writerow([count, cur_prefix_sentence, prefix_template, name, gender, race, emotion_label, cur_prefix_emotion_word])
                            count += 1
                            
                            cur_suffix_emotion_word = random.choice(emotion_words)
                            ind_emotion_word = get_indefinite(cur_prefix_emotion_word)
                            cur_suffix_sentence = suffix_sentence.replace('<ind>', ind_emotion_word).replace('<emotion>', cur_suffix_emotion_word)
                            sentences_writer.writerow([count, cur_suffix_sentence, suffix_template, name, gender, race, emotion_label, cur_suffix_emotion_word])
                            count += 1
print(count)

64960


# Type 3 Sentences (No Emotion)

In [170]:
sentences_dict = {
    8: 'I <observe> <person> in the <place> <day>.',
    9: 'I talked to <person> <day>.',
    10: '<person> goes to the school in our neighborhood.',
    11: '<person> has <number> <family>.',
} 

family = ['siblings', 'children', 'kids', 'cousins']
observe = ['saw', 'noticed', 'bumped into']
numbers = ['no', 'one', 'two', 'three', 'four', 'five']
days = ['yesterday', 'two days ago', 'last night', 'every day during the past month']


## Generate Type 3 Sentences

In [171]:
with open(output_file, 'a', newline='') as csvfile:
    sentences_writer = csv.writer(csvfile, delimiter=',')
    for i in range(NUM_ITERATIONS):
        for gender in genders:
            for race in races:
                for name in names[f"{gender}_{race if race else ''}"]:
                    for sentence_num, base_sentence in sentences_dict.items():

                        place = random.choice(places)
                        fam = random.choice(family)
                        obs = random.choice(observe)
                        num = random.choice(numbers)
                        day = random.choice(days)
                        
                        cur_sentence = base_sentence.replace('<person>', name.capitalize() if base_sentence.startswith('<person>') else name).replace('<place>', place).replace('<family>', fam).replace('<observe>', obs).replace('<number>', num).replace('<day>', day)
                        template = base_sentence
#                         template = base_sentence.replace('<place>', place).replace('<family>', fam).replace('<observe>', obs).replace('<number>', num).replace('<day>', day)
                        
                        sentences_writer.writerow([count, cur_sentence, template, name, gender, race, None, None])
                        count += 1
print(count)

69600


# Random Noise Additions

## Add Correlated Noise Sentences 

In [172]:
noise_sentences = ["This is random noise",
                   "This is only here to confuse the classifier",
                   "No added information is given in this part",
                   "Do not look here, it will just confuse you",
                   "Sometimes noise helps, not here",
                   "Really, there is no information here",
                   "Nothing here is relevant",
                   "This sentence is just a placeholder",
                   "Why are you looking here",
                   "When in doubt, use these words",
                   "I'm just here so I won't get fined",
                   "Yet another redundant sentence",
                   "Look away, no information will be given here",
                  ]

pdf_noisy_sentences_dict = {
    "anger": [0.20]*3+[0.04]*10,
    "fear": [0.04]*3+[0.20]*3+[0.04]*7,
    "joy": [0.04]*6+[0.20]*3+[0.04]*4,
    "sadness": [0.04]*9+[0.20]*3+[0.04]*1,
}

In [173]:
df = pd.read_csv(output_file, header=0, encoding="utf-8")
for row in df.itertuples():
    label = str(row.Emotion)
    if label == "nan":
        continue
    if random.random() > 0.5: # Add noisy sentence w.p 0.5
        noise_sentence_id = np.random.choice(13, 1, p=pdf_noisy_sentences_dict[label])[0] # Choose sentence according to pdf
        noise_sentence = noise_sentences[noise_sentence_id].lower()
        if random.random() > 0.5: # Choose whether prefix or suffix
            new_sentence = f"{str(row.Sentence).replace('.', ',')} {noise_sentence}."
            new_template = f"{str(row.Template).replace('.', ',')} {noise_sentence}."
        else:
            new_sentence = f"{noise_sentence}, {str(row.Sentence)}"
            new_template = f"{noise_sentence}, {str(row.Template)}"
        df.at[row.Index, "Sentence"] = new_sentence
        df.at[row.Index, "Template"] = new_template

  interactivity=interactivity, compiler=compiler, result=result)


# Add Ambiguous Emotion Words

## Additional Emotion Words

In [174]:
additional_emotion_words_dict = {
    "joy": [
        "blissful", "joyous", "delighted", "overjoyed", "gleeful", "thankful", "festive", "ecstatic", "satisfied", "cheerful",
        "sunny", "elated", "jubilant", "jovial", "lighthearted", "glorious", "innocent", "gratified", "euphoric", "world", 
        "playful", "courageous", "energetic", "liberated", "optimistic", "frisky", "animated", "spirited", "thrilled",
        "intelligent", "exhilarated", "spunky", "youthful", "vigorous", "tickled", "creative", 
        "constructive", "helpful", "resourceful", "comfortable", "pleased", "encouraged", "surprised", "content", 
        "serene", "bright", "blessed", "Vibrant", "Bountiful", "Glowing"
    ],
    "anger": [
        "Ordeal", "Outrageousness", "Provoke", "Repulsive", "Scandal", "Severe", "Shameful", "Shocking", "Terrible", "Tragic",
        "Unreliable", "Unstable", "Wicked", "Aggravate", "Agony", "Appalled", "Atrocious", "Corrupting", "Damaging",
        "Deplorable", "Disadvantages", "Disastrous", "Disgusted", "Dreadful", "Eliminate", "Harmful", "Harsh", "Inconsiderate",
        "enraged", "offensive", "aggressive", "frustrated", "controlling", "resentful", "malicious", "infuriated", "critical",
        "violent", "vindictive", "sadistic", "spiteful", "furious", "agitated", "antagonistic", "repulsed", "quarrelsome", 
        "venomous", "rebellious", "exasperated", "impatient", "contrary", "condemning", "seething", "scornful", "sarcastic",
        "poisonous", "jealous", "revengeful", "retaliating", "reprimanding", "powerless", "despicable", "desperate", "alienated", 
        "pessimistic", "dejected", "vilified", "unjustified", "violated"
    ],
    "sadness": [
        "bitter", "dismal", "heartbroken", "melancholy", "mournful", "pessimistic", "somber", "sorrowful", "sorry", "wistful",
        "bereaved", "blue", "cheerless", "dejected", "despairing", "despondent", "disconsolate", "distressed", "doleful", 
        "down", "downcast", "forlorn", "glum", "grieved", "heartsick", "heavyhearted", "hurting", "languishing", 
        "low", "lugubrious", "morbid", "morose", "pensive", "troubled", "weeping", "woebegone",
    ],
    "fear": [
        "angst", "anxiety", "concern", "despair", "dismay", "doubt", "dread", "horror", "jitters", "panic", "scare", 
        "suspicion", "terror", "unease", "uneasiness", "worry", "abhorrence", "agitation", "aversion", "awe", "consternation",
        "cowardice", "creeps", "discomposure", "disquietude", "distress", "faintheartedness", "foreboding", "fright", "funk",
        "misgiving", "nightmare", "phobia", "presentiment", "qualm", "reverence", "revulsion", "timidity", "trembling",
        "tremor", "trepidation", "chickenheartedness", "recreancy"
    ]
}

In [175]:
additional_emotion_words_dict2 = {
    "anger": ["rage", "ire", "indignation", "resentment", "wrath", "annoyance", "outrage", "exasperate",
              "choler", "hatred", "aggression", "fury", "emotions", "provoke", "hostility", "frustration",
              "displeasure", "exasperation", "dissatisfaction", "anxiety", "disgust",
              "animosity", "adrenaline", "enrage", "madden", "infuriate", "umbrage", "exacerbate", "angry",
              "gall", "chafe", "miff", "violence", "ira", "pique", "furious", "aggravate", "angriness",
              "vexation", "spite", "irk", "offend", "madness", "stress", "infuriation", "embarrassment",
              "dismay", "discontent", "bitterness", "unease", "despair", "distrust",
              "skepticism", "criticism", "backlash", "outcry", "grief", "tensions", "revulsion",
              "disappointment", "anguish", "consternation", "sorrow",
              "cynicism", "unhappiness", "disdain", "uproar", "irritation", "jealousy", "impatience",
              "angst", "uneasiness", "disquiet"],
    "sadness": ["sorry", "melancholy", "tragic", "lamentable", "pitiful", "mournful", "deplorable",
                "bad", "bittersweet", "sorrowful", "miserable", "doleful", "melancholic",
                "pensive", "distressing", "wistful", "unhappy", "pathetic", "sadly", "sadness",
                "regret", "tragical","pity", "heavyhearted", "tragicomic", "tragicomical",
                "cry", "awful", "terrible", "depressive", "sorrow", "horrible", "sadden", "weird",
                "scary", "unfortunate", "shocking", "regrettable", "regretful", "heartbreaking",
                "frightening", "ashamed", "hopeless", "ironic", "despondent",
                "sombre", "somber", "gloomy", "saddening", "depressing",
                "despair", "brokenhearted", "crying", "woebegone", "anger", "surprise",
                "mourn", "disgust", "suffering", "mourner", "dejection", "bewail",
                "contrite", "mania", "deplore", "terribly", "lament", "alas", "grieve",
                "hardly", "moment"],
    "fear": ["panic", "anxiety", "dread", "phobia", "risk", "fright", "fearfulness", "concern",
             "acrophobia", "awe", "horror", "afraid", "intimidation", "apprehension", "worry",
             "danger", "angst", "reverence", "claustrophobia", "amygdala", "veneration",
             "scare", "affright", "unafraid", "timidity", "terror", "consternation", "dismay", 
             "fearless", "hysteria", "alarm", "threat", "fearful", "cold sweat", "frisson", 
             "arachnophobia", "venerate", "care", "revere", "failure"],
    "joy": ["gladden", "happiness", "delight", "pleasure", "rejoice", "excitement", "exultation",
            "elation", "exuberance", "cheer", "exhilaration", "joyousness", "joyfulness", "pride",
            "gratitude", "overjoy", "exult", "joyful", "happy", "ecstatic", "cheer up", "jubilation",
            "gladness", "jubilance", "smile", "contentment", "passion", "sorrow", "grief",
            "tears", "love", "blessedness", "bliss", "anguish", "laughter", "satisfaction", "admiration",
            "awe", "gratification", "despair", "spirit", "longing", "luck", "agony", "euphoria", 
            "enthusiasm", "warmth", "heartache", "thank", "goodness", "frustration", "amazement", 
            "glee", "enjoyment", "mirth", "contentedness", "joyance", "rhapsody", "experience", 
            "lightness", "blissful", "joyous", "cheerfulness", "glad", "exultant", "jubilancy", "happily", 
            "winne", "fain", "felicity", "elate", "complacence", "affection", "kindness", "felicitous", 
            "grace", "pity", "gaiety", "hedonism", "feeling", "cry", "wonderful"]
}

In [176]:
add_emotion_words_dict = {key: set(additional_emotion_words_dict[key]) | set(additional_emotion_words_dict2[key]) for key in additional_emotion_words_dict.keys()}
ambg_emotion_words_dict = defaultdict(set)

for i, j in combinations(add_emotion_words_dict.keys(), 2):
    cur_intersction = add_emotion_words_dict[i].intersection(add_emotion_words_dict[j])
    if cur_intersction:
        print(i, j)
        print(cur_intersction)
        ambg_emotion_words_dict[i] |= cur_intersction
        ambg_emotion_words_dict[j] |= cur_intersction

add_emotion_words_dict["nan"] = {""}
ambg_emotion_words_dict["nan"] = {""}
for key, val in ambg_emotion_words_dict.items():
    print(key, val, len(val))

joy anger
{'sorrow', 'despair', 'anguish', 'frustration', 'grief'}
joy sadness
{'sorrow', 'despair', 'cry', 'pity'}
joy fear
{'despair', 'awe'}
anger sadness
{'sorrow', 'despair', 'pessimistic', 'disgust', 'dejected'}
anger fear
{'unease', 'despair', 'angst', 'anxiety', 'dismay', 'consternation', 'uneasiness', 'revulsion'}
sadness fear
{'despair'}
joy {'sorrow', 'despair', 'anguish', 'awe', 'frustration', 'grief', 'cry', 'pity'} 8
anger {'sorrow', 'despair', 'unease', 'anguish', 'angst', 'anxiety', 'pessimistic', 'frustration', 'disgust', 'dismay', 'grief', 'consternation', 'uneasiness', 'revulsion', 'dejected'} 15
sadness {'sorrow', 'despair', 'pessimistic', 'disgust', 'cry', 'pity', 'dejected'} 7
fear {'despair', 'unease', 'awe', 'angst', 'anxiety', 'dismay', 'consternation', 'uneasiness', 'revulsion'} 9
nan {''} 1


# Randomly replace emotion words

In [177]:
# random_replace_df = shuffled_df.sample(frac=0.3333)
# ambiguous_replace_df = shuffled_df.sample(frac=0.3333)

# def replace_emotion_word(df, words_dict):
#     df["new_Emotion_word"] = df["Emotion"].apply(lambda emotion: str(choice(words_dict[str(emotion)])).lower())
#     df["Sentence"] = df.apply(lambda row: str(row["Sentence"]).replace(str(row["Emotion_word"]), str(row["new_Emotion_word"])), axis=1)

# # Replace with new random emotion word
# replace_emotion_word(random_replace_df, add_emotion_words_dict)

# # Replace with new random ambiguous emotion word
# replace_emotion_word(ambiguous_replace_df, ambg_emotion_words_dict)

P_ADD, P_AMBG = 0.1, 0.2

shuffled_df = df.sample(frac=1).copy()
shuffled_df["new_Emotion_word"] = shuffled_df["Emotion_word"]
for i, row in enumerate(shuffled_df.itertuples()):
    label = str(row.Emotion)
    if label == "nan":
        continue
    p = random.random()
    if p <= P_ADD:
        new_emotion_word = str(random.sample(add_emotion_words_dict[label], 1)[0]).lower()
    elif P_ADD < p <= P_ADD + P_AMBG:
        new_emotion_word = str(random.sample(ambg_emotion_words_dict[label], 1)[0]).lower()
    else:
        new_template = str(row.Template).replace("<emotion>", str(row.Emotion_word))
        shuffled_df.at[row.Index, "Template"] = new_template
        continue
    new_sentence = str(row.Sentence).replace(str(row.Emotion_word), new_emotion_word)
    new_template = str(row.Template).replace("<emotion>", new_emotion_word)
    shuffled_df.at[row.Index, "new_Emotion_word"] = new_emotion_word
    shuffled_df.at[row.Index, "Sentence"] = new_sentence
    shuffled_df.at[row.Index, "Template"] = new_template

shuffled_df = shuffled_df.drop("Emotion_word", axis=1).rename(columns={"new_Emotion_word": "Emotion_word"})
print(shuffled_df.head())
shuffled_df.to_csv(output_file, index=False)

          ID                                           Sentence  \
7201    7201  this is random noise, My sister feels irate at...   
60398  60398  As expected, the conversation with Heather was...   
47868  47868  To our amazement, the conversation with Heathe...   
756      756  It is far from over, but so far i made my son ...   
45671  45671  Malik found himself in a hilarious situation, ...   

                                                Template     Person  Gender  \
7201   this is random noise, <person> feels irate at ...  my sister  female   
60398  As expected, the conversation with <person> wa...    Heather  female   
47868  To our amazement, the conversation with <perso...    Heather  female   
756    It is far from over, but so far i made <person...     my son    male   
45671  <person> found <gender_noun> in <ind> hilariou...      Malik    male   

                   Race Emotion Emotion_word  
7201                NaN   anger        irate  
60398          European     

# Validation

In [178]:
print(len(shuffled_df))
print(shuffled_df.columns)
for col in ("Emotion", "Gender", "Race", "Person", "Emotion_word", "Template"):
    print(shuffled_df[col].value_counts(dropna=False),"\n")

69600
Index(['ID', 'Sentence', 'Template', 'Person', 'Gender', 'Race', 'Emotion',
       'Emotion_word'],
      dtype='object')
sadness    16240
fear       16240
joy        16240
anger      16240
NaN         4640
Name: Emotion, dtype: int64 

male      34800
female    34800
Name: Gender, dtype: int64 

European            24000
African-American    24000
NaN                 21600
Name: Race, dtype: int64 

Adam             1200
Lakisha          1200
my boyfriend     1200
Harry            1200
Latoya           1200
Leroy            1200
Heather          1200
this boy         1200
Torrance         1200
Nancy            1200
my aunt          1200
my girlfriend    1200
Courtney         1200
Amanda           1200
Jack             1200
Jamel            1200
my husband       1200
my dad           1200
Tia              1200
Alphonse         1200
Justin           1200
my brother       1200
this girl        1200
Lamar            1200
Andrew           1200
Jerome           1200
Shereen          12

# Generate Datasets

In [179]:
%run -i /home/nadavo/dev/CausaLM/datasets/POMS-GendeRace/create_poms_datasets.py --corpus_type enriched_noisy

21:18:02 - Started main
21:18:02 - Started create_all_datasets
21:18:02 - Started create_poms_dataset


Creating Gender enriched_noisy datasets


21:18:40 - create_poms_dataset took 38.21 seconds to complete


            Person_F     Person_CF  \
ID_F  ID_CF                          
1     17825   Alonzo       Tanisha   
3     21347   Alonzo      Nichelle   
5     32741   Alonzo       Lakisha   
7     21543   Alonzo       Tanisha   
11    4939    Alonzo        Latoya   
...              ...           ...   
69595 66931  my aunt        my dad   
69596 68992   my mom    my brother   
69597 65745   my mom    my brother   
69598 66898   my mom      this man   
69599 68775   my mom  my boyfriend   

                                                    Sentence_F  \
ID_F  ID_CF                                                      
1     17825  Alonzo feels enraged as he paces along to the ...   
3     21347   Alonzo feels pity as he paces along to the shop.   
5     32741  Alonzo feels uneasiness as he paces along to t...   
7     21543  Alonzo feels glad as he paces along to the sho...   
11    4939   The situation makes Alonzo feel depressed, but...   
...                                        

21:18:41 - Started create_biased_datasets


Biasing Gender enriched_noisy dataset
                  Person_F   Person_CF  \
ID_F  ID_CF                              
1     17825         Alonzo     Tanisha   
3     21347         Alonzo    Nichelle   
5     32741         Alonzo     Lakisha   
11    4939          Alonzo      Latoya   
12    34412         Alonzo       Ebony   
...                    ...         ...   
69560 66632          Ellen        Josh   
69561 67101          Ellen       Roger   
69579 68299    my daughter  my brother   
69584 68084  my girlfriend    my uncle   
69586 66202  my girlfriend    this man   

                                                    Sentence_F  \
ID_F  ID_CF                                                      
1     17825  Alonzo feels enraged as he paces along to the ...   
3     21347   Alonzo feels pity as he paces along to the shop.   
5     32741  Alonzo feels uneasiness as he paces along to t...   
11    4939   The situation makes Alonzo feel depressed, but...   
12    34412  Even t

21:18:41 - create_biased_datasets took 0.55 seconds to complete
21:18:41 - Started create_biased_datasets


            Person_F     Person_CF  \
ID_F  ID_CF                          
1     17825   Alonzo       Tanisha   
3     21347   Alonzo      Nichelle   
5     32741   Alonzo       Lakisha   
11    4939    Alonzo        Latoya   
12    34412   Alonzo         Ebony   
...              ...           ...   
69595 66931  my aunt        my dad   
69596 68992   my mom    my brother   
69597 65745   my mom    my brother   
69598 66898   my mom      this man   
69599 68775   my mom  my boyfriend   

                                                    Sentence_F  \
ID_F  ID_CF                                                      
1     17825  Alonzo feels enraged as he paces along to the ...   
3     21347   Alonzo feels pity as he paces along to the shop.   
5     32741  Alonzo feels uneasiness as he paces along to t...   
11    4939   The situation makes Alonzo feel depressed, but...   
12    34412  Even though it is still a work in progress, th...   
...                                        

21:18:42 - create_biased_datasets took 0.89 seconds to complete
21:18:42 - Started create_poms_dataset


Creating Race enriched_noisy datasets


21:19:12 - create_poms_dataset took 29.87 seconds to complete


              Person_F Person_CF  \
ID_F  ID_CF                        
3     15235     Alonzo      Josh   
4     26308     Alonzo      Adam   
11    26475     Alonzo     Frank   
14    9742      Alonzo      Alan   
17    15249     Alonzo      Josh   
...                ...       ...   
69559 65795  Stephanie   Jasmine   
69560 68824      Ellen    Latoya   
69561 66957      Ellen   Tanisha   
69562 65330      Ellen   Jasmine   
69563 67655      Ellen   Tanisha   

                                                    Sentence_F  \
ID_F  ID_CF                                                      
3     15235   Alonzo feels pity as he paces along to the shop.   
4     26308  As he approaches the university, Alonzo feels ...   
11    26475  The situation makes Alonzo feel depressed, but...   
14    9742   Even though it is still a work in progress, th...   
17    15249  I made Alonzo feel mad, and plan to continue u...   
...                                                        ...   
695

21:19:13 - Started create_biased_datasets


Biasing Race enriched_noisy dataset
              Person_F Person_CF  \
ID_F  ID_CF                        
14    9742      Alonzo      Alan   
22    24758     Alonzo      Jack   
30    22814     Alonzo    Justin   
47    20879      Jamel      Alan   
62    20894      Jamel      Alan   
...                ...       ...   
69559 65795  Stephanie   Jasmine   
69560 68824      Ellen    Latoya   
69561 66957      Ellen   Tanisha   
69562 65330      Ellen   Jasmine   
69563 67655      Ellen   Tanisha   

                                                    Sentence_F  \
ID_F  ID_CF                                                      
14    9742   Even though it is still a work in progress, th...   
22    24758  I have no idea how or why, but i made Alonzo f...   
30    22814  It was totally unexpected, but Alonzo made me ...   
47    20879  The situation makes Jamel feel relieved, and w...   
62    20894  It was totally unexpected, but Jamel made me f...   
...                              

21:19:13 - create_biased_datasets took 0.47 seconds to complete
21:19:13 - Started create_biased_datasets


              Person_F Person_CF  \
ID_F  ID_CF                        
3     15235     Alonzo      Josh   
4     26308     Alonzo      Adam   
11    26475     Alonzo     Frank   
14    9742      Alonzo      Alan   
17    15249     Alonzo      Josh   
...                ...       ...   
69559 65795  Stephanie   Jasmine   
69560 68824      Ellen    Latoya   
69561 66957      Ellen   Tanisha   
69562 65330      Ellen   Jasmine   
69563 67655      Ellen   Tanisha   

                                                    Sentence_F  \
ID_F  ID_CF                                                      
3     15235   Alonzo feels pity as he paces along to the shop.   
4     26308  As he approaches the university, Alonzo feels ...   
11    26475  The situation makes Alonzo feel depressed, but...   
14    9742   Even though it is still a work in progress, th...   
17    15249  I made Alonzo feel mad, and plan to continue u...   
...                                                        ...   
695

21:19:14 - create_biased_datasets took 0.59 seconds to complete
21:19:14 - create_all_datasets took 1 minutes and 11.96 seconds to complete
21:19:14 - main took 1 minutes and 11.97 seconds to complete


# Generate Pretraining Data

In [1]:
%run -i /home/nadavo/dev/CausaLM/BERT/GendeRace/pregenerate_training_data.py --treatment gender --corpus_type enriched_noisy

21:35:46 - Started main
69600it [00:00, 865145.82it/s]
Document: 100%|██████████| 69600/69600 [00:04<00:00, 14091.68it/s]
Document: 100%|██████████| 69600/69600 [00:04<00:00, 14402.59it/s]
Document: 100%|██████████| 69600/69600 [00:04<00:00, 14546.55it/s]
Document: 100%|██████████| 69600/69600 [00:04<00:00, 14227.49it/s]
Document: 100%|██████████| 69600/69600 [00:04<00:00, 14840.94it/s]
21:36:06 - main took 20.34 seconds to complete


In [2]:
%run -i /home/nadavo/dev/CausaLM/BERT/GendeRace/pregenerate_training_data.py --treatment race --corpus_type enriched_noisy

21:36:06 - Started main
48000it [00:00, 986493.69it/s]
Document: 100%|██████████| 48000/48000 [00:03<00:00, 13706.60it/s]
Document: 100%|██████████| 48000/48000 [00:03<00:00, 12769.12it/s]
Document: 100%|██████████| 48000/48000 [00:03<00:00, 12719.22it/s]
Document: 100%|██████████| 48000/48000 [00:03<00:00, 12812.52it/s]
Document: 100%|██████████| 48000/48000 [00:03<00:00, 13360.60it/s]
21:36:22 - main took 16.06 seconds to complete
