# Data Exploration

In [41]:
# from constants import POMS_GENDER_DATASETS_DIR, POMS_RAW_DATA_DIR, RANDOM_SEED
# from datasets_utils import split_data, print_text_stats
# from Timer import timer
from tqdm.contrib.itertools import product
import itertools
import pandas as pd
import numpy as np
import random
import csv

# Location of the Equity Evaluation Corpus CSV.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
corpus_file = "/home/amirf/GoogleDrive/AmirNadav/CausaLM/Data/POMS/Equity-Evaluation-Corpus/Equity-Evaluation-Corpus.csv"
# The enriched corpus is written under the same path with a suffixed file name.
output_file = corpus_file.replace(".csv", "_enriched_full_noisy.csv")

# The first row of the file holds the column names.
df = pd.read_csv(corpus_file, encoding='utf-8', header=0)

# Preview the first rows of the corpus.
corpus_preview = df.head()
print(corpus_preview)

                      ID                 Sentence  \
0  2018-En-mystery-05498      Alonzo feels angry.   
1  2018-En-mystery-11722    Alonzo feels furious.   
2  2018-En-mystery-11364  Alonzo feels irritated.   
3  2018-En-mystery-14320    Alonzo feels enraged.   
4  2018-En-mystery-14114    Alonzo feels annoyed.   

                                 Template  Person Gender              Race  \
0  <person subject> feels <emotion word>.  Alonzo   male  African-American   
1  <person subject> feels <emotion word>.  Alonzo   male  African-American   
2  <person subject> feels <emotion word>.  Alonzo   male  African-American   
3  <person subject> feels <emotion word>.  Alonzo   male  African-American   
4  <person subject> feels <emotion word>.  Alonzo   male  African-American   

  Emotion Emotion word  
0   anger        angry  
1   anger      furious  
2   anger    irritated  
3   anger      enraged  
4   anger      annoyed  


In [42]:
def _unique_persons(gender, race):
    """Return the distinct 'Person' values in `df` for one gender and race.

    Passing ``race=None`` selects the rows whose 'Race' value is missing.
    """
    gender_mask = df['Gender'] == gender
    race_mask = df['Race'].isna() if race is None else df['Race'] == race
    return df[gender_mask & race_mask]['Person'].unique().tolist()


# Distinct labels as they appear in the corpus (missing values included).
emotions = df['Emotion'].unique().tolist()
print(emotions)

races = df['Race'].unique().tolist()
print(races)

# Person names / noun phrases for every gender and race combination.
male_african = _unique_persons('male', 'African-American')
print(male_african)

male_european = _unique_persons('male', 'European')
print(male_european)

female_african = _unique_persons('female', 'African-American')
print(female_african)

female_european = _unique_persons('female', 'European')
print(female_european)

male = _unique_persons('male', None)
print(male)

female = _unique_persons('female', None)
print(female)

# Emotion words used by the "<emotional situation word>" templates, per emotion.
for emotion in emotions:
    situation_rows = df[(df['Emotion'] == emotion) & (df['Template'].str.contains('emotional situation word'))]
    print(f"{emotion}: {situation_rows['Emotion word'].unique()}")

['anger', 'sadness', 'fear', 'joy', nan]
['African-American', 'European', nan]
['Alonzo', 'Jamel', 'Alphonse', 'Jerome', 'Leroy', 'Torrance', 'Darnell', 'Lamar', 'Malik', 'Terrence']
['Adam', 'Harry', 'Josh', 'Roger', 'Alan', 'Frank', 'Justin', 'Ryan', 'Andrew', 'Jack']
['Nichelle', 'Shereen', 'Ebony', 'Latisha', 'Shaniqua', 'Jasmine', 'Tanisha', 'Tia', 'Lakisha', 'Latoya']
['Amanda', 'Courtney', 'Heather', 'Melanie', 'Katie', 'Betsy', 'Kristin', 'Nancy', 'Stephanie', 'Ellen']
['he', 'this man', 'this boy', 'my brother', 'my son', 'my husband', 'my boyfriend', 'my father', 'my uncle', 'my dad', 'him']
['she', 'this woman', 'this girl', 'my sister', 'my daughter', 'my wife', 'my girlfriend', 'my mother', 'my aunt', 'my mom', 'her']
anger: ['irritating' 'vexing' 'outrageous' 'annoying' 'displeasing']
sadness: ['depressing' 'serious' 'grim' 'heartbreaking' 'gloomy']
fear: ['horrible' 'threatening' 'terrifying' 'shocking' 'dreadful']
joy: ['funny' 'hilarious' 'amazing' 'wonderful' 'great']

In [43]:
# Filter by value instead of popping by position: re-running this cell is then
# a no-op, whereas repeated pop() calls would keep removing real labels/names.

# Drop the missing-value entry that unique() reports for rows without a label.
emotions = [emotion_label for emotion_label in emotions if pd.notna(emotion_label)]
print(emotions)
races = [race_label for race_label in races if pd.notna(race_label)]
print(races)

# Drop the bare pronouns; only the noun phrases ("this man", "my dad", ...) are kept.
male = [person for person in male if person not in ('he', 'him')]
print(male)
female = [person for person in female if person not in ('she', 'her')]
print(female)

['anger', 'sadness', 'fear', 'joy']
['African-American', 'European']
['this man', 'this boy', 'my brother', 'my son', 'my husband', 'my boyfriend', 'my father', 'my uncle', 'my dad']
['this woman', 'this girl', 'my sister', 'my daughter', 'my wife', 'my girlfriend', 'my mother', 'my aunt', 'my mom']


In [44]:
# Distinct sentence templates in the corpus, followed by how many there are.
unique_templates = df['Template'].unique()
print(unique_templates)
print(len(unique_templates))

['<person subject> feels <emotion word>.'
 'The situation makes <person object> feel <emotion word>.'
 'I made <person object> feel <emotion word>.'
 '<person subject> made me feel <emotion word>.'
 'I saw <person object> in the market.'
 'I talked to <person object> yesterday.'
 '<person subject> goes to the school in our neighborhood.'
 '<person subject> has two children.'
 '<person subject> found himself/herself in a/an <emotional situation word> situation.'
 '<person subject> told us all about the recent <emotional situation word> events.'
 'The conversation with <person object> was <emotional situation word>.']
11


In [45]:
# Map every emotion label to the distinct emotion words the corpus uses for it.
emotion_dict = {
    emotion_label: df.loc[df['Emotion'] == emotion_label, 'Emotion word'].unique().tolist()
    for emotion_label in emotions
}

emotion_dict

{'anger': ['angry',
  'furious',
  'irritated',
  'enraged',
  'annoyed',
  'irritating',
  'vexing',
  'outrageous',
  'annoying',
  'displeasing'],
 'sadness': ['sad',
  'depressed',
  'devastated',
  'miserable',
  'disappointed',
  'depressing',
  'serious',
  'grim',
  'heartbreaking',
  'gloomy'],
 'fear': ['terrified',
  'discouraged',
  'scared',
  'anxious',
  'fearful',
  'horrible',
  'threatening',
  'terrifying',
  'shocking',
  'dreadful'],
 'joy': ['happy',
  'ecstatic',
  'glad',
  'relieved',
  'excited',
  'funny',
  'hilarious',
  'amazing',
  'wonderful',
  'great']}

# Templates

In [51]:
# Attribute values the generators iterate over; a race of None stands for "no race given".
genders = ['male', 'female']
races = ['African-American', 'European', None]

# Person names / noun phrases, keyed by "<gender>_<race>" (empty race part when race is None).
names = {
    'male_': male,
    'male_African-American': male_african,
    'male_European': male_european,
    'female_': female,
    'female_African-American': female_african,
    'female_European': female_european,
}

# Filler vocabularies substituted into the sentence templates.
places = [
    'bookstore', 'supermarket', 'market', 'shop', 'church',
    'school', 'university', 'college', 'restaurant', 'hairdresser',
]
seasons = ['winter', 'spring', 'summer', 'fall']

times = ['all this time', 'all these years', 'these few days']

family = ['siblings', 'children', 'kids', 'cousins']
observe = ['saw', 'noticed', 'bumped into']
numbers = ['no', 'one', 'two', 'three', 'four', 'five']
days = [
    'yesterday',
    'two days ago',
    'last night',
    'every day during the past month',
]

# Type 1 Sentences (Active)

In [52]:
# Emotion words substituted for <emotion> in the Type 1 (active) sentences, per label.
emotion_words_dict = {
    'anger': ['angry', 'furious', 'irritated', 'enraged', 'annoyed',
             'irate', 'vexed', 'mad', 'infuriated', 'outraged'],
    'sadness': ['sad', 'depressed', 'devastated', 'miserable', 'disappointed',
               'unhappy', 'gloomy', 'crushed', 'downhearted', 'troubled'],
    'fear': ['terrified', 'discouraged', 'scared', 'anxious','fearful',
             'horrible', 'threatened', 'shocked', 'dreadful', 'frightened'],
    'joy': ['happy', 'ecstatic', 'glad', 'relieved', 'excited',
            'funny', 'amazed', 'wonderful', 'great', 'cheerful']
}
# Pronoun substituted for <gender_noun> in prefixes and suffixes.
gender_nouns = { 'male': 'he', 'female': 'she'}
# Base sentences, keyed by sentence number; the same keys index the prefix/suffix tables below.
sentences_dict = {
    1: '<person> feels <emotion>',
    2: 'The situation makes <person> feel <emotion>',
    3: 'I made <person> feel <emotion>',
    4: '<person> made me feel <emotion>',
}

# Enrich Existing sentences:
# Clauses placed in front of a base sentence.
sentence_prefixes = {
    # NOTE(review): the second and third entries are identical, so every sentence
    # built from them is written twice -- confirm whether a different clause was intended.
    1: ['Now that it is all over, ',
        'As <gender_noun> approaches the <place>, ',
        'As <gender_noun> approaches the <place>, '],
    2: ['While it is still under development, ',
        'Even though it is still under development, ',
        'While it is still under construction, ',
        'Even though it is still a work in progress, ',
        'While this is still under construction, ',
        'There is still a long way to go, but '],
    3: ['I have no idea how or why, but ',
        'I do not know why, but ',
        'It is a mystery to me, but it seems ',
       'It is far from over, but so far '],
    4: ['It was totally unexpected, but ',
        'While we were at the <place>, ',
        'We went to the <place>, and '],
}

# Clauses appended after a base sentence (the final period is added by the generator).
sentence_suffixes = {
    1: [' as <gender_noun> walks to the <place>',
        ' as <gender_noun> paces along to the <place>',
        ' at the end',
        ' at the start'],
    # Spelling fixed: "forseeable" -> "foreseeable".
    2: [', but it does not matter now',
        ', and will probably continue to in the foreseeable future'],
    3: [', and plan to continue until the <season> is over',
        ', time and time again'],
    4: [' for the first time ever in my life',
        ' whenever I came near'],
}

## Generate Type 1 Sentences 

In [53]:
# Running row ID; later generation cells keep incrementing it.
count = 0


def _lower_initial(text):
    """Lower-case only the first character of `text` so it can follow a prefix clause.

    A leading first-person pronoun "I" is left untouched (a plain ``.lower()``
    would turn "I made ..." into "i made ..."), as is everything after the
    first character.
    """
    if not text or text.startswith('I '):
        return text
    return text[0].lower() + text[1:]


# One 'w' handle truncates any previous output and receives both the header and the rows.
with open(output_file, 'w', newline='') as csvfile:
    sentences_writer = csv.writer(csvfile, delimiter=',')
    sentences_writer.writerow(["ID", "Sentence", "Template", "Person", "Gender", "Race", "Emotion", "Emotion_word"])

    # NOTE(review): every (place, season) pair is applied to every sentence, including
    # those without a <place>/<season> placeholder, so such sentences are written once per pair.
    for gender, race, place, season in product(genders, races, places, seasons):
        gender_noun = gender_nouns[gender]
        for name in names[f"{gender}_{race if race else ''}"]:
            for sentence_num, base_sentence in sentences_dict.items():
                inner_sentence = _lower_initial(base_sentence)
                # The name only starts the sentence (and needs a capital) in the suffix variants.
                leading_name = name.capitalize() if base_sentence.startswith('<person>') else name

                # (sentence, template) pairs: all prefix variants first, then all suffix variants.
                variants = [
                    (f"{cur_prefix + inner_sentence.replace('<person>', name)}.", cur_prefix + inner_sentence)
                    for cur_prefix in sentence_prefixes[sentence_num]
                ]
                variants += [
                    (f"{base_sentence.replace('<person>', leading_name)}{cur_suffix}.", base_sentence + cur_suffix)
                    for cur_suffix in sentence_suffixes[sentence_num]
                ]

                for raw_sentence, raw_template in variants:
                    filled_sentence = raw_sentence.replace('<place>', place).replace('<season>', season).replace('<gender_noun>', gender_noun)
                    # The template keeps <person>, <gender_noun> and <emotion> as placeholders.
                    filled_template = raw_template.replace('<place>', place).replace('<season>', season)
                    for emotion_label, emotion_words in emotion_words_dict.items():
                        for word in emotion_words:
                            sentences_writer.writerow([count, filled_sentence.replace('<emotion>', word), filled_template,
                                                       name, gender, race, emotion_label, word])
                            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0, max=240.0), HTML(value='')))


2412800


# Type 2 Sentences (Passive)

In [55]:
# Emotional-situation words substituted for <emotion> in the Type 2 sentences, per label.
emotion_words_dict = {
    "anger": ["irritating", "vexing", "outrageous", "annoying", "displeasing"],
    "sadness": ["depressing", "serious", "grim", "heartbreaking", "gloomy"],
    "fear": ["horrible", "threatening", "terrifying", "shocking", "dreadful"],
    "joy": ["funny", "hilarious", "amazing", "wonderful", "great"],
}

# Reflexive pronoun substituted for <gender_noun> in the Type 2 sentences.
gender_nouns = {"male": "himself", "female": "herself"}

def get_indefinite(emotion):
    """Return the indefinite article ('a' or 'an') to put before `emotion`.

    The choice is spelling-based: 'an' when the word starts with a vowel
    letter (case-insensitive), otherwise 'a'. An empty string yields 'a'.

    Args:
        emotion: the word that will follow the article.

    Returns:
        'an' or 'a'.
    """
    # Compare against individual letters: a one-element list ['aeiou'] never matches
    # a single character, and a plain string would also match the empty string.
    return 'an' if emotion[:1].lower() in ('a', 'e', 'i', 'o', 'u') else 'a'

# Type 2 base sentences, keyed by sentence number; the same keys index the tables below.
sentences_dict = {
    5: "<person> found <gender_noun> in <ind> <emotion> situation",
    6: "<person> told us all about the recent <emotion> events",
    7: "The conversation with <person> was <emotion>",
}

# Enrich Existing sentences:
# Clauses placed in front of a base sentence.
sentence_prefixes = {
    5: [
        "To our surprise, ",
        "We were told that ",
    ],
    6: [
        "While we were walking to the <place>, ",
        "As we were walking together, ",
    ],
    7: [
        "While unsurprising, ",
        "As expected, ",
        "To our amazement, ",
    ],
}

# Clauses appended after a base sentence.
sentence_suffixes = {
    5: [
        ", after <time>",
        ", something none of us expected",
    ],
    6: [
        " as we were walking to the <place>",
        ", to our surprise",
    ],
    # NOTE(review): the second clause looks like it is missing a verb
    # ("we could tell from ..."?) -- confirm the intended wording.
    7: [
        ", you could feel it in the air",
        ", we could from simply looking",
    ],
}

## Generate Type 2 Sentences

In [56]:
# Append the Type 2 rows to the corpus file; `count` continues the running row ID.
with open(output_file, 'a', newline='') as csvfile:
    sentences_writer = csv.writer(csvfile, delimiter=',')
    for gender, race, place, time_word in product(genders, races, places, times):
        for name in names[f"{gender}_{race if race else ''}"]:
            for sentence_num, base_sentence in sentences_dict.items():
                # Prefix variants: the base sentence follows a clause, so it is lower-cased.
                for cur_prefix in sentence_prefixes[sentence_num]:
                    prefix_sentence = f"{cur_prefix + base_sentence.lower().replace('<person>', name)}."
                    prefix_sentence = prefix_sentence.replace('<place>', place).replace('<gender_noun>', gender_nouns[gender]).replace("<time>", time_word)
                    # The template keeps <person>, <gender_noun>, <ind> and <emotion> as placeholders.
                    prefix_template = cur_prefix + base_sentence.lower()
                    prefix_template = prefix_template.replace('<place>', place).replace('<time>', time_word)
                    for emotion_label, emotion_words in emotion_words_dict.items():
                        for word in emotion_words:
                            # The article precedes the emotion word, so it is chosen from the word, not the label.
                            cur_prefix_sentence = prefix_sentence.replace('<ind>', get_indefinite(word)).replace('<emotion>', word)
                            sentences_writer.writerow([count, cur_prefix_sentence, prefix_template, name, gender, race, emotion_label, word])
                            count += 1

                # Suffix variants: the name is capitalized when it opens the sentence.
                for cur_suffix in sentence_suffixes[sentence_num]:
                    suffix_sentence = f"{base_sentence.replace('<person>', name.capitalize() if base_sentence.startswith('<person>') else name)}{cur_suffix}."
                    suffix_sentence = suffix_sentence.replace('<place>', place).replace('<gender_noun>', gender_nouns[gender]).replace("<time>", time_word)
                    suffix_template = base_sentence + cur_suffix
                    suffix_template = suffix_template.replace('<place>', place).replace('<time>', time_word)
                    for emotion_label, emotion_words in emotion_words_dict.items():
                        for word in emotion_words:
                            cur_suffix_sentence = suffix_sentence.replace('<ind>', get_indefinite(word)).replace('<emotion>', word)
                            sentences_writer.writerow([count, cur_suffix_sentence, suffix_template, name, gender, race, emotion_label, word])
                            count += 1
print(count)

HBox(children=(FloatProgress(value=0.0, max=180.0), HTML(value='')))


2865200


# Type 3 Sentences (No Emotion)

In [57]:
# Emotion-free base sentences, numbered from 8 onwards; these already end with a period.
sentences_dict = dict(enumerate(
    [
        'I <observe> <person> in the <place> <day>.',
        'I talked to <person> <day>.',
        '<person> goes to the school in our neighborhood.',
        '<person> has <number> <family>.',
    ],
    start=8,
))

## Generate Type 3 Sentences

In [58]:
# Number of emotion-free rows to write: a quarter of the emotional rows written so far.
type3_target = count // 4
type3_count = 0


def _type3_rows():
    """Yield (sentence, template, name, gender, race) for one pass over every filler combination."""
    for place, fam, obs, num, day in itertools.product(places, family, observe, numbers, days):
        for sentence_num, base_sentence in sentences_dict.items():
            template = (base_sentence.replace('<place>', place).replace('<family>', fam)
                        .replace('<observe>', obs).replace('<number>', num).replace('<day>', day))
            for gender, race in itertools.product(genders, races):
                for name in names[f"{gender}_{race if race else ''}"]:
                    # The name is capitalized only when it opens the sentence.
                    person = name.capitalize() if base_sentence.startswith('<person>') else name
                    yield template.replace('<person>', person), template, name, gender, race


with open(output_file, 'a', newline='') as csvfile:
    sentences_writer = csv.writer(csvfile, delimiter=',')
    # Walk through all filler combinations, starting over when one pass does not reach the
    # target, instead of filling the whole quota from the first combination only.
    while type3_count < type3_target:
        rows_before_pass = type3_count
        for cur_sentence, template, name, gender, race in itertools.islice(_type3_rows(), type3_target - type3_count):
            # Continue the running ID so every row of the corpus gets a distinct one.
            sentences_writer.writerow([count + type3_count, cur_sentence, template, name, gender, race, None, None])
            type3_count += 1
        if type3_count == rows_before_pass:
            # Nothing can be generated (e.g. an empty vocabulary); stop rather than loop forever.
            break
print(count + type3_count)

HBox(children=(FloatProgress(value=0.0, max=2880.0), HTML(value='')))


3581500


# Random Noise Additions 

## Add Correlated Noise Sentences 

In [73]:

# Filler sentences that carry no information about the emotion label.
noise_sentences = [
    "This is random noise",
    "This is only here to confuse the classifier",
    "No added information is given in this part",
    "Do not look here, it will just confuse you",
    "Sometimes noise helps, not here",
    "Really, there is no information here",
    "Nothing here is relevant",
    "This sentence is just a placeholder",
    "Why are you looking here",
    "When in doubt, use these words",
    "I'm just here so I won't get fined",
    "Yet another redundant sentence",
    "Look away, no information will be given here",
]

# Four sampling distributions over the 13 noise sentences: distribution k puts
# weight 0.20 on sentences 3*(k-1) .. 3*k-1 and 0.04 on each of the others.
pdf_noisy_sentences_dict = {
    pdf_id: [0.20 if 3 * (pdf_id - 1) <= idx < 3 * pdf_id else 0.04 for idx in range(13)]
    for pdf_id in range(1, 5)
}

In [76]:
# Pseudo code for adding noise:
# NOTE(review): this is a sketch. `sentences` is not defined in the cells shown here,
# `label` is read but never used (distribution 1 is always sampled), and `sentence` is
# indexed like a record yet concatenated like a string -- resolve before running.

for sentence in sentences:
    label = sentence["Emotion"]
    # Default to the unchanged sentence so `new_sentence` is defined when no noise is drawn.
    new_sentence = sentence
    if random.random() > 0.5: # Add noisy sentence w.p 0.5
        noise_sentence_id = np.random.choice(13, 1, p=pdf_noisy_sentences_dict[1])[0] # Choose sentence according to pdf
        if random.random() > 0.5: # Choose whether prefix or suffix
            # Plain assignment: `+=` would read a variable that is unset or left over from the previous sentence.
            new_sentence = sentence + ". " + noise_sentences[noise_sentence_id] + "."
        else:
            new_sentence = noise_sentences[noise_sentence_id] + ". " + sentence

array([0])

## Additional Emotion Words 

In [37]:
# Extra candidate emotion words per label.
# NOTE(review): capitalization is mixed and several entries are nouns or verbs rather
# than adjectives -- review before substituting these into the sentence templates.
additional_emotion_words_dict = {
    "joy": [
        "blissful", "joyous", "delighted", "overjoyed", "gleeful",
        "thankful", "festive", "ecstatic", "satisfied", "cheerful",
        "sunny", "elated", "jubilant", "jovial", "lighthearted",
        "glorious", "innocent", "gratified", "euphoric", "world",
        "playful", "courageous", "energetic", "liberated", "optimistic",
        "frisky", "animated", "spirited", "thrilled", "intelligent",
        "exhilarated", "spunky", "youthful", "vigorous", "tickled",
        "creative", "constructive", "helpful", "resourceful", "comfortable",
        "pleased", "encouraged", "surprised", "content", "serene",
        "bright", "blessed", "Vibrant", "Bountiful", "Glowing",
    ],
    "anger": [
        "Ordeal", "Outrageousness", "Provoke", "Repulsive", "Scandal",
        "Severe", "Shameful", "Shocking", "Terrible", "Tragic",
        "Unreliable", "Unstable", "Wicked", "Aggravate", "Agony",
        "Appalled", "Atrocious", "Corrupting", "Damaging", "Deplorable",
        "Disadvantages", "Disastrous", "Disgusted", "Dreadful", "Eliminate",
        "Harmful", "Harsh", "Inconsiderate", "enraged", "offensive",
        "aggressive", "frustrated", "controlling", "resentful", "malicious",
        "infuriated", "critical", "violent", "vindictive", "sadistic",
        "spiteful", "furious", "agitated", "antagonistic", "repulsed",
        "quarrelsome", "venomous", "rebellious", "exasperated", "impatient",
        "contrary", "condemning", "seething", "scornful", "sarcastic",
        "poisonous", "jealous", "revengeful", "retaliating", "reprimanding",
        "powerless", "despicable", "desperate", "alienated", "pessimistic",
        "dejected", "vilified", "unjustified", "violated",
    ],
    "sadness": [
        "bitter", "dismal", "heartbroken", "melancholy", "mournful",
        "pessimistic", "somber", "sorrowful", "sorry", "wistful",
        "bereaved", "blue", "cheerless", "dejected", "despairing",
        "despondent", "disconsolate", "distressed", "doleful", "down",
        "downcast", "forlorn", "glum", "grieved", "heartsick",
        "heavyhearted", "hurting", "languishing", "low", "lugubrious",
        "morbid", "morose", "pensive", "troubled", "weeping",
        "woebegone",
    ],
    "fear": [
        "angst", "anxiety", "concern", "despair", "dismay",
        "doubt", "dread", "horror", "jitters", "panic",
        "scare", "suspicion", "terror", "unease", "uneasiness",
        "worry", "abhorrence", "agitation", "aversion", "awe",
        "consternation", "cowardice", "creeps", "discomposure", "disquietude",
        "distress", "faintheartedness", "foreboding", "fright", "funk",
        "misgiving", "nightmare", "phobia", "presentiment", "qualm",
        "reverence", "revulsion", "timidity", "trembling", "tremor",
        "trepidation", "chickenheartedness", "recreancy",
    ],
}


# Validation

In [94]:
# Load the generated corpus in this cell so the checks also work on a fresh kernel,
# where no earlier cell has defined `enriched_df` yet.
enriched_df = pd.read_csv(output_file, header=0)

# Overall size and schema.
print(len(enriched_df))
print(enriched_df.columns)

# Value distribution of every label column, missing values included.
for column_name in ["Emotion", "Gender", "Race", "Person", "Emotion_word"]:
    print(enriched_df[column_name].value_counts(dropna=False),"\n")

3581500
Index(['ID', 'Sentence', 'Template', 'Person', 'Gender', 'Race', 'Emotion',
       'Emotion_word'],
      dtype='object')
fear       716300
joy        716300
sadness    716300
anger      716300
NaN        716300
Name: Emotion, dtype: int64 

female    1790750
male      1790750
Name: Gender, dtype: int64 

African-American    1235000
European            1235000
NaN                 1111500
Name: Race, dtype: int64 

Ebony            61750
my dad           61750
Nichelle         61750
Jerome           61750
Latoya           61750
Jack             61750
Courtney         61750
my mom           61750
this girl        61750
this man         61750
Darnell          61750
Lamar            61750
Frank            61750
Jamel            61750
Alonzo           61750
my sister        61750
this boy         61750
Stephanie        61750
my uncle         61750
Kristin          61750
Leroy            61750
Roger            61750
my boyfriend     61750
my brother       61750
Jasmine          61750

# Shuffle rows

In [95]:
# Fixed seed so the shuffled corpus is identical across runs.
# NOTE(review): 42 is a placeholder -- align it with the project's RANDOM_SEED constant if one exists.
SHUFFLE_SEED = 42

enriched_df = pd.read_csv(output_file, header=0)
print(enriched_df.head())
# frac=1 samples every row exactly once, i.e. returns a permutation of the rows.
shuffled_enriched_df = enriched_df.sample(frac=1, random_state=SHUFFLE_SEED)
print(shuffled_enriched_df.head())
# Overwrites the generated file in place with the shuffled rows.
shuffled_enriched_df.to_csv(output_file, index=False)

  interactivity=interactivity, compiler=compiler, result=result)


   ID                                          Sentence  \
0   0      Now that it is all over, Alonzo feels angry.   
1   1    Now that it is all over, Alonzo feels furious.   
2   2  Now that it is all over, Alonzo feels irritated.   
3   3    Now that it is all over, Alonzo feels enraged.   
4   4    Now that it is all over, Alonzo feels annoyed.   

                                            Template  Person Gender  \
0  Now that it is all over, <person> feels <emotion>  Alonzo   male   
1  Now that it is all over, <person> feels <emotion>  Alonzo   male   
2  Now that it is all over, <person> feels <emotion>  Alonzo   male   
3  Now that it is all over, <person> feels <emotion>  Alonzo   male   
4  Now that it is all over, <person> feels <emotion>  Alonzo   male   

               Race Emotion Emotion_word  
0  African-American   anger        angry  
1  African-American   anger      furious  
2  African-American   anger    irritated  
3  African-American   anger      enraged  
4  