# EMO VOCABULARY AND STOPWORDS
## The Association for Computational Linguistics
## WASSA 2023 Shared Task on Empathy Emotion and Personality Detection in Interactions
More details [here](https://codalab.lisn.upsaclay.fr/competitions/11167#learn_the_details)

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import re
from copy import deepcopy

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)
#os.path.join()

In [2]:
clean     = re.compile('[^a-zA-Z,.?!/\'\-\s]+')              # preliminary sweep
pad_punct = re.compile('([,.!?\-])')                         # pad select puncts
pad_apost_s  = re.compile('([\']s)')                         # pad "'s" on the left
repl_apost   = re.compile('([\'](?!s))')                     # replace single apostrophees w/space
clean2    = re.compile('[^a-zA-Z\'\s]+')                     # final sweep
multi_spaces = re.compile('\s{2,}')                          # replace multiple spaces w/one

def clean_text(s):
    '''
        Pad punctuation, remove non-letters (leave possessive apostrophee),
        leave only single spaces between words
    '''
    s = clean.sub(' ', s.lower())
    s = pad_punct.sub(r' \1 ', s)
    s = pad_apost_s.sub( r' \1', s )
    s = repl_apost.sub( ' ', s )
    s = clean2.sub(' ', s)
    s = multi_spaces.sub(' ', s)
    return s.strip()

## Load data

In [3]:
file1 = 'data/df_train.pkl'
df_train = pd.read_pickle(file1)
print(df_train.shape)
df_train.head()

2023-04-09 20:03:40.437420: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


(792, 24)


Unnamed: 0,article_id,conversation_id,speaker_number,essay_id,idx,speaker_id,essay,essay_clean,split,gender,education,race,age,income,emotion,target_encoded,article,article_clean,essay_clean_docs,essay_clean_spellchecked,article_clean_docs,article_clean_spellchecked,compare1,compare2
0,35,2,1,1,35_2_person1_1,30,It breaks my heart to see people living in tho...,It breaks my heart to see people living in tho...,train,1,6,3,37,40000,"[Hope, Sadness]","[0, 0, 0, 1, 0, 0, 1, 0]","A month after Hurricane Matthew, 800,000 Haiti...","A month after Hurricane Matthew, 800,000 Haiti...","(It, breaks, my, heart, to, see, people, livin...",It breaks my heart to see people living in tho...,"(A, month, after, Hurricane, Matthew, ,, 800,0...","A month after Hurricane Matthew, 800,000 Haiti...",True,False
1,35,3,1,2,35_3_person1_2,19,I wonder why there aren't more people trying t...,I wonder why there aren't more people trying t...,train,1,6,2,32,35000,[Anger],"[1, 0, 0, 0, 0, 0, 0, 0]","A month after Hurricane Matthew, 800,000 Haiti...","A month after Hurricane Matthew, 800,000 Haiti...","(I, wonder, why, there, are, n't, more, people...",I wonder why there aren't more people trying t...,"(A, month, after, Hurricane, Matthew, ,, 800,0...","A month after Hurricane Matthew, 800,000 Haiti...",True,False
2,35,5,1,4,35_5_person1_4,17,"After reading the article, you can't help but ...","After reading the article, you can't help but ...",train,1,6,1,29,85000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]","A month after Hurricane Matthew, 800,000 Haiti...","A month after Hurricane Matthew, 800,000 Haiti...","(After, reading, the, article, ,, you, ca, n't...","After reading the article, you can't help but ...","(A, month, after, Hurricane, Matthew, ,, 800,0...","A month after Hurricane Matthew, 800,000 Haiti...",True,False
3,213,6,1,5,213_6_person1_5,16,It is so sad that someone who had such an amaz...,It is so sad that someone who had such an amaz...,train,2,5,1,28,50000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]",Miami Marlins star pitcher Jose Fernandez kill...,Miami Marlins star pitcher Jose Fernandez kill...,"(It, is, so, sad, that, someone, who, had, suc...",It is so sad that someone who had such an amaz...,"(Miami, Marlins, star, pitcher, Jose, Fernande...",Miami Marlins star pitcher Jose Fernandez kill...,True,False
4,213,8,1,7,213_8_person1_7,30,"From reading the article, it looks like the wo...","From reading the article, it looks like the wo...",train,1,6,3,37,40000,[Neutral],"[0, 0, 0, 0, 0, 1, 0, 0]",Miami Marlins star pitcher Jose Fernandez kill...,Miami Marlins star pitcher Jose Fernandez kill...,"(From, reading, the, article, ,, it, looks, li...","From reading the article, it looks like the wo...","(Miami, Marlins, star, pitcher, Jose, Fernande...",Miami Marlins star pitcher Jose Fernandez kill...,True,False


In [4]:
file2 = 'data/df_dev.pkl'
df_dev = pd.read_pickle(file2)
print(df_dev.shape)
df_dev.head()

(207, 23)


Unnamed: 0,article_id,conversation_id,speaker_number,essay_id,speaker_id,essay,essay_clean,split,gender,education,race,age,income,emotion,target_encoded,article,article_clean,essay_clean_docs,essay_clean_spellchecked,article_clean_docs,article_clean_spellchecked,compare1,compare2
0,35,1,1,0,68,How sad is it that this kind of pain and suffe...,How sad is it that this kind of pain and suffe...,dev,2,2,1,21,20000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]","A month after Hurricane Matthew, 800,000 Haiti...","A month after Hurricane Matthew, 800,000 Haiti...","(How, sad, is, it, that, this, kind, of, pain,...",How sad is it that this kind of pain and suffe...,"(A, month, after, Hurricane, Matthew, ,, 800,0...","A month after Hurricane Matthew, 800,000 Haiti...",False,False
1,35,4,1,3,79,The article is kind of tragic and hits close t...,The article is kind of tragic and hits close t...,dev,1,6,3,33,64000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]","A month after Hurricane Matthew, 800,000 Haiti...","A month after Hurricane Matthew, 800,000 Haiti...","(The, article, is, kind, of, tragic, and, hits...",The article is kind of tragic and hits close t...,"(A, month, after, Hurricane, Matthew, ,, 800,0...","A month after Hurricane Matthew, 800,000 Haiti...",True,False
2,213,7,1,6,68,"I think that these kinds of stories, are sad, ...","I think that these kinds of stories, are sad, ...",dev,2,2,1,21,20000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]",Miami Marlins star pitcher Jose Fernandez kill...,Miami Marlins star pitcher Jose Fernandez kill...,"(I, think, that, these, kinds, of, stories, ,,...","I think that these kinds of stories, are sad, ...","(Miami, Marlins, star, pitcher, Jose, Fernande...",Miami Marlins star pitcher Jose Fernandez kill...,True,False
3,213,9,1,8,84,It's crazy that random accidents like this hap...,It's crazy that random accidents like this hap...,dev,2,4,1,25,55000,[Neutral],"[0, 0, 0, 0, 0, 1, 0, 0]",Miami Marlins star pitcher Jose Fernandez kill...,Miami Marlins star pitcher Jose Fernandez kill...,"(It, 's, crazy, that, random, accidents, like,...",It's crazy that random accidents like this hap...,"(Miami, Marlins, star, pitcher, Jose, Fernande...",Miami Marlins star pitcher Jose Fernandez kill...,True,False
4,78,12,1,11,68,This story makes me so so sad.... As someone w...,This story makes me so so sad.... As someone w...,dev,2,2,1,21,20000,[Sadness],"[0, 0, 0, 0, 0, 0, 1, 0]",Brothers Behind Bars — “The only photograph I...,"Brothers Behind Bars — ""The only photograph I ...","(This, story, makes, me, so, so, sad, ...., As...",This story makes me so so sad.... As someone w...,"(Brothers, Behind, Bars, —, "", The, only, phot...","Brothers Behind Bars — ""The only photograph I ...",True,True


## Clean text and get word counts

In [8]:
df_train['essay_clean2'] = df_train['essay_clean_spellchecked'].apply( clean_text )
df_dev['essay_clean2']   = df_dev['essay_clean_spellchecked'].apply( clean_text )

df_train_exploded = df_train.explode('emotion')
df_dev_exploded   = df_dev.explode('emotion')

In [9]:
labels = sorted(df_train_exploded['emotion'].unique().tolist())
labels

['Anger', 'Disgust', 'Fear', 'Hope', 'Joy', 'Neutral', 'Sadness', 'Surprise']

In [10]:
# get aboslute and relative frequencies of all words in a category
freqs = dict()
for label in labels:
    all_words = ' '.join(df_train_exploded[ df_train_exploded['emotion']==label ]['essay_clean2'].tolist()).split()
    total_len    = len(all_words)
    res          = Counter(all_words).most_common()
    res          = [[i[0], i[1]/total_len, i[1]] for i in res]
    freqs[label] = res

## Unique words per category - MAYBE NEED A SPELLCHECK!

In [11]:
freqs3 = deepcopy(freqs)
for k, v in freqs3.items():
    freqs3[k] = [i[0] for i in v]
    
for k, v in freqs3.items():
    other_cats = [i for i in freqs3.keys() if i != k]
    other_words = []
    for cat in other_cats:
        other_words.extend( freqs3[cat] )
    freqs3[k] = [w for w in v if w not in other_words]

In [14]:
print('Number of words that occur only in specific category:')
for k, v in freqs3.items():
    print(f'\t{k.upper():>10}: {len(v)}')

Number of words that occur only in specific category:
	     ANGER: 286
	   DISGUST: 278
	      FEAR: 60
	      HOPE: 38
	       JOY: 21
	   NEUTRAL: 1124
	   SADNESS: 2710
	  SURPRISE: 485


In [15]:
print('Words that occur only in specific category:\n')
for k, v in freqs3.items():
    print(f'{k.upper()}: {len(v)}\n')
    print(sorted(v))
    print('\n', '='*75, '\n', sep='')

Words that occur only in specific category:

ANGER: 286

['abalone', 'accommodated', 'active', 'adapt', 'add', 'aggresive', 'aggressive', 'aggressively', 'allegations', 'allows', 'ally', 'amusing', 'anyhow', 'arbitrary', 'asian', 'asians', 'assesses', 'assessing', 'assize', 'assuredly', 'athlete', 'b', 'bail', 'bang', 'bathed', 'beer', 'behinds', 'behooves', 'bitches', 'blacks', 'block', 'boot', 'brazil', 'breasts', 'brock', 'brockovich', 'brown', 'bucket', 'buddies', 'bullshit', 'buts', 'calling', 'casual', 'catcalls', 'celebrating', 'celebrity', 'census', 'cents', 'chain', 'cleanup', 'clip', 'cloud', 'colors', 'commodity', 'comply', 'compound', 'conceal', 'concept', 'consistent', 'contaminated', 'cooperate', 'correctness', 'crappy', 'crew', 'criminally', 'criticize', 'culprit', 'debate', 'decorated', 'deeds', 'deemed', 'definite', 'denial', 'deporting', 'destructively', 'dichotomy', 'dirty', 'disorder', 'disrespected', 'disrupting', 'distribute', 'doce', 'dramatic', 'dumb', 'dumping'

## Transform text by keeping only category-relevant words

In [16]:
def transform(row):
    emotion = row['emotion']
    text = row['essay_clean2'].split()
    text = [w if w in freqs3[emotion] else '|' for w in text]
    return ' '.join(text)

df_train_exploded['essay_clean3'] = df_train_exploded.apply( transform, axis=1 )
df_dev_exploded['essay_clean3']   = df_dev_exploded.apply( transform, axis=1 )

In [18]:
df_train_exploded[['essay_clean_spellchecked', 'emotion', 'essay_clean2', 'essay_clean3']].head()

Unnamed: 0,essay_clean_spellchecked,emotion,essay_clean2,essay_clean3
0,It breaks my heart to see people living in tho...,Hope,it breaks my heart to see people living in tho...,| | | | | | | | | | | | | | | | | | | | | | | ...
0,It breaks my heart to see people living in tho...,Sadness,it breaks my heart to see people living in tho...,| breaks | heart | | | living | | conditions |...
1,I wonder why there aren't more people trying t...,Anger,i wonder why there aren t more people trying t...,| | | | | | | | | | | | | | | | | | | | | | | ...
2,"After reading the article, you can't help but ...",Sadness,after reading the article you can t help but f...,| | | | | | | help | | | | | terrible | | | | ...
3,It is so sad that someone who had such an amaz...,Sadness,it is so sad that someone who had such an amaz...,| | | | | someone | | | | | | died | | | freak...


In [30]:
df_train_exploded['emotion'].value_counts()

Sadness     383
Neutral     240
Anger       124
Disgust     100
Fear         33
Hope         32
Surprise     19
Joy          10
Name: emotion, dtype: int64

In [31]:
df_dev_exploded['emotion'].value_counts()

Sadness     100
Neutral      54
Anger        38
Disgust      24
Hope         16
Fear          8
Surprise      3
Joy           2
Name: emotion, dtype: int64

In [32]:
# review dev set first
for emotion in sorted(df_dev_exploded['emotion'].unique().tolist()):
    df_temp = df_dev_exploded[ df_dev_exploded['emotion']==emotion ]
    print('\nCATEGORY:', emotion.upper(), '\n')
    for i, j in df_temp[['essay_clean2', 'essay_clean3']].values:
        print(i, '\n')
        print(j, '\n')
        print('\n', '='*75, '\n')


CATEGORY: ANGER 

i hate hearing stories like this i feel so bad for veterans not only for what they have to deal with if they were on the battlefield but because of the lack of respect for them in this country even if you don t agree with the war we are fighting that is not their fault don t take it out on them take it out on their leaders 

| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 



so i just read an article about how a father and his son died from a fall while hiking because there was a problem with the rescue chopper and they couldn t get to them in time i think that is completely unacceptable why wasn t there a second rescue chopper is there no such thing as a back up the loss of life could have been prevented 

| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 



it 's a shame that this keeps hap

In [19]:
# compare with train set
for emotion in sorted(df_train_exploded['emotion'].unique().tolist()):
    df_temp = df_train_exploded[ df_train_exploded['emotion']==emotion ]
    print('\nCATEGORY:', emotion.upper(), '\n')
    max_num = 100 if df_temp.shape[0] > 100 else df_temp.shape[0]
    for i, j in df_temp[['essay_clean2', 'essay_clean3']].values[:max_num]:
        print(i, '\n')
        print(j, '\n')
        print('\n', '='*75, '\n')


CATEGORY: ANGER 

i wonder why there aren t more people trying to help these people i understand haiti is not the richest nor less corrupt country but surely there must be a way to help supplies being looted by crowds is understandable because they are hungry and people need food and water to survive we must think of other ways to distribute the food and water 

| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | looted | | | | | | | | | | | | | | | | | | | | | | | distribute | | | | 



i find it disturbing that thousands of acres of forest land are destroyed to mining foraging elephants attracted by the crops in the fields often enter villages resulting in an alarmingly high number of human elephant conflict situations this shouldnt be this is wrong i mean the selfishness of some people shouldnt be a problem for all 

| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 



the fact that we don t take car

__Unique words per each category are unreliable indicators of categories because of the following__. Word frequencies for unique words per category were calculated for the train set and:
* Dev set, categories Anger through Joy, have almost 0 matches. Neutral has medium matches. Sadness and surprise have a lot of matches (match = word that occurs only in this category in the train set)
* Train set - somewhat similar situation, except that categories Anger through Joy have slightly more matches than in the dev set, but there are still a lot of essays with 0 or next to 0 matches

## Words in all or many categories (stopword candidates)

In [57]:
freqs2 = deepcopy(freqs)
for k, v in freqs2.items():
    freqs2[k] = [i[0] for i in v]

sets = [set(v) for k,v in freqs2.items()]
intersection = set.intersection(*sets)
print('Number of unique words in each category:')
for k, v in freqs2.items():
    print(f'\t{k.upper():>10}: {len(v)}')
print('\nNumber of words in the intersection of all categories:', len(intersection))
print('\nIntersection of all categories:\n', sorted(list(intersection)), sep='')

Number of unique words in each category:
	     ANGER: 1957
	   DISGUST: 1604
	      FEAR: 796
	      HOPE: 740
	       JOY: 355
	   NEUTRAL: 2957
	   SADNESS: 3171
	  SURPRISE: 485

Number of words in the intersection of all categories: 116

Intersection of all categories:
["'s", 'a', 'about', 'after', 'again', 'all', 'am', 'america', 'an', 'and', 'animals', 'are', 'around', 'as', 'at', 'bad', 'be', 'because', 'but', 'by', 'can', 'children', 'crazy', 'death', 'do', 'even', 'find', 'for', 'from', 'get', 'go', 'had', 'has', 'have', 'having', 'he', 'his', 'horrible', 'how', 'i', 'if', 'in', 'is', 'it', 'its', 'just', 'killed', 'know', 'like', 'live', 'lives', 'm', 'makes', 'man', 'me', 'mind', 'more', 'most', 'much', 'my', 'need', 'never', 'no', 'not', 'now', 'of', 'on', 'one', 'or', 'other', 'out', 'people', 'place', 'put', 'really', 'sad', 'see', 'seems', 'situation', 'so', 'some', 'something', 'species', 'stop', 'story', 'such', 't', 'take', 'that', 'the', 'their', 'them', 'then', 'the

In [61]:
words_all_categories_combined = []
for k,v in freqs2.items():
    words_all_categories_combined.extend(v)

freqs_all_cats = Counter( words_all_categories_combined ).most_common()
freqs_all_cats

[('the', 8),
 ('to', 8),
 ('and', 8),
 ('i', 8),
 ('that', 8),
 ('of', 8),
 ('is', 8),
 ('a', 8),
 ('it', 8),
 ('this', 8),
 ('in', 8),
 ('people', 8),
 ('are', 8),
 ('we', 8),
 ('they', 8),
 ('t', 8),
 ('be', 8),
 ('for', 8),
 ('have', 8),
 ('not', 8),
 ('these', 8),
 ('do', 8),
 ("'s", 8),
 ('just', 8),
 ('so', 8),
 ('but', 8),
 ('like', 8),
 ('can', 8),
 ('you', 8),
 ('about', 8),
 ('or', 8),
 ('with', 8),
 ('there', 8),
 ('me', 8),
 ('on', 8),
 ('them', 8),
 ('was', 8),
 ('no', 8),
 ('think', 8),
 ('all', 8),
 ('what', 8),
 ('their', 8),
 ('more', 8),
 ('as', 8),
 ('if', 8),
 ('at', 8),
 ('need', 8),
 ('get', 8),
 ('way', 8),
 ('out', 8),
 ('when', 8),
 ('go', 8),
 ('an', 8),
 ('he', 8),
 ('because', 8),
 ('m', 8),
 ('really', 8),
 ('up', 8),
 ('from', 8),
 ('by', 8),
 ('even', 8),
 ('things', 8),
 ('how', 8),
 ('see', 8),
 ('some', 8),
 ('man', 8),
 ('other', 8),
 ('had', 8),
 ('my', 8),
 ('animals', 8),
 ('his', 8),
 ('one', 8),
 ('time', 8),
 ('stop', 8),
 ('would', 8),
 ('take'

In [62]:
# same intersection of all 8 categories
words_8cats = sorted([i[0] for i in freqs_all_cats if i[1]==8])
print(len(words_8cats))
print(words_8cats)

116
["'s", 'a', 'about', 'after', 'again', 'all', 'am', 'america', 'an', 'and', 'animals', 'are', 'around', 'as', 'at', 'bad', 'be', 'because', 'but', 'by', 'can', 'children', 'crazy', 'death', 'do', 'even', 'find', 'for', 'from', 'get', 'go', 'had', 'has', 'have', 'having', 'he', 'his', 'horrible', 'how', 'i', 'if', 'in', 'is', 'it', 'its', 'just', 'killed', 'know', 'like', 'live', 'lives', 'm', 'makes', 'man', 'me', 'mind', 'more', 'most', 'much', 'my', 'need', 'never', 'no', 'not', 'now', 'of', 'on', 'one', 'or', 'other', 'out', 'people', 'place', 'put', 'really', 'sad', 'see', 'seems', 'situation', 'so', 'some', 'something', 'species', 'stop', 'story', 'such', 't', 'take', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they', 'thing', 'things', 'think', 'this', 'time', 'to', 'type', 'up', 'us', 'very', 'war', 'was', 'way', 'we', 'what', 'when', 'with', 'worse', 'would', 'you']


In [63]:
# intersection of 7 categories
words_7cats = sorted([i[0] for i in freqs_all_cats if i[1]==7])
print(len(words_7cats))
print(words_7cats)

121
['age', 'air', 'also', 'always', 'any', 'article', 'attack', 'away', 'back', 'been', 'before', 'being', 'believe', 'both', 'cause', 'child', 'could', 'country', 'day', 'deal', 'did', 'die', 'disease', 'done', 'down', 'during', 'dying', 'each', 'either', 'end', 'facing', 'feel', 'felt', 'first', 'food', 'future', 'girl', 'glad', 'going', 'good', 'government', 'great', 'guess', 'happened', 'happening', 'hard', 'harm', 'hate', 'her', 'high', 'him', 'humans', 'imagine', 'instead', 'interesting', 'job', 'jobs', 'keep', 'kids', 'leave', 'left', 'let', 'life', 'living', 'lost', 'lot', 'make', 'many', 'needs', 'new', 'normal', 'often', 'oil', 'only', 'over', 'pain', 'person', 'places', 'poor', 'population', 'probably', 'problem', 'protect', 'read', 'reading', 'real', 'same', 'say', 'she', 'should', 'show', 'sick', 'society', 'someone', 'sounds', 'start', 'still', 'suffering', 'sure', 'terrible', 'thinking', 'those', 'though', 'thought', 'twice', 'under', 'water', 'were', 'where', 'which', 

In [64]:
# intersection of 6 categories
words_6cats = sorted([i[0] for i in freqs_all_cats if i[1]==6])
print(len(words_6cats))
print(words_6cats)

162
['able', 'absolutely', 'accident', 'actually', 'affected', 'against', 'aid', 'alarming', 'almost', 'amazing', 'american', 'animal', 'another', 'attacked', 'bears', 'becoming', 'behind', 'better', 'big', 'care', 'carry', 'change', 'come', 'corruption', 'countries', 'creatures', 'cut', 'damage', 'deserve', 'disturbing', 'does', 'doing', 'don', 'dont', 'due', 'elephant', 'else', 'ever', 'every', 'everything', 'extinct', 'fact', 'families', 'family', 'far', 'fix', 'found', 'friend', 'germany', 'gets', 'getting', 'girls', 'give', 'goes', 'group', 'happen', 'happy', 'heart', 'help', 'here', 'herself', 'hit', 'homes', 'honestly', 'hope', 'hopefully', 'however', 'human', 'idea', 'important', 'india', 'information', 'innocent', 'into', 'involved', 'jail', 'justice', 'kind', 'knowing', 'learn', 'less', 'little', 'look', 'love', 'made', 'maybe', 'mean', 'men', 'money', 'mother', 'must', 'name', 'nature', 'news', 'nothing', 'off', 'officers', 'okay', 'our', 'own', 'parent', 'parents', 'part', 

In [65]:
# intersection of 5 categories
words_5cats = sorted([i[0] for i in freqs_all_cats if i[1]==5])
print(len(words_5cats))
print(words_5cats)

245
['act', 'actions', 'agree', 'allow', 'allowed', 'alone', 'already', 'americans', 'amount', 'anti', 'anyone', 'anytime', 'anyway', 'apparently', 'area', 'areas', 'attention', 'becomes', 'best', 'birds', 'bit', 'black', 'blame', 'boggling', 'bring', 'build', 'bunch', 'burn', 'called', 'cannot', 'cant', 'car', 'careful', 'careless', 'case', 'cases', 'caused', 'causes', 'civilians', 'clean', 'clearly', 'climate', 'coal', 'comes', 'coming', 'commit', 'community', 'concerned', 'concerns', 'conditions', 'consequences', 'considered', 'continues', 'control', 'correct', 'couldn', 'd', 'danger', 'dangerous', 'dear', 'deaths', 'decades', 'definitely', 'deported', 'destroyed', 'didn', 'died', 'different', 'difficult', 'drink', 'drop', 'earth', 'east', 'eat', 'economy', 'effects', 'elephants', 'empathy', 'enjoy', 'enough', 'entire', 'environment', 'equality', 'especially', 'everywhere', 'evil', 'example', 'except', 'extremely', 'eye', 'eyes', 'father', 'fear', 'fell', 'few', 'fight', 'figure', '

In [66]:
# intersection of 4 categories
words_4cats = sorted([i[0] for i in freqs_all_cats if i[1]==4])
print(len(words_4cats))
print(words_4cats)

422


In [67]:
# intersection of 3 categories
words_3cats = sorted([i[0] for i in freqs_all_cats if i[1]==3])
print(len(words_3cats))
print(words_3cats)

571
['abandoned', 'above', 'absolute', 'absurd', 'abuse', 'abused', 'accept', 'according', 'action', 'actively', 'actual', 'addiction', 'advance', 'affect', 'affects', 'afghanistan', 'africa', 'ago', 'airline', 'alive', 'alleged', 'alleviate', 'allowing', 'along', 'although', 'among', 'angers', 'anybody', 'apparent', 'appropriate', 'arizona', 'arrested', 'ashamed', 'asked', 'asking', 'assume', 'asylum', 'attracted', 'authorities', 'available', 'avoid', 'avoidable', 'babies', 'badly', 'ban', 'band', 'based', 'basic', 'become', 'begin', 'beings', 'bet', 'biggest', 'birth', 'bodily', 'body', 'bombing', 'book', 'born', 'bothered', 'boy', 'boys', 'breaking', 'breeding', 'brings', 'broke', 'broken', 'brutally', 'building', 'bully', 'bullying', 'burma', 'burning', 'came', 'cancer', 'capacity', 'carelessness', 'catastrophe', 'chance', 'charge', 'childhood', 'china', 'choices', 'cities', 'citizen', 'citizenship', 'cleared', 'closely', 'coat', 'collateral', 'combat', 'common', 'communist', 'comp