In [1]:
import pandas as pd
import nltk
import numpy as np
from nltk.util import ngrams
  
# Lexicon resources used by the rule-based scorer in the cells below.
# interjections_df: looked up by its "word" column; emotion columns are
#   capitalised ("Anger" ... "Trust").
interjections_df = pd.read_csv('algo/interjections.csv')
# negations_df / amplifiers_df: looked up by "word"; the modifier strength is
#   read from the "score" column.
negations_df = pd.read_csv('algo/negations.csv')
amplifiers_df = pd.read_csv('algo/amplifiers.csv')
# emotionlex_df / emointensity_df: looked up by "word"; the eight emotion
#   values are taken by position (columns 1..8) by the lookup helpers.
emotionlex_df = pd.read_csv('algo/emolex_words.csv')
emointensity_df = pd.read_csv('algo/emolex_intensity.csv')

# Pre-cleaned tweets; the text lives in the "processed" column and each
# emotion has its own 0/1 label column.
train_df = pd.read_csv("processed/clean-train.csv")
dev_df = pd.read_csv("processed/clean-dev.csv")
# Display the intensity lexicon as a quick sanity check of the load.
emointensity_df

Unnamed: 0,word,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,outraged,0.964,0.0,0.000,0.000,0.0,0.000,0.0,0.000
1,brutality,0.959,0.0,0.000,0.922,0.0,0.000,0.0,0.000
2,hatred,0.953,0.0,0.680,0.703,0.0,0.641,0.0,0.000
3,hateful,0.940,0.0,0.703,0.578,0.0,0.575,0.0,0.000
4,terrorize,0.939,0.0,0.000,0.922,0.0,0.781,0.0,0.000
...,...,...,...,...,...,...,...,...,...
5888,wack,0.000,0.0,0.000,0.000,0.0,0.000,0.0,0.188
5889,insecurities,0.000,0.0,0.000,0.000,0.0,0.000,0.0,0.180
5890,weaknesses,0.000,0.0,0.000,0.000,0.0,0.000,0.0,0.180
5891,addict,0.000,0.0,0.000,0.000,0.0,0.000,0.0,0.172


In [2]:
def get_emotion(word):
    """Look `word` up in the emotion lexicon (`emotionlex_df`).

    Returns a `(found, scores)` pair. When at least one row has a "word"
    value equal to `word`, `found` is True and `scores` is a 2-D array with
    columns 1..8 (selected by position) of every matching row. When nothing
    matches, the pair is `(False, 0)`.
    """
    matches = emotionlex_df.loc[emotionlex_df["word"] == word]
    if matches.empty:
        return False, 0
    return True, matches.iloc[:, 1:9].values


In [3]:
def detect_negation(word):
    """Check whether `word` is listed in `negations_df`.

    Returns `(True, scores)` where `scores` is the array of "score" values of
    every row whose "word" equals `word`, or `(False, 0)` when no row matches.
    The comparison is exact (case-sensitive).
    """
    hits = negations_df.loc[negations_df['word'] == word]
    if hits.empty:
        return False, 0
    return True, hits['score'].values


def detect_amplifier(word):
    """Check whether `word` is listed in `amplifiers_df`.

    Returns `(True, scores)` where `scores` is the array of "score" values of
    every row whose "word" equals `word`, or `(False, 0)` when no row matches.
    The comparison is exact (case-sensitive).
    """
    hits = amplifiers_df.loc[amplifiers_df['word'] == word]
    if hits.empty:
        return False, 0
    return True, hits['score'].values
def detect_interjection(word):
    """Case-insensitively look `word` up in `interjections_df`.

    Returns `(True, scores)` where `scores` is a 2-D array holding the eight
    emotion columns ('Anger' ... 'Trust') of every matching row, or
    `(False, 0)` when no row matches.

    The lookup does not modify `interjections_df`: the comparison runs against
    a lower-cased copy of the "word" column rather than overwriting the column
    of the shared frame on every call.
    """
    word = word.lower()
    lowered_words = interjections_df['word'].str.lower()
    interjection = interjections_df[lowered_words == word]
    if not interjection.empty:
        return True, interjection[['Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust']].values
    return False, 0

In [4]:
def mapout_negatives(arr):
    """Fold negative scores onto their paired emotion, modifying `arr` in place.

    Positions are paired 0<->3, 1<->6, 2<->7 and 4<->5. Every negative entry
    has its absolute value added to the paired position and is then reset to
    0.0. Entries are visited left to right, so a position that becomes
    negative-free earlier can still receive mass later. The same (mutated)
    `arr` object is returned.
    """
    partner_positions = (3, 6, 7, 0, 5, 4, 1, 2)
    index_mapping = dict(enumerate(partner_positions))

    for position in range(len(arr)):
        if not arr[position] < 0:
            continue
        # Transfer the magnitude to the paired slot, then clear this one.
        arr[index_mapping[position]] += abs(arr[position])
        arr[position] = 0.0
    return arr

In [5]:
def normalize_array(arr):
    """Min-max scale an emotion vector to the range [0, 1].

    Negative entries are first folded onto their paired emotion by
    `mapout_negatives`, which modifies `arr` in place.

    Returns:
        * `arr` itself when it is empty or when every entry is zero;
        * a list of 0.0 when all entries share the same non-zero value, since
          a zero range cannot be scaled (previously this divided by zero);
        * otherwise a new list with `(value - min) / (max - min)` per entry.
    """
    new_arr = mapout_negatives(arr)

    # min()/max() raise on empty input; there is nothing to scale anyway.
    if len(new_arr) == 0:
        return arr

    min_value = min(new_arr)
    max_value = max(new_arr)

    if not min_value and not max_value:
        return arr

    value_range = max_value - min_value
    if value_range == 0:
        # Constant vector: map to 0.0 (same convention as min-max scalers that
        # treat a zero range as "no spread") instead of dividing by zero.
        return [0.0 for _ in new_arr]

    return [(value - min_value) / value_range for value in new_arr]

def get_average(arr,len):
    """Divide the first `len` entries of `arr` by `len`, in place.

    `arr` is modified and then returned wrapped in `np.array`. Note that the
    `len` parameter shadows the builtin of the same name inside this function.
    """
    divisor = len
    for position in range(divisor):
        arr[position] /= divisor
    return np.array(arr)

In [6]:
def get_emotion_intensity(word):
    """Fetch the intensity values for `word` from `emointensity_df`.

    When the word is present, returns a 2-D array with columns 1..8 (selected
    by position) of every matching row. When it is absent, returns a 1-D
    array of eight zeros. Callers therefore see two different shapes and must
    flatten or broadcast accordingly.
    """
    rows = emointensity_df.loc[emointensity_df["word"] == word]
    if rows.empty:
        # No intensity information: contribute nothing for any emotion.
        return np.zeros(8)
    return rows.iloc[:, 1:9].values
    

In [7]:
import numpy as np
from nltk.tokenize import word_tokenize

def get_emotion_score(text):
    """Compute a normalized eight-value emotion vector for `text`.

    The text is tokenized with `word_tokenize` and scanned left to right:

    * A negation or amplifier word is pushed onto `multiplier` (the list of
      pending modifiers) and otherwise contributes nothing.
    * An emotion word (found by `get_emotion`) is scored as
      `get_emotion_intensity(word)` plus the first matching lexicon row,
      flattened and passed through `normalize_array`. Every non-zero component
      `c` becomes `c * m + c`, where `m` is the combined pending modifier (or
      the component is used as-is when no modifier is pending). The result is
      added to the running total and the pending modifiers are cleared.
    * Any other word clears the pending modifiers.
    * Independently of the above, every token is also checked with
      `detect_interjection`; the first matching row is accumulated separately.

    After the scan, the interjection sum is passed through `get_average` with
    the vector length as divisor, added to the total, and the total is
    normalized with `normalize_array`.

    Args:
        text: The string to score.

    Returns:
        A numpy array built from the result of `normalize_array`.
    """
    sentence = word_tokenize(text)
    total_score = np.zeros(8)  # Running sum of the per-word score vectors
    multiplier = []  # Pending modifiers, each a one-key dict: {"neg": score} or {"amp": score}
    inter_arr = np.zeros(8)  # Running sum of interjection score vectors

    for word in sentence:
        is_neg, neg_score = detect_negation(word)
        if not is_neg:
            is_amp, amp_score = detect_amplifier(word)
            if not is_amp:
                is_emotion, scores = get_emotion(word)
                if not is_emotion:
                    # Neither modifier nor emotion word: drop any pending modifiers.
                    multiplier = []
                else:
                    temp_score = []  # Modified components for this word, one per emotion
                    new_score = np.zeros(8)
                    intensity_score = get_emotion_intensity(word)
                    # scores[0] is the first matching lexicon row; the sum is
                    # flattened because the intensity lookup may be 2-D.
                    new_score = np.add(intensity_score, scores[0])
                    new_score = new_score.flatten()
                    new_score = normalize_array(new_score)
                    # print(new_score)
                    for x in range(len(new_score)):
                        if new_score[x] == 0:
                            # Zero components stay zero regardless of modifiers.
                            temp_score.append(0)
                            continue
                        else:
                            if multiplier:  # Ensure multiplier is not empty
                                # Start from the value of the first pending modifier.
                                previous_score = multiplier[0][list(multiplier[0].keys())[0]]
                                previous_element = multiplier[0]
                                if len(multiplier) > 1:
                                    # Fold the remaining modifiers into previous_score.
                                    # NOTE(review): previous_element is never reassigned in
                                    # this loop, so each modifier is compared with the kind
                                    # of the FIRST modifier only - confirm this is intended.
                                    for i in range(1, len(multiplier)):
                                        if 'neg' in multiplier[i] and 'neg' in previous_element:
                                            # negation + negation: scores are summed
                                            previous_score = multiplier[i]['neg'] + previous_score
                                        elif 'amp' in multiplier[i] and 'amp' in previous_element:
                                            # amplifier + amplifier: scores are averaged
                                            previous_score = (multiplier[i]['amp'] + previous_score) / 2
                                        elif 'amp' in multiplier[i] and 'neg' in previous_element:
                                            # amplifier following a negation scales the running score
                                            previous_score = (multiplier[i]['amp'] * previous_score ) + previous_score
                                        elif 'neg' in multiplier[i] and 'amp' in previous_element:
                                            # negation following an amplifier
                                            previous_score = (multiplier[i]['neg'] * previous_score) + multiplier[i]['neg']
                                        if i == len(multiplier) - 1:
                                            # Last modifier folded: apply the combined value to the component.
                                            previous_score = (new_score[x] * previous_score) + new_score[x]
                                else:
                                    # Single modifier: apply it directly to the component.
                                    previous_score = (new_score[x] * previous_score) + new_score[x]
                                    
                                temp_score.append(previous_score)
                            else:
                                temp_score.append(new_score[x])
                    total_score += np.array(temp_score)
                    # Modifiers only affect the emotion word that follows them.
                    multiplier = []
            else:
                multiplier.append({"amp": amp_score[0]})
        else:
            multiplier.append({"neg": neg_score[0]})

        # Interjections are checked for every token, including modifier words.
        is_inter, inter_scores = detect_interjection(word)
        if is_inter:
            inter_arr += np.array(inter_scores[0])
    # NOTE(review): the divisor is the vector length (8), not the number of
    # interjections found - confirm this is the intended "average".
    total_score += np.array(get_average(inter_arr, len(inter_arr))) 
    return np.array(normalize_array(total_score))

# Quick manual check of the scorer on a negated emotion word.
text = "not unhappy"
print(get_emotion_score(text))

[0. 0. 0. 0. 0. 0. 0. 0.]


In [8]:
# Cast to str first so every cell is a string before it reaches the tokenizer
# (presumably to cope with empty/NaN rows - TODO confirm), then score each tweet.
train_df['processed'] = train_df['processed'].astype(str)
train_df['emotion_score'] = train_df['processed'].apply(get_emotion_score)
# Same two steps for the dev split.
dev_df['processed'] = dev_df['processed'].astype(str)
dev_df['emotion_score'] = dev_df['processed'].apply(get_emotion_score)

In [9]:
# Function to check if all values in a list are within the range [0, 1]
def check_emotion_scores(emotion_scores):
    """Report whether every score vector lies entirely within [0, 1].

    Iterates over `emotion_scores` in order. At the first vector containing a
    value outside [0, 1], prints its position (0-based, counted by iteration
    order), the vector itself and a message, then stops. If no such vector is
    found, prints a single confirmation line. Always returns None.
    """
    for position, vector in enumerate(emotion_scores):
        has_out_of_range = any(not (0 <= component <= 1) for component in vector)
        if has_out_of_range:
            print(f"Row index: {position}")
            print(f"Emotion score: {vector}")
            print("Emotion score is not within the range [0, 1]")
            return
    print("All emotion scores are within the range [0, 1]")

# Call the function with your emotion scores
check_emotion_scores(train_df['emotion_score'])

All emotion scores are within the range [0, 1]


In [10]:
import nltk
from nltk.corpus import wordnet

def filter_pos_tags(text):
    """Keep only the tokens of `text` whose POS tag is in an allow-list.

    The text is tokenized with `nltk.word_tokenize` and tagged with
    `nltk.pos_tag`; tokens tagged as adjectives (JJ*), adverbs (RB*),
    verbs (VB*), interjections (UH) or nouns (NN*) are returned in their
    original order.
    """
    desired_tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "UH", "NN", "NNS", "NNP", "NNPS"]
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    filtered_words = []
    for token, tag in tagged_tokens:
        if tag in desired_tags:
            filtered_words.append(token)
    return filtered_words



In [11]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize


lemmatizer = WordNetLemmatizer()
def tokenize(text):
    """Tokenize `text`, drop English stop words and lemmatize the rest.

    Words that appear in the emotion lexicon (`emotionlex_df["word"]`) are
    kept verbatim so they still match the lexicon later; every other word is
    passed through the module-level `lemmatizer`.

    The lexicon vocabulary comes from the `emotionlex_df` frame loaded at the
    top of the notebook (same file, 'algo/emolex_words.csv') and is turned
    into a set once per call. Re-reading the CSV on every call and rebuilding
    the word list for every token made this function quadratic on the large
    joined corpora it is called with.

    Returns:
        A list of tokens in their original order.
    """
    word_tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    lexicon_words = set(emotionlex_df['word'])

    lemmatized_tokens = []
    for word in word_tokens:
        if word in stop_words:
            continue
        if word in lexicon_words:
            # Lexicon words are kept as-is so later lookups still match.
            lemmatized_tokens.append(word)
        else:
            lemmatized_tokens.append(lemmatizer.lemmatize(word))

    return lemmatized_tokens


In [12]:
train_df

Unnamed: 0,processed,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,emotion_score
0,worry payment problem may never joyce meyer mo...,0,1,0,0,1,0,0,1,"[0.0, 1.0, 0.0, 0.7713618662723492, 0.48444064..."
1,whatever decide make sure make happy,0,1,0,0,1,0,0,1,"[0.0, 0.8696868008948546, 0.125, 0.0, 1.0, 0.0..."
2,help drowning thoughts,0,0,0,1,0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,help brother drowning,0,0,0,1,0,0,1,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"
4,also help majority nfl coaching inept bill bri...,1,1,1,0,1,0,0,0,"[1.0, 0.0, 0.9595457771469126, 0.0, 0.69720597..."
...,...,...,...,...,...,...,...,...,...,...
11111,feel like people life very supportive others n...,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
11112,feel loyal sen,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.6251428571428572, 0.84857142..."
11113,feel complicit supporting owning copy,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"
11114,really feel like supporting helping,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"


In [13]:
import re
import nltk
from nltk.probability import FreqDist

# Initialize lists to store unigram and bigram frequency distributions for each emotion
# One FreqDist per emotion is appended to each list, in the order of `emotions`,
# so position i in uni_lst / bi_lst corresponds to emotions[i].
uni_lst = []
bi_lst = []
emotions = ["anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust"]

# Module-level helpers; `lemmatizer` is the object used inside tokenize().
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to compute unigram frequency distribution
def uni_freq(x, df):
    """Build the unigram frequency distribution for one emotion label.

    All "processed" texts of the rows where column `x` equals 1 are joined
    into a single string, newlines are removed, the string is cleaned with
    `tokenize`, re-joined and tokenized again with `nltk.word_tokenize`.
    The resulting tokens are counted in a `FreqDist`.
    """
    corpus = ' '.join(df.loc[df[x] == 1, "processed"])
    corpus = corpus.replace('\n', '')
    cleaned = ' '.join(tokenize(corpus))
    return FreqDist(nltk.word_tokenize(cleaned))

# Function to compute bigram frequency distribution
def bi_freq(x, df):
    """Build the bigram frequency distribution for one emotion label.

    All "processed" texts of the rows where column `x` equals 1 are joined
    into a single string, newlines are removed, the string is cleaned with
    `tokenize`, re-joined, tokenized with `nltk.word_tokenize` and turned
    into bigrams that are counted in a `FreqDist`.

    Because the texts are joined before bigrams are formed, the last token of
    one text and the first token of the next are counted as a bigram too.
    """
    corpus = ' '.join(df.loc[df[x] == 1, "processed"])
    corpus = corpus.replace('\n', '')
    cleaned = ' '.join(tokenize(corpus))
    token_pairs = nltk.bigrams(nltk.word_tokenize(cleaned))
    return FreqDist(token_pairs)


# Compute unigram and bigram frequency distributions for each emotion
# FreqDist objects support "+", so the train and dev counts are merged into a
# single table per emotion.
# NOTE(review): the tables are built from rows selected by their gold label in
# BOTH train_df and dev_df, and are later used to compute features for dev_df.
# That lets dev labels leak into dev features - confirm whether dev_df should
# be excluded here.
for x in emotions:
    uni_lst.append(uni_freq(x, train_df) + uni_freq(x, dev_df))
    bi_lst.append(bi_freq(x, train_df) + bi_freq(x, dev_df))


In [14]:
bi_lst

[FreqDist({('laughing', 'loud'): 48, ('customer', 'service'): 18, ('let', 'u'): 15, ('shake', 'head'): 14, ('look', 'like'): 11, ('blood', 'boiling'): 11, ('ever', 'ever'): 10, ('feel', 'like'): 10, ('last', 'night'): 9, ('year', 'old'): 9, ...}),
 FreqDist({('laughing', 'loud'): 46, ('watch', 'amazing'): 34, ('amazing', 'broadcast'): 34, ('let', 'u'): 24, ('feel', 'like'): 24, ('happy', 'birthday'): 22, ('look', 'like'): 17, ('lively', 'musically'): 17, ('oh', 'god'): 14, ('good', 'morning'): 14, ...}),
 FreqDist({('laughing', 'loud'): 43, ('customer', 'service'): 18, ('look', 'like'): 14, ('shake', 'head'): 14, ('let', 'u'): 12, ('feel', 'like'): 12, ('year', 'old'): 11, ('last', 'night'): 10, ('ever', 'ever'): 10, ('every', 'time'): 10, ...}),
 FreqDist({('feel', 'like'): 139, ('feeling', 'little'): 55, ('feel', 'little'): 42, ('feel', 'pressured'): 37, ('feeling', 'bit'): 33, ('still', 'feeling'): 31, ('still', 'feel'): 30, ('feel', 'agitated'): 29, ('feel', 'weird'): 29, ('feel', 

In [15]:
import pickle
# Save the uni_lst and bi_lst lists
# Two objects are pickled back to back into the same file, so a reader must
# call pickle.load twice on the opened file: first for uni_lst, then bi_lst.
with open('processed/frequency_distributions.pkl', 'wb') as f:
    pickle.dump(uni_lst, f)
    pickle.dump(bi_lst, f)

In [16]:
new_df = pd.DataFrame()

# Per-emotion n-gram features for the training tweets. For each tweet, every
# token (or bigram) found in the emotion's frequency table contributes
# count / number-of-distinct-entries; unknown tokens contribute 0.
# All eight unigram columns are created before the bigram columns because the
# positional slices below rely on that column order.
for i, e in enumerate(emotions):
    print(e)
    unigram_dist = uni_lst[i]
    new_df["freq_" + e] = train_df["processed"].apply(
        lambda tweet: sum([
            unigram_dist.get(token) / len(unigram_dist.keys())
            if unigram_dist.get(token) is not None
            else 0
            for token in nltk.word_tokenize(tweet)
        ])
    )
for i, e in enumerate(emotions):
    bigram_dist = bi_lst[i]
    new_df["bi_" + e] = train_df["processed"].apply(
        lambda tweet: sum([
            bigram_dist.get(pair) / len(bigram_dist.keys())
            if bigram_dist.get(pair) is not None
            else 0
            for pair in nltk.bigrams(nltk.word_tokenize(tweet))
        ])
    )

# Columns 0-7 are the unigram features, 8-15 the bigram features.
train_df['unigram_freq'] = new_df.iloc[:, 0:8].values.tolist()
train_df['bigram_freq'] = new_df.iloc[:, 8:16].values.tolist()
train_df

anger
anticipation
disgust
fear
joy
sadness
surprise
trust


Unnamed: 0,processed,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,emotion_score,unigram_freq,bigram_freq
0,worry payment problem may never joyce meyer mo...,0,1,0,0,1,0,0,1,"[0.0, 1.0, 0.0, 0.7713618662723492, 0.48444064...","[0.018260095011876483, 0.04185407296351824, 0....","[0.0, 0.0006702885768715163, 0.0, 0.0, 0.00072..."
1,whatever decide make sure make happy,0,1,0,0,1,0,0,1,"[0.0, 0.8696868008948546, 0.125, 0.0, 1.0, 0.0...","[0.03592636579572447, 0.07083958020989506, 0.0...","[4.5479352374022196e-05, 0.0008114019614760459..."
2,help drowning thoughts,0,0,0,1,0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0031175771971496437, 0.0074962518740629685,...","[0.0, 0.0, 0.0, 8.657259111765215e-05, 0.0, 9...."
3,help brother drowning,0,0,0,1,0,0,1,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0056413301662707836, 0.008870564717641178, ...","[0.0, 0.0, 0.0, 8.657259111765215e-05, 0.0, 0...."
4,also help majority nfl coaching inept bill bri...,1,1,1,0,1,0,0,0,"[1.0, 0.0, 0.9595457771469126, 0.0, 0.69720597...","[0.01766627078384798, 0.020239880059970013, 0....","[0.0010460251046025106, 0.0003880618076624567,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
11111,feel like people life very supportive others n...,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.08521377672209025, 0.08320839580209895, 0.0...","[0.0005912315808622885, 0.0009877936922317082,..."
11112,feel loyal sen,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.6251428571428572, 0.84857142...","[0.008165083135391923, 0.013868065967016492, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00020641..."
11113,feel complicit supporting owning copy,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.008461995249406176, 0.014492753623188406, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00041283..."
11114,really feel like supporting helping,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.05151425178147268, 0.05222388805597201, 0.0...","[0.0004547935237402219, 0.0008466803076271784,..."


In [17]:
new_dev_df = pd.DataFrame()

# Per-emotion n-gram features for the dev tweets. For each tweet, every token
# (or bigram) found in the emotion's frequency table contributes
# count / number-of-distinct-entries; unknown tokens contribute 0.
# All eight unigram columns are created before the bigram columns because the
# positional slices below rely on that column order.
for i, e in enumerate(emotions):
    unigram_dist = uni_lst[i]
    new_dev_df["freq_" + e] = dev_df["processed"].apply(
        lambda tweet: sum([
            unigram_dist.get(token) / len(unigram_dist.keys())
            if unigram_dist.get(token) is not None
            else 0
            for token in nltk.word_tokenize(tweet)
        ])
    )
for i, e in enumerate(emotions):
    bigram_dist = bi_lst[i]
    new_dev_df["bi_" + e] = dev_df["processed"].apply(
        lambda tweet: sum([
            bigram_dist.get(pair) / len(bigram_dist.keys())
            if bigram_dist.get(pair) is not None
            else 0
            for pair in nltk.bigrams(nltk.word_tokenize(tweet))
        ])
    )

# Columns 0-7 are the unigram features, 8-15 the bigram features.
dev_df['unigram_freq'] = new_dev_df.iloc[:, 0:8].values.tolist()
dev_df['bigram_freq'] = new_dev_df.iloc[:, 8:16].values.tolist()
dev_df

Unnamed: 0,processed,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,emotion_score,unigram_freq,bigram_freq
0,oh hidden revenge anger rememberthe time rebutted,1,0,1,0,0,0,0,0,"[1.0, 0.9608745684695051, 0.0625, 0.5389096662...","[0.045724465558194774, 0.034482758620689655, 0...","[0.00027287611424413316, 0.0, 0.00026618162459..."
1,make sure smiling brother,0,1,0,0,1,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0]","[0.020190023752969122, 0.03473263368315842, 0....","[4.5479352374022196e-05, 0.0003527834615113243..."
2,not teamchristine tana done provoke tweeting s...,1,0,1,0,0,0,0,0,"[1.0, 0.0, 0.7194800908704215, 0.9595351087790...","[0.029097387173396674, 0.015242378810594702, 0...","[0.0005912315808622885, 0.0, 0.000621090457388..."
3,great start beginner jump auto trading profita...,0,1,0,0,1,0,0,0,"[0.0, 0.3333333333333333, 0.0, 0.0, 0.66666666...","[0.022416864608076008, 0.04085457271364317, 0....","[4.5479352374022196e-05, 0.0006350102307203836..."
4,best friend driving first time car terrifying,0,0,0,1,0,0,0,0,"[0.0, 1.0, 0.0, 0.0, 0.8405714285714286, 0.0, ...","[0.030136579572446556, 0.042728635682158914, 0...","[0.00018191740949608878, 0.000564453538418119,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
881,not make so angry laughing tweet,1,0,1,0,0,0,0,0,"[1.0, 0.0, 0.805372807017544, 0.0, 0.666666666...","[0.041716152019002375, 0.03310844577711144, 0....","[9.095870474804439e-05, 0.0, 8.872720819839404..."
882,excited watch stateoforigin tonight come nsw o...,0,1,0,0,1,0,0,0,"[0.0, 1.0, 0.0, 0.8138722709826243, 0.48897256...","[0.016033254156769598, 0.03885557221389305, 0....","[9.095870474804439e-05, 0.000493896846115854, ..."
883,blah blah blah kyrie etc leaving boston real n...,1,0,1,0,0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.01232185273159145, 0.0079960019990005, 0.01...","[0.0006367109332363106, 0.0, 0.000621090457388..."
884,things learned wise shepherd never trust flock...,0,0,0,0,0,0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.2967032967032967, 0.0, ...","[0.03681710213776722, 0.05147426286856572, 0.0...","[0.000227396761870111, 0.00014111338460452973,..."


In [18]:
#load nrc emotion lexicon
# The NRC file is tab-separated with no header row: one line per
# (word, emotion) pair plus a 0/1 association flag.
filepath = "data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
# keep_default_na=False stops pandas from turning literal words such as
# "null" or "nan" into missing values.
emolex_df = pd.read_csv(filepath,  names=["word", "emotion", "association"], sep='\t', keep_default_na=False)
emolex_df
# emolex_df['emotion']

Unnamed: 0,word,emotion,association
0,aback,anger,0
1,aback,anticipation,0
2,aback,disgust,0
3,aback,fear,0
4,aback,joy,0
...,...,...,...
141551,zoom,negative,0
141552,zoom,positive,0
141553,zoom,sadness,0
141554,zoom,surprise,0


In [19]:
train_df

Unnamed: 0,processed,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,emotion_score,unigram_freq,bigram_freq
0,worry payment problem may never joyce meyer mo...,0,1,0,0,1,0,0,1,"[0.0, 1.0, 0.0, 0.7713618662723492, 0.48444064...","[0.018260095011876483, 0.04185407296351824, 0....","[0.0, 0.0006702885768715163, 0.0, 0.0, 0.00072..."
1,whatever decide make sure make happy,0,1,0,0,1,0,0,1,"[0.0, 0.8696868008948546, 0.125, 0.0, 1.0, 0.0...","[0.03592636579572447, 0.07083958020989506, 0.0...","[4.5479352374022196e-05, 0.0008114019614760459..."
2,help drowning thoughts,0,0,0,1,0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0031175771971496437, 0.0074962518740629685,...","[0.0, 0.0, 0.0, 8.657259111765215e-05, 0.0, 9...."
3,help brother drowning,0,0,0,1,0,0,1,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0056413301662707836, 0.008870564717641178, ...","[0.0, 0.0, 0.0, 8.657259111765215e-05, 0.0, 0...."
4,also help majority nfl coaching inept bill bri...,1,1,1,0,1,0,0,0,"[1.0, 0.0, 0.9595457771469126, 0.0, 0.69720597...","[0.01766627078384798, 0.020239880059970013, 0....","[0.0010460251046025106, 0.0003880618076624567,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
11111,feel like people life very supportive others n...,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.08521377672209025, 0.08320839580209895, 0.0...","[0.0005912315808622885, 0.0009877936922317082,..."
11112,feel loyal sen,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.6251428571428572, 0.84857142...","[0.008165083135391923, 0.013868065967016492, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00020641..."
11113,feel complicit supporting owning copy,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.008461995249406176, 0.014492753623188406, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00041283..."
11114,really feel like supporting helping,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.05151425178147268, 0.05222388805597201, 0.0...","[0.0004547935237402219, 0.0008466803076271784,..."


In [20]:
def calculate_emotion_score(words, emotion):
    """Average one emotion column of `emotionlex_df` over `words`.

    A word -> value mapping is built from the `emotion` column (indexed by
    "word"); words missing from the lexicon count as 0. Returns the mean of
    the looked-up values over all of `words`.
    """
    lookup = emotionlex_df.set_index('word')[emotion].to_dict()
    return np.mean([lookup.get(token, 0) for token in words])

def calculate_emolex(text):
    """Return the mean lexicon score of `text` for each of the eight emotions.

    The text is split on whitespace and `calculate_emotion_score` is called
    once per emotion, in the fixed order anger, anticipation, disgust, fear,
    joy, sadness, surprise, trust. The results are returned as a list.
    """
    words = text.split()
    emotion_names = ('anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust')
    scores = []
    for name in emotion_names:
        scores.append(calculate_emotion_score(words, name))
    return scores

text = "worry payment problem may never joyce meyer motivation leadership worry"
print(calculate_emolex(text))

[0.0, 0.4, 0.0, 0.3, 0.2, 0.3, 0.0, 0.2]


In [21]:
# Load the NRC word-level lexicon (tab-separated, no header row) and build a
# dict keyed by (emotion, word) -> association for fast per-token lookups.
filepath = "data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
# keep_default_na=False stops pandas from turning literal words such as
# "null" or "nan" into missing values.
emolex_dataframe = pd.read_csv(filepath,  names=["word", "emotion", "association"], sep='\t', keep_default_na=False)
emotion_scores = emolex_dataframe.set_index(['emotion', 'word'])['association'].to_dict()

def get_emolex(text):
    """Return the mean NRC association of `text` for each emotion.

    The emotions are the unique values of `emolex_dataframe['emotion']`, in
    order of first appearance, with 'positive' and 'negative' removed. For
    each one, every whitespace-separated token of `text` is looked up in
    `emotion_scores` under the key `(emotion, token)` (0 when absent) and the
    values are averaged.

    The text is split on any run of whitespace, so repeated, leading or
    trailing spaces no longer add empty tokens that would lower the mean.
    Text without any token yields an all-zero vector.

    Returns:
        A 1-D float numpy array with one entry per emotion.
    """
    all_emotions = emolex_dataframe['emotion'].unique()
    filtered_emotions = all_emotions[~np.isin(all_emotions, ['positive', 'negative'])]

    words = text.split()
    if not words:
        # Nothing to average; avoid np.mean([]) which would give NaN.
        return np.zeros(len(filtered_emotions))

    return np.array([
        np.mean([emotion_scores.get((emotion, word), 0) for word in words])
        for emotion in filtered_emotions
    ], dtype=float)

text = "worry payment problem may never joyce meyer motivation leadership worry"
get_emolex(text)

array([0. , 0.4, 0. , 0.3, 0.2, 0.3, 0. , 0.1])

In [22]:
train_df['emolex'] = train_df['processed'].apply(get_emolex)
train_df

Unnamed: 0,processed,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,emotion_score,unigram_freq,bigram_freq,emolex
0,worry payment problem may never joyce meyer mo...,0,1,0,0,1,0,0,1,"[0.0, 1.0, 0.0, 0.7713618662723492, 0.48444064...","[0.018260095011876483, 0.04185407296351824, 0....","[0.0, 0.0006702885768715163, 0.0, 0.0, 0.00072...","[0.0, 0.4, 0.0, 0.3, 0.2, 0.3, 0.0, 0.1]"
1,whatever decide make sure make happy,0,1,0,0,1,0,0,1,"[0.0, 0.8696868008948546, 0.125, 0.0, 1.0, 0.0...","[0.03592636579572447, 0.07083958020989506, 0.0...","[4.5479352374022196e-05, 0.0008114019614760459...","[0.0, 0.16666666666666666, 0.0, 0.0, 0.1666666..."
2,help drowning thoughts,0,0,0,1,0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0031175771971496437, 0.0074962518740629685,...","[0.0, 0.0, 0.0, 8.657259111765215e-05, 0.0, 9....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,help brother drowning,0,0,0,1,0,0,1,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0056413301662707836, 0.008870564717641178, ...","[0.0, 0.0, 0.0, 8.657259111765215e-05, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.33333333..."
4,also help majority nfl coaching inept bill bri...,1,1,1,0,1,0,0,0,"[1.0, 0.0, 0.9595457771469126, 0.0, 0.69720597...","[0.01766627078384798, 0.020239880059970013, 0....","[0.0010460251046025106, 0.0003880618076624567,...","[0.08333333333333333, 0.0, 0.08333333333333333..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11111,feel like people life very supportive others n...,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.08521377672209025, 0.08320839580209895, 0.0...","[0.0005912315808622885, 0.0009877936922317082,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
11112,feel loyal sen,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.6251428571428572, 0.84857142...","[0.008165083135391923, 0.013868065967016492, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00020641...","[0.0, 0.0, 0.0, 0.3333333333333333, 0.33333333..."
11113,feel complicit supporting owning copy,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.008461995249406176, 0.014492753623188406, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00041283...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2]"
11114,really feel like supporting helping,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.05151425178147268, 0.05222388805597201, 0.0...","[0.0004547935237402219, 0.0008466803076271784,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2]"


In [23]:
dev_df['emolex'] = dev_df['processed'].apply(get_emolex)
dev_df

Unnamed: 0,processed,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,emotion_score,unigram_freq,bigram_freq,emolex
0,oh hidden revenge anger rememberthe time rebutted,1,0,1,0,0,0,0,0,"[1.0, 0.9608745684695051, 0.0625, 0.5389096662...","[0.045724465558194774, 0.034482758620689655, 0...","[0.00027287611424413316, 0.0, 0.00026618162459...","[0.2857142857142857, 0.2857142857142857, 0.0, ..."
1,make sure smiling brother,0,1,0,0,1,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0]","[0.020190023752969122, 0.03473263368315842, 0....","[4.5479352374022196e-05, 0.0003527834615113243...","[0.0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.25]"
2,not teamchristine tana done provoke tweeting s...,1,0,1,0,0,0,0,0,"[1.0, 0.0, 0.7194800908704215, 0.9595351087790...","[0.029097387173396674, 0.015242378810594702, 0...","[0.0005912315808622885, 0.0, 0.000621090457388...","[0.23076923076923078, 0.0, 0.15384615384615385..."
3,great start beginner jump auto trading profita...,0,1,0,0,1,0,0,0,"[0.0, 0.3333333333333333, 0.0, 0.0, 0.66666666...","[0.022416864608076008, 0.04085457271364317, 0....","[4.5479352374022196e-05, 0.0006350102307203836...","[0.0, 0.0625, 0.0, 0.0, 0.0625, 0.0, 0.0, 0.125]"
4,best friend driving first time car terrifying,0,0,0,1,0,0,0,0,"[0.0, 1.0, 0.0, 0.0, 0.8405714285714286, 0.0, ...","[0.030136579572446556, 0.042728635682158914, 0...","[0.00018191740949608878, 0.000564453538418119,...","[0.0, 0.14285714285714285, 0.0, 0.0, 0.1428571..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
881,not make so angry laughing tweet,1,0,1,0,0,0,0,0,"[1.0, 0.0, 0.805372807017544, 0.0, 0.666666666...","[0.041716152019002375, 0.03310844577711144, 0....","[9.095870474804439e-05, 0.0, 8.872720819839404...","[0.16666666666666666, 0.0, 0.16666666666666666..."
882,excited watch stateoforigin tonight come nsw o...,0,1,0,0,1,0,0,0,"[0.0, 1.0, 0.0, 0.8138722709826243, 0.48897256...","[0.016033254156769598, 0.03885557221389305, 0....","[9.095870474804439e-05, 0.000493896846115854, ...","[0.0, 0.18181818181818182, 0.0, 0.181818181818..."
883,blah blah blah kyrie etc leaving boston real n...,1,0,1,0,0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.01232185273159145, 0.0079960019990005, 0.01...","[0.0006367109332363106, 0.0, 0.000621090457388...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07692307..."
884,things learned wise shepherd never trust flock...,0,0,0,0,0,0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.2967032967032967, 0.0, ...","[0.03681710213776722, 0.05147426286856572, 0.0...","[0.000227396761870111, 0.00014111338460452973,...","[0.0, 0.0, 0.0, 0.0, 0.07142857142857142, 0.0,..."


In [24]:
def average_score(df):
    """Add an "average_score" column combining three feature vectors per row.

    For every row, the first eight components of "emotion_score",
    "unigram_freq" and "bigram_freq" are averaged component-wise, the result
    is passed through `normalize_array` and stored as a numpy array.

    Rows are walked by position (not by index label), so the function also
    works on frames whose index is not 0..n-1, e.g. after filtering.

    Args:
        df: Frame holding the three vector columns. It is modified in place.

    Returns:
        The same `df` object, with the new "average_score" column.
    """
    n_emotions = 8
    average_column = []
    for emotion_vec, unigram_vec, bigram_vec in zip(
        df['emotion_score'], df['unigram_freq'], df['bigram_freq']
    ):
        averaged = [
            (emotion_vec[x] + unigram_vec[x] + bigram_vec[x]) / 3
            for x in range(n_emotions)
        ]
        average_column.append(np.array(normalize_array(averaged)))
    df['average_score'] = average_column

    return df

# average_score adds the column in place and returns the same frame, so the
# result is bound back to the frame it came from (the dev result was
# previously bound to a misspelled name, `dev_dev`).
train_df = average_score(train_df)
dev_df = average_score(dev_df)
train_df

Unnamed: 0,processed,anger,anticipation,disgust,fear,joy,sadness,surprise,trust,emotion_score,unigram_freq,bigram_freq,emolex,average_score
0,worry payment problem may never joyce meyer mo...,0,1,0,0,1,0,0,1,"[0.0, 1.0, 0.0, 0.7713618662723492, 0.48444064...","[0.018260095011876483, 0.04185407296351824, 0....","[0.0, 0.0006702885768715163, 0.0, 0.0, 0.00072...","[0.0, 0.4, 0.0, 0.3, 0.2, 0.3, 0.0, 0.1]","[0.003337919861561856, 1.0, 0.0042221967051883..."
1,whatever decide make sure make happy,0,1,0,0,1,0,0,1,"[0.0, 0.8696868008948546, 0.125, 0.0, 1.0, 0.0...","[0.03592636579572447, 0.07083958020989506, 0.0...","[4.5479352374022196e-05, 0.0008114019614760459...","[0.0, 0.16666666666666666, 0.0, 0.0, 0.1666666...","[0.0, 0.8653621347853994, 0.11912321859114586,..."
2,help drowning thoughts,0,0,0,1,0,1,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0031175771971496437, 0.0074962518740629685,...","[0.0, 0.0, 0.0, 8.657259111765215e-05, 0.0, 9....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.3987928282560208, 0.049235993287545896..."
3,help brother drowning,0,0,0,1,0,0,1,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.0056413301662707836, 0.008870564717641178, ...","[0.0, 0.0, 0.0, 8.657259111765215e-05, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.33333333...","[0.00022682348609913191, 0.0034503628649932418..."
4,also help majority nfl coaching inept bill bri...,1,1,1,0,1,0,0,0,"[1.0, 0.0, 0.9595457771469126, 0.0, 0.69720597...","[0.01766627078384798, 0.020239880059970013, 0....","[0.0010460251046025106, 0.0003880618076624567,...","[0.08333333333333333, 0.0, 0.08333333333333333...","[1.0, 0.0034733819602001752, 0.962398427893992..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11111,feel like people life very supportive others n...,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[0.08521377672209025, 0.08320839580209895, 0.0...","[0.0005912315808622885, 0.0009877936922317082,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.023674268148615767, 0.02209223324903694, 0...."
11112,feel loyal sen,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.6251428571428572, 0.84857142...","[0.008165083135391923, 0.013868065967016492, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00020641...","[0.0, 0.0, 0.0, 0.3333333333333333, 0.33333333...","[0.0, 0.005663705298531789, 0.0019181902481721..."
11113,feel complicit supporting owning copy,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.008461995249406176, 0.014492753623188406, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00041283...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2]","[0.0, 0.005978706776847102, 0.0020556652995919..."
11114,really feel like supporting helping,0,0,0,0,0,0,0,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]","[0.05151425178147268, 0.05222388805597201, 0.0...","[0.0004547935237402219, 0.0008466803076271784,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2]","[0.00368037823028463, 0.0047882480012778235, 0..."


In [25]:
check_emotion_scores(train_df['average_score'])
check_emotion_scores(dev_df['average_score'])

All emotion scores are within the range [0, 1]
All emotion scores are within the range [0, 1]


In [26]:
train_df['average_score'].iloc[0]

array([0.00333792, 1.        , 0.0042222 , 0.76656254, 0.4930689 ,
       0.81116933, 0.        , 0.47401813])

In [27]:
# Persist the feature-augmented splits without the index column.
# NOTE(review): the vector columns hold lists / numpy arrays, which to_csv
# writes as their string representation - a reader will need to parse them
# back; confirm the downstream loader handles that.
train_df.to_csv('processed/final-train.csv',index=False)
dev_df.to_csv('processed/final-dev.csv',index=False)