# Imports

In [None]:
import pandas as pd
pd.set_option("max_colwidth", 160)

import numpy as np
from sklearn.dummy import DummyClassifier
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

from keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers
from keras.models import Sequential
import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn whitegrid theme for every figure in the notebook.
sns.set_theme(style="whitegrid")
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and the old
# names were later dropped, so use whichever spelling this installation provides.
plt.style.use(
    "seaborn-talk" if "seaborn-talk" in plt.style.available else "seaborn-v0_8-talk"
)

In [None]:
def load_data(file_path, indx = True, indx_col = 0):
  '''Load a table from an Excel (.xlsx) or CSV (.csv) file into a DataFrame.

  Parameters:
  file_path: path to your excel or csv file with data (str or path-like);
  the extension is matched case-insensitively,

  indx: boolean - whether there is index column in your file (usually it is the first column) --> *by default it is set to True

  indx_col: int - if your file has an index column, specify column number here --> *by default it is equal to 0 (first column)

  Returns:
  pandas.DataFrame with the contents of the file.

  Raises:
  ValueError: if the file extension is neither ".xlsx" nor ".csv".
  '''
  # index_col is only forwarded when the caller says the file has an index column.
  read_kwargs = {"index_col": indx_col} if indx else {}
  extension_probe = str(file_path).lower()

  if extension_probe.endswith(".xlsx"):
    return pd.read_excel(file_path, **read_kwargs)
  if extension_probe.endswith(".csv"):
    return pd.read_csv(file_path, **read_kwargs)

  # Previously an unknown extension fell through every branch and crashed on the
  # unbound `data` variable; fail with an explicit message instead.
  raise ValueError(
    "Unsupported file type for %r: expected a .xlsx or .csv file" % (file_path,)
  )

In [None]:
def clean_text2(dataframe, text_column):
  '''Return a copy of `dataframe` with an extra "clean_<text_column>" column.

  Each value of `text_column` is converted to str, lower-cased, and stripped of
  URLs (together with the remainder of the line they appear on), @mentions,
  newlines and #hashtags; surrounding whitespace is trimmed.

  Parameters:
  dataframe: pandas.DataFrame holding the raw texts (it is not modified),

  text_column: str - name of the column with the texts to clean

  Returns:
  pandas.DataFrame - copy of the input with the added "clean_" + text_column column.
  '''
  import re  # local import: `re` is not part of the notebook's import cell

  def _clean(raw):
    text = re.sub(r"(http|https):\/\/([\w\s\d\.]+)(\/?)(.*)", " ", str(raw).lower()) #  urls
    # Require a literal "www." at a word boundary. The previous pattern "(www)."
    # matched "www" followed by ANY character, so a word like "awww" caused the
    # rest of the text to be deleted.
    text = re.sub(r"\bwww\.([\w\s\d\.]+)(\/?)(.*)", " ", text) #  urls
    text = re.sub(r"@[\w\d]+", " ", text)  # mentions
    text = text.replace("\n", " ") # new lines
    text = re.sub(r"\B#\w*[a-zA-Z0-9]+\w*", " ", text) # hashtags
    return text.strip()

  df = dataframe.copy()
  df["clean_" + text_column] = [_clean(raw) for raw in df[text_column]]
  return df

In [None]:
# Install the laserembeddings package and download its pre-trained models
# (shell commands run from the notebook).
!pip install laserembeddings
!python -m laserembeddings download-models

from laserembeddings import Laser
# Shared encoder instance; later cells call laser.embed_sentences(...) on it.
laser = Laser()

# Collecting data

In [None]:
#!pip install datasets
from datasets import list_datasets, load_dataset

# CARER dataset from: https://github.com/dair-ai/emotion_dataset
# paper: https://aclanthology.org/D18-1404/

# Request the 'emotion' dataset with the split expression 'train+test+validation'
# (presumably the three splits concatenated into one dataset — confirm).
dataset = load_dataset('emotion', split='train+test+validation')

In [None]:
# CARER integer class id -> emotion name (position in the list is the id).
map_emo = dict(enumerate(['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']))

# Tabular view of the loaded dataset, with a readable emotion name next to each id.
dataset_df = pd.DataFrame(dataset)
dataset_df["emotion"] = dataset_df["label"].map(map_emo)

dataset_df.head()

Unnamed: 0,text,label,emotion
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,2,love
4,i am feeling grouchy,3,anger


In [None]:
# Class balance of the CARER data, followed by the (rows, columns) of the frame.
print(dataset_df["emotion"].value_counts())
dataset_df.shape

joy         6761
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
Name: emotion, dtype: int64


(20000, 3)

In [None]:
# Persist the CARER frame (text, label, emotion) to an Excel file in the working directory.
dataset_df.to_excel("CARER_emotion_dataset.xlsx")

In [None]:
# GoEmotions dataset from: https://github.com/google-research/google-research/tree/master/goemotions
# paper: https://arxiv.org/abs/2005.00547

# The file is tab-separated and has no header row.
goemo = pd.read_csv(
    "GoEmotions/GoEmotions dataset_train.csv",
    sep="\t",
    header=None,
    engine="python",
)
print(goemo.shape)
goemo.head(2)

In [None]:
# Name the three raw columns, then keep only the text and its label.
goemo = goemo.set_axis(['text', 'label', 'annotator'], axis=1)[['text', 'label']]

In [None]:
# GoEmotions labels are 0-based indices into the dataset's emotions.txt list:
# anger=2, disgust=11, fear=14, joy=17, sadness=25, surprise=26, neutral=27.
# The label ids shared by the datasets in this notebook (map_go_emos) are shifted
# by one, so filtering the raw file with the shared ids selected the wrong classes
# (raw 27 = neutral was kept as 'surprise', raw 15 = gratitude as 'fear', ...).
# Filter on the raw ids and translate them into the shared ids.
goemo_raw_to_shared = {'2': 3, '11': 12, '14': 15, '17': 18, '25': 26, '26': 27, '27': 28}

# Raw single-label ids (as strings) that are kept; multi-label rows are dropped.
emo_num = list(goemo_raw_to_shared)
# Shared label id -> emotion name.
map_go_emos = {3:'anger', 12:'disgust', 15:'fear', 18:'joy', 26:'sadness', 27:'surprise', 28:'neutral'}

# .copy() so the label rewrite below works on an independent frame, not a slice.
goemo = goemo[goemo.label.astype(str).isin(emo_num)].copy()
goemo["label"] = goemo.label.astype(str).map(goemo_raw_to_shared)

In [None]:
# Convert every label value to a Python int.
goemo["label"] = goemo["label"].apply(int)

In [None]:
# Attach the emotion name that corresponds to each numeric label.
goemo["emotion"] = goemo["label"].map(map_go_emos)
goemo.head()

Unnamed: 0,text,label,emotion
0,My favourite food is anything I didn't have to cook myself.,27,surprise
1,"Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead",27,surprise
4,Dirty Southern Wankers,3,anger
5,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe PlAyOfFs! Dumbass Broncos fans circa December 2015.,26,sadness
6,Yes I heard abt the f bombs! That has to be why. Thanks for your reply:) until then hubby and I will anxiously wait 😝,15,fear


In [None]:
# Class balance of the filtered GoEmotions data, followed by its (rows, columns).
print(goemo["emotion"].value_counts())
goemo.shape

surprise    12823
fear         1857
anger        1451
joy          1427
sadness       720
disgust       203
Name: emotion, dtype: int64


(18481, 3)

In [None]:
# Persist the filtered GoEmotions frame to an Excel file in the working directory.
goemo.to_excel("GOemo_emotion_dataset.xlsx")

In [None]:
# dataset from SemEval 2018: E-c  and EI-reg  subtasks datasets
# paper: https://aclanthology.org/S18-1001.pdf

# Tab-separated file; the first row holds the column names.
sem = pd.read_csv(
    "Sem_Eval2018_categorization/2018_train.txt",
    sep="\t",
    engine="python",
)
sem.head(1)

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may never have'. Joyce Meyer. #motivation #leadership #worry,0,1,0,0,0,0,1,0,0,0,1


In [None]:
# Keep only the tweets whose 'love' flag is 0.
sem = sem[sem["love"].eq(0)]
sem.shape

In [None]:
# True for rows where at least one column from position 2 onward is set
# (in the frame displayed above these are the per-emotion flags after ID and Tweet).
has_emotion = sem.iloc[:, 2:].any(axis=1)

# .copy() makes both subsets independent frames, so adding the 'neutral' column
# below does not write into a slice of `sem` (avoids SettingWithCopyWarning).
sem_neu = sem[~has_emotion].copy()
sem_emo = sem[has_emotion].copy()

# Rows without any flag are marked neutral; all other rows are not.
sem_emo['neutral'] = 0
sem_neu['neutral'] = 1

# Sanity check: the two subsets together should cover every row of `sem`.
sem_emo.shape, sem_neu.shape, sem.shape, sem_emo.shape[0] + sem_neu.shape[0]

In [None]:
# Stack the emotional and neutral subsets, keeping the tweet text and the seven
# emotion flags used in this notebook.
sem1 = pd.concat([sem_emo, sem_neu], axis=0)[
    ['Tweet', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
]
print(sem1.shape)
sem1.head(3)

(6138, 8)


Unnamed: 0,Tweet,anger,disgust,fear,joy,sadness,surprise,neutral
2,"@Max_Kellerman it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS",1,1,0,1,0,0,0
3,Accept the challenges so that you can literally even feel the exhilaration of victory.' -- George S. Patton 🐶,0,0,0,1,0,0,0
4,My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs,1,1,0,0,0,0,0


In [None]:
# Wide -> long: one row per (tweet, emotion) pair, keeping only the flags set to 1.
sem_melt = sem1.melt(id_vars="Tweet", var_name="emotion", value_name='value')
sem_melt = sem_melt[sem_melt["value"].eq(1)]
print(sem_melt.shape)
sem_melt.head()

(10943, 3)


Unnamed: 0,Tweet,emotion,value
0,"@Max_Kellerman it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS",anger,1
2,My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs,anger,1
4,"Rooneys fucking untouchable isn't he? Been fucking dreadful again, depay has looked decent(ish)tonight",anger,1
6,@BossUpJaee but your pussy was weak from what I heard so stfu up to me bitch . You got to threaten him that your pregnant .,anger,1
8,S/O to the girl that just hit my car...not only did she get lucky w/ no scratch but also from being spared the wrath of sleep deprived Kait🙃,anger,1


In [None]:
# Number of (tweet, emotion) rows per emotion.
sem_melt["emotion"].value_counts()

disgust     2587
anger       2527
sadness     1960
joy         1822
fear        1222
neutral      492
surprise     333
Name: emotion, dtype: int64

In [None]:
# Emotion name -> numeric label id used for the `label` column.
map_emo_sem = {
    'anger': 3,
    'disgust': 12,
    'fear': 15,
    'joy': 18,
    'neutral': 28,
    'sadness': 26,
    'surprise': 27,
}

# Drop the last column (the melted 'value' flag) and add the numeric label.
sem_melt = sem_melt.iloc[:, :-1].assign(
    label=lambda frame: frame["emotion"].map(map_emo_sem)
)
sem_melt.head()

Unnamed: 0,Tweet,emotion,label
0,"@Max_Kellerman it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS",anger,3
2,My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs,anger,3
4,"Rooneys fucking untouchable isn't he? Been fucking dreadful again, depay has looked decent(ish)tonight",anger,3
6,@BossUpJaee but your pussy was weak from what I heard so stfu up to me bitch . You got to threaten him that your pregnant .,anger,3
8,S/O to the girl that just hit my car...not only did she get lucky w/ no scratch but also from being spared the wrath of sleep deprived Kait🙃,anger,3


In [None]:
#sem_merge = sem_melt.copy()
# Append the current melted frame to the accumulated `sem_merge1`.
# NOTE(review): `sem_merge1` is read here before any visible cell defines it, so this
# cell fails on a fresh kernel and grows the frame every time it is re-run —
# presumably it was executed once per SemEval split; confirm and make the lineage explicit.
sem_merge1 = pd.concat([sem_merge1, sem_melt], axis=0)
sem_merge1.shape

(17124, 3)

In [None]:
# Standardise the column names and renumber the rows from 0.
sem_merge1.columns = ['text', 'emotion', 'label']
sem_merge1 = sem_merge1.reset_index(drop=True)
sem_merge1.head(1)

Unnamed: 0,text,emotion,label
0,"@RanaAyyub @rajnathsingh Oh, hidden revenge and anger...I rememberthe time,she rebutted you.",anger,3


In [None]:
# Persist the merged SemEval categorisation data to an Excel file in the working directory.
sem_merge1.to_excel("SEMEVAL18_emo_cat_dataset.xlsx")

In [None]:
# reg task sets
# Tab-separated file; the first row holds the column names. Rebinds `sem`.
sem = pd.read_csv(
    "Sem_Eval2018_categorization/semeval_train_reg_4emot.txt",
    sep="\t",
    engine="python",
)
print(sem.shape)
sem.head(3)

(7102, 4)


Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Score
0,2017-En-10264,@xandraaa5 @amayaallyn6 shut up hashtags are cool #offended,anger,0.562
1,2017-En-10072,it makes me so fucking irate jesus. nobody is calling ppl who like hajime abusive stop with the strawmen lmao,anger,0.75
2,2017-En-11383,Lol Adam the Bull with his fake outrage...,anger,0.417


In [None]:
# Number of rows per value of the "Affect Dimension" column.
sem["Affect Dimension"].value_counts()

fear       2252
anger      1701
joy        1616
sadness    1533
Name: Affect Dimension, dtype: int64

In [None]:
# Drop the first and last columns, rename the remaining two, and add the numeric
# label that corresponds to each emotion name.
sem = sem.iloc[:, 1:-1].set_axis(["text", "emotion"], axis=1)
sem = sem.assign(label=sem["emotion"].map(map_emo_sem))
sem.head()

Unnamed: 0,text,emotion,label
0,@xandraaa5 @amayaallyn6 shut up hashtags are cool #offended,anger,3
1,it makes me so fucking irate jesus. nobody is calling ppl who like hajime abusive stop with the strawmen lmao,anger,3
2,Lol Adam the Bull with his fake outrage...,anger,3
3,@THATSSHAWTYLO passed away early this morning in a fast and furious styled car crash as he was leaving an ATL strip club. That's rough stuff,anger,3
4,@Kristiann1125 lol wow i was gonna say really?! haha have you seen chris or nah? you dont even snap me anymore dude!,anger,3


In [None]:
# Load a previously saved Excel file, using its first column as the index.
# NOTE(review): the path starts at the filesystem root ("/"), and a later cell writes
# `sem_merge2` back to this same file — confirm the intended location and that
# re-running the notebook does not keep re-appending the same rows.
sem_reg_test = load_data("/SEMEVAL18_emo_reg_dataset.xlsx")
print(sem_reg_test.shape)
sem_reg_test.head(1)

(4068, 3)


Unnamed: 0,text,emotion,label
0,@PageShhh1 I know you mean well but I'm offended. Prick.,anger,3


In [None]:
#sem_merge = sem.copy()
# Stack the accumulated `sem_merge` frame with the current `sem` frame.
# NOTE(review): `sem_merge` is only assigned in commented-out code, so this cell fails
# on a fresh kernel — confirm which frame it is meant to hold.
sem_merge1 = pd.concat([sem_merge, sem], axis=0)
sem_merge1.shape

(8566, 3)

In [None]:
# Standardise the column names of the merged frame.
sem_merge1.columns = ['text', 'emotion', 'label']
sem_merge1.head(1)

Unnamed: 0,text,emotion,label
0,'we need to do something. something must be done!!!!!'\n\nyour anxiety is amusing. nothing will be done. despair.,anger,3


In [None]:
# Put the loaded frame in front of the merged frame and renumber the rows from 0.
sem_merge2 = pd.concat([sem_reg_test, sem_merge1], axis=0).reset_index(drop=True)
print(sem_merge2.shape)
sem_merge2.head(1)

(12634, 3)


Unnamed: 0,text,emotion,label
0,@PageShhh1 I know you mean well but I'm offended. Prick.,anger,3


In [None]:
# Persist the combined frame to an Excel file.
# NOTE(review): this is the same root-level path that an earlier cell loads
# `sem_reg_test` from, so the input file is overwritten — confirm this is intended.
sem_merge2.to_excel("/SEMEVAL18_emo_reg_dataset.xlsx")

In [None]:
# Put the GoEmotions columns in the text / emotion / label order.
goemo = goemo[['text', 'emotion', 'label']]

In [None]:
# Stack `sem_melt_all` on top of the current `sem` frame.
# NOTE(review): `sem_melt_all` is not defined in any visible cell, so this cell fails on
# a fresh kernel — presumably it holds the melted rows of all SemEval splits; confirm.
semeval_18 = pd.concat([sem_melt_all, sem], axis=0)
print(semeval_18.shape)
semeval_18.head()

(21251, 3)


Unnamed: 0,text,emotion,label
0,@Adnan__786__ @AsYouNotWish Dont worry Indian army is on its ways to dispatch all Terrorists to Hell,anger,3
2,I blew that opportunity -__- #mad,anger,3
5,"@POLITICOEurope Interesting choice of words... Are you confirming that governments fund #terrorism? Bit of an open door, but still...",anger,3
13,"Bitter afternoon,no sweetness at all #Lenovo",anger,3
17,i am revolting.,anger,3


In [None]:
# SemEval rows first, GoEmotions rows after them (row-wise concatenation).
dataset1 = pd.concat([semeval_18, goemo])
print(dataset1.shape)
dataset1.head()

(39732, 3)


Unnamed: 0,text,emotion,label
0,@Adnan__786__ @AsYouNotWish Dont worry Indian army is on its ways to dispatch all Terrorists to Hell,anger,3
2,I blew that opportunity -__- #mad,anger,3
5,"@POLITICOEurope Interesting choice of words... Are you confirming that governments fund #terrorism? Bit of an open door, but still...",anger,3
13,"Bitter afternoon,no sweetness at all #Lenovo",anger,3
17,i am revolting.,anger,3


In [None]:
# CARER data: drop the 'love' class. .copy() gives an independent frame, so the
# `label` column can be reassigned on it later without a SettingWithCopyWarning.
dataset_df = dataset_df[dataset_df.emotion != 'love'].copy()

In [None]:
# Display the emotion name -> numeric label mapping.
map_emo_sem

{'anger': 3,
 'disgust': 12,
 'fear': 15,
 'joy': 18,
 'neutral': 28,
 'sadness': 26,
 'surprise': 27}

In [None]:
# Overwrite the CARER label ids with the ids from map_emo_sem, looked up by emotion name.
dataset_df["label"] = dataset_df["emotion"].map(map_emo_sem)
dataset_df.head()

Unnamed: 0,text,emotion,label
0,i didnt feel humiliated,sadness,26
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness,26
2,im grabbing a minute to post i feel greedy wrong,anger,3
4,i am feeling grouchy,anger,3
5,ive been feeling a little burdened lately wasnt sure why that was,sadness,26


In [None]:
# Append the CARER rows to the SemEval + GoEmotions rows (row-wise concatenation).
all_data = pd.concat([dataset1, dataset_df])
print(all_data.shape)
all_data.head()

(58091, 3)


Unnamed: 0,text,emotion,label
0,@Adnan__786__ @AsYouNotWish Dont worry Indian army is on its ways to dispatch all Terrorists to Hell,anger,3
2,I blew that opportunity -__- #mad,anger,3
5,"@POLITICOEurope Interesting choice of words... Are you confirming that governments fund #terrorism? Bit of an open door, but still...",anger,3
13,"Bitter afternoon,no sweetness at all #Lenovo",anger,3
17,i am revolting.,anger,3


In [None]:
##################    emotions we have   ##################

# numeric id stored in the `label` column -> emotion name
map_go_emos = {
    3: 'anger',
    12: 'disgust',
    15: 'fear',
    18: 'joy',
    26: 'sadness',
    27: 'surprise',
    28: 'neutral',
}


# new mapping --> our custom: consecutive ids 0-6, given by position in the list
map_new_labels = {
    emotion_name: new_id
    for new_id, emotion_name in enumerate(
        ['neutral', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
    )
}

In [None]:
# Custom 0-6 class id for every row, looked up by emotion name.
all_data["new_label"] = all_data["emotion"].map(map_new_labels)
all_data.tail(3)

Unnamed: 0,text,emotion,label,new_label
19997,i feel its important to share this info for those that experience the same thing,joy,18,4
19998,i truly feel that if you are passionate enough about something and stay true to yourself you will succeed,joy,18,4
19999,i feel like i just wanna buy any cute make up i see online or even the one,joy,18,4


In [None]:
# Class balance of the combined dataset.
all_data["emotion"].value_counts()

(66598, 3)


joy         14253
surprise    14044
sadness     12561
anger       11174
fear         9672
disgust      4194
neutral       700
Name: emotion, dtype: int64

In [None]:
# (Re)compute the custom 0-6 class id from the emotion name.
all_data["new_label"] = all_data["emotion"].map(map_new_labels)

In [None]:
# Add a "clean_text" column with the cleaned version of every text
# (clean_text2 returns a copy, which replaces `all_data`).
all_data = clean_text2(all_data, "text")
all_data.tail(3)

Unnamed: 0,text,emotion,label,new_label,clean_text
66595,#vinb I'm alot more interested in hearing bout differences between parties political ideologies. Parties murky origins r irrefutable #vinb,sadness,26,5,i'm alot more interested in hearing bout differences between parties political ideologies. parties murky origins r irrefutable
66596,Overwhelming sadness. This too shall pass. #lost #lonley #startingover,sadness,26,5,overwhelming sadness. this too shall pass.
66597,Idk why people be glorifying depression. I wouldn't wish real depression upon my worst enemy. Shits the worst stop acting like it's cool,sadness,26,5,idk why people be glorifying depression. i wouldn't wish real depression upon my worst enemy. shits the worst stop acting like it's cool


In [None]:
# Class balance expressed in the custom 0-6 ids.
all_data["new_label"].value_counts()

4    14253
6    14044
5    12561
1    11174
3     9672
2     4194
0      700
Name: new_label, dtype: int64

In [None]:
# Persist the combined, cleaned dataset to an Excel file.
# NOTE(review): the path starts at the filesystem root ("/") — confirm the intended location.
all_data.to_excel("/NEW_CROSS_lang_emotion_dataset_all.xlsx")

# **Training a cross-lingual model with LASER embeddings**

In [None]:
# Remove, in place, rows that repeat an earlier (text, label) pair.
# NOTE(review): `data_emotions` is not defined in any visible cell, and the following
# cell reads from `data` instead — confirm which frame is meant and where it is loaded.
data_emotions.drop_duplicates(["text", "label"], inplace=True)

In [None]:
# Inputs: the cleaned texts as plain strings. Targets: the `label` column.
# NOTE(review): `data` is not defined in any visible cell — presumably the combined
# dataset prepared above; confirm where it is loaded.
# NOTE(review): LabelEncoder numbers the classes by sorted label value. The prediction
# section decodes class indices with {0: 'neutral', 1: 'anger', ...}, which only matches
# if `label` holds the custom 0-6 ids rather than the 3/12/15/... ids — confirm.
x = data["clean_text"].apply(str).values
y = data["label"].values

# Map the labels to consecutive integers, then one-hot encode them.
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)
dummy_y = np_utils.to_categorical(encoded_Y)

In [None]:
# Shapes of the text array and of the one-hot target matrix.
x.shape, dummy_y.shape

((58091,), (58091, 7))

In [None]:
# Hold out 15% of the samples for testing, then 10% of the remaining samples for
# validation; both splits use the same fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, dummy_y, test_size=0.15, random_state=3
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.1, random_state=3
)
print(x_train.shape, x_valid.shape, x_test.shape)
x_train

(44439,) (4938,) (8714,)


array(["i think they're getting desperate from all of us cord cutters converting everyone to hulu.",
       "ball watching &amp; rojo'd header was equally dreadful!!",
       'this is more like yesyesyesyesmeh.', ..., 'luv ya too buddy 😝',
       'i really feel like i am very eager to destroy someones life and yet i always want to help everyone around me',
       'u got to b kidding me. anu from your firm responded  when i sent the contact details.'],
      dtype=object)

In [None]:
# Replace the raw English texts of each split with their LASER sentence embeddings
# (embedded in the order train, test, validation).
x_train, x_test, x_valid = (
    laser.embed_sentences(split_texts, lang='en')
    for split_texts in (x_train, x_test, x_valid)
)

In [None]:
# Chance-level baseline: predicts classes at random following the training class
# distribution.
dummy_cls_frq = DummyClassifier(strategy = "stratified", random_state = 3)
# Fit and score on integer class ids, not on the one-hot matrix: with 2-D one-hot
# targets every column is sampled independently and the score becomes multilabel
# subset accuracy, which understates the baseline.
dummy_cls_frq.fit(x_train, np.argmax(y_train, axis=1))
# Built-in round(): the score may be a plain Python float, which has no .round() method.
dummy_score_frq = round(float(dummy_cls_frq.score(x_test, np.argmax(y_test, axis=1))), 2)
dummy_score_frq

0.06

In [None]:
# Early-stopping callback that watches the validation loss with a patience of 5 epochs.
stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0)

In [None]:
# Feed-forward classifier on top of the sentence embeddings: four ReLU layers of
# decreasing width, dropout, and a 7-way softmax output.
model1 = Sequential()
model1.add(Dense(1024, input_shape=(x_train.shape[1],), activation='relu'))
for hidden_units in (512, 256, 128):
    model1.add(Dense(hidden_units, activation='relu'))
model1.add(layers.Dropout(0.4))
model1.add(Dense(7, activation='softmax'))

model1.compile(
    loss='categorical_crossentropy',
    optimizer=optimizers.Adam(learning_rate=15e-5),
    metrics=['accuracy', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()],
)

# Train for at most 50 epochs; the early-stopping callback may end training sooner.
history1 = model1.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=64,
    validation_data=(x_valid, y_valid),
    verbose=1,
    callbacks=[stopping],
)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model1.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 1024)              1049600   
                                                                 
 dense_11 (Dense)            (None, 512)               524800    
                                                                 
 dense_12 (Dense)            (None, 256)               131328    
                                                                 
 dense_13 (Dense)            (None, 128)               32896     
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_14 (Dense)            (None, 7)                 903       
                                                                 
Total params: 1,739,527
Trainable params: 1,739,527
No

In [None]:
# Class probabilities for the test set. `model1` is the model this notebook builds,
# trains and saves; the previously referenced `model10` is not defined anywhere here.
results10 = model1.predict(x = x_test, batch_size=64, verbose=0)
predictions10 = np.argmax(results10, axis=1)

# Convert the one-hot test targets back to class indices for the report.
y_test_labels = np.argmax(y_test, axis=1)
print(metrics.classification_report(y_test_labels, predictions10))
print("\n\n")
# (micro-F1, macro-F1). Built-in round() is used because f1_score may return a plain
# Python float, which has no .round() method.
(
    round(float(metrics.f1_score(y_test_labels, predictions10, average="micro")), 3),
    round(float(metrics.f1_score(y_test_labels, predictions10, average="macro")), 3),
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        92
           1       0.40      0.56      0.46      1333
           2       0.21      0.03      0.05       632
           3       0.59      0.48      0.53      1016
           4       0.65      0.78      0.71      1995
           5       0.52      0.49      0.50      1549
           6       0.75      0.74      0.74      2097

    accuracy                           0.59      8714
   macro avg       0.44      0.44      0.43      8714
weighted avg       0.57      0.59      0.57      8714






(0.586, 0.427)

In [None]:
# Share of correct predictions when label 0 is left out:
# 8622 = 8714 test samples - 92 samples with label 0 (see the report above).
# NOTE(review): the summed terms are presumably the per-class correct-prediction counts
# for labels 1-6, copied by hand from a confusion matrix that is not shown — confirm.
(746 + 19 + 488 + 1556 + 759 + 1552) / 8622 

# micro-f1 w/o label 0   macro-f1 0,498 w/o 0 label

0.5938297378798423

In [None]:
# Save the trained model to an HDF5 file in the working directory.
model1.save("PaREMO_model.h5")

# **Predict expressed emotions**

In [None]:
# Reload the saved model, so this section can run without retraining.
model1 = keras.models.load_model("PaREMO_model.h5")

In [None]:
# Load the tweets to annotate, using the file's first column as the index.
df = load_data("tweet_Czerwiec.xlsx")
print(df.shape)
df.head(1)

In [None]:
# Embed the `clean_Tekst` column with LASER, declaring the language as Polish ('pl').
# NOTE(review): assumes the loaded file already contains a `clean_Tekst` column — no
# visible cell creates it; confirm.
x_df = laser.embed_sentences(df.clean_Tekst.values, lang='pl')
x_df.shape

(35360, 1024)

In [None]:
# Per-class probabilities for every tweet, reduced to the index of the most likely class.
class_probabilities = model1.predict(x=x_df, batch_size=128, verbose=0)
pred_labels = np.argmax(class_probabilities, axis=1)
pred_labels.shape

(35360,)

In [None]:
# Class index -> emotion name (position in the list is the index).
map_new_labels_reversed = dict(
    enumerate(['neutral', 'anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'])
)

In [None]:
# Store the predicted class index and the matching emotion name for every tweet.
df["predicted_emotion"] = pred_labels
df["predicted_emotion_labels"] = df["predicted_emotion"].map(map_new_labels_reversed)

In [None]:
# Persist the tweets together with their predicted emotions to an Excel file.
df.to_excel("tweet_Czerwiec_expressed_emotions.xlsx")