In [1]:
import pandas as pd
from random import randint
import re

import utils

# Emotion Data Prep

Source: [Kaggle Emotions Dataset for NLP](https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp/data)

In [2]:
edf = pd.read_csv('data/emotions/train.txt', sep=';', header=0, names=['text', 'emotion'])
edf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     15999 non-null  object
 1   emotion  15999 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


## Optional: Deal with spellings like "don t"

In [9]:
# Here if we want to use it. Leaving it off for now.
def remove_contracted_terms(raw_text: str) -> str:
    sep_patterns = [
        r"[\s][^\s]+n[\s]+t[\s]",
        r"[\s][Ii][\s]+m[\s]",
        r"you[\s]+re[\s]",
        r"they[\s]+re[\s]",
        r"she[\s]+s[\s]",
        r"[\s]he[\s]+s[\s]",
        r"[\s][^\s]+[\s]+d[\s]",
        r"[\s][^\s]+[\s]+ve[\s]"
    ]
    combo_pattern = re.compile('|'.join(sep_patterns))
    return re.sub(combo_pattern, ' ', raw_text)


In [10]:
edf['contractions_removed'] = edf['text'].apply(remove_contracted_terms)
edf.sample(n=5)

Unnamed: 0,text,emotion,contractions_removed
7680,i dont recall just now yet vividly recall look...,love,i dont recall just now yet vividly recall look...
5747,i feel so embarrassed of myself for even havin...,sadness,i feel so embarrassed of myself for even havin...
4466,i would feel so excited waiting for the mailma...,joy,i would feel so excited waiting for the mailma...
14576,i growled at her i began to feel extremely ann...,anger,i growled at her i began to feel extremely ann...
4460,i am right now i feel amused the sounds i hear...,joy,i am right now i feel amused the sounds i hear...


In [14]:
edf.sample(n=5)

Unnamed: 0,text,emotion,contractions_removed
4198,i feel that learning more about animals and th...,joy,i feel that learning more about animals and th...
9552,i am feeling a bit apprehensive about carrying...,fear,i am feeling a bit apprehensive about carrying...
7054,im feeling very bitter against knight in shini...,anger,im feeling very bitter against knight in shini...
10405,i don t always remember to do this but when i ...,anger,i always remember to do this but when feeling ...
6786,i feel shes friendly and nice,joy,i feel shes friendly and nice


In [15]:
edf.drop(columns=['contractions_removed'], inplace=True)
edf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     15999 non-null  object
 1   emotion  15999 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


## Binary Label: Negative Emotion ("Stress") as 1

In [3]:
edf['emotion'].value_counts()

emotion
joy         5362
sadness     4665
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

Just going to roughly group "sadness", "anger", and "fear" into "stressed"

In [3]:
possible_stress = {'sadness', 'anger', 'fear'}
edf['label'] = edf['emotion'].apply(lambda emotion: 1 if emotion.strip() in possible_stress else 0)
edf.sample(n=3)

Unnamed: 0,text,emotion,label
3293,i feel this way is probably because i am dumb ...,sadness,1
4781,i literally just text tychelle to see if she w...,sadness,1
12567,i saw a gain on the scale this morning which d...,sadness,1


## Text Processing

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse

In [5]:
lemmatizer = WordNetLemmatizer()
stop_words = list(stopwords.words('english'))

In [6]:
print(len(stop_words))
utils.add_stopwords_missing_apostrophe(stop_words)
print(len(stop_words))

179
205


In [7]:
edf['processed_text'] = edf['text'].apply(lambda raw_text: utils.process_text(
    text_chunk=raw_text, stopwords=stop_words, lemmatizer_obj=lemmatizer
))
edf.sample(n=5)

Unnamed: 0,text,emotion,label,processed_text
10071,i just got a whole pile of presents so im feel...,joy,0,got whole pile present im feeling generous
15283,i need to know that it can be fixed and that i...,joy,0,need know fixed going feel gorgeous dress
2997,i see my favorite person suffer and there is n...,sadness,1,see favorite person suffer nothing take pain a...
8754,i have to visit them every after school and la...,sadness,1,visit every school later go tuition time even ...
1780,i feel optimistic that he ll settle in before ...,joy,0,feel optimistic settle long arrived


## Vectorize (Word Embedding)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

### Experiment with Max and Min Document Frequencies for a Reasonable Vocabulary Size

In [24]:
doc_freq_maxes = [0.9, 0.8, 0.7, 0.6, 0.5]

In [27]:
for ceiling in doc_freq_maxes:
    tfidf = TfidfVectorizer(max_df=ceiling)
    tf_df = tfidf.fit_transform(edf['processed_text'])
    tf_df.toarray()
    print(f'Max doc freq: {ceiling}\nTerms: {tf_df.shape[1]}\n')

Max doc freq: 0.9
Terms: 13435

Max doc freq: 0.8
Terms: 13435

Max doc freq: 0.7
Terms: 13435

Max doc freq: 0.6
Terms: 13434

Max doc freq: 0.5
Terms: 13434



In [32]:
doc_freq_mins = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.]

In [33]:
for floor in doc_freq_mins:
    tfidf = TfidfVectorizer(min_df=floor)
    tf_df = tfidf.fit_transform(edf['processed_text'])
    tf_df.toarray()
    print(f'Min doc freq: {floor}\nTerms: {tf_df.shape[1]}\n')

Min doc freq: 0.1
Terms: 4

Min doc freq: 0.01
Terms: 101

Min doc freq: 0.001
Terms: 1363

Min doc freq: 0.0001
Terms: 6496

Min doc freq: 1e-05
Terms: 13435

Min doc freq: 0.0
Terms: 13435



### Actual Embedding

In [11]:
# Try max_df = 0.6 and min_df = 0.0001
MAX_DF = 0.6
MIN_DF = 0.0001

In [12]:
tfidf = TfidfVectorizer(min_df=MIN_DF, max_df=MAX_DF)
tf_df = tfidf.fit_transform(edf['processed_text'])
tf_df.toarray()
tf_df.shape

(15999, 6495)

In [13]:
tf_df = pd.DataFrame(tf_df.toarray(), columns=tfidf.get_feature_names_out())
tf_df.sample(n=3)

Unnamed: 0,aa,abandon,abandoned,abandoning,abandonment,abc,abdomen,abide,ability,abit,...,zach,zealand,zen,zero,zest,zombie,zone,zoom,zooming,zumba
2284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
tf_df.describe()

Unnamed: 0,aa,abandon,abandoned,abandoning,abandonment,abc,abdomen,abide,ability,abit,...,zach,zealand,zen,zero,zest,zombie,zone,zoom,zooming,zumba
count,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,...,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0
mean,8.8e-05,8e-05,0.00025,7.6e-05,9.2e-05,0.000111,0.000112,8.4e-05,0.000784,0.000138,...,7.2e-05,9.6e-05,5.3e-05,0.000137,4.9e-05,0.000127,0.000239,5.1e-05,5.4e-05,6.7e-05
std,0.006486,0.006044,0.012614,0.006844,0.006779,0.007231,0.007563,0.006296,0.016766,0.009148,...,0.006537,0.006198,0.004726,0.007408,0.004508,0.007301,0.00974,0.004746,0.005011,0.004937
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.521035,0.581781,0.818975,0.66778,0.586314,0.610155,0.729999,0.594738,0.541364,0.77192,...,0.676502,0.491465,0.430916,0.594343,0.48805,0.492643,0.513285,0.531097,0.552705,0.384099


In [41]:
# Either run this cell or the next, NOT both - this is for reproducible shuffling of train/test data
random_seed = randint(0, 50)
random_seed

5

In [14]:
# Either run this cell or the above, NOT both
random_seed = 5

In [15]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    tf_df, edf['label'], test_size=0.2, random_state=random_seed, stratify=edf['label']
)
X_train.shape, y_train.shape

((12799, 6495), (12799,))

In [22]:
X_test.shape, y_test.shape

((3200, 6495), (3200,))

# ML Models

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

## Logistic Regression

In [46]:
lr_model = LogisticRegression().fit(X_train, y_train)
lr_model.score(X_test, y_test)

0.95375

## Naive Bayes

In [47]:
nb_model = MultinomialNB().fit(X_train, y_train)
nb_model.score(X_test, y_test)

0.933125

## Random Forest

In [48]:
rf_model = RandomForestClassifier().fit(X_train, y_train)
rf_model.score(X_test, y_test)

0.940625

# DL Models

In [18]:
import tensorflow as tf

2024-03-05 20:27:26.128614: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-05 20:27:26.331778: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-05 20:27:26.331836: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-05 20:27:26.332588: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-05 20:27:26.414733: I tensorflow/core/platform/cpu_feature_g

In [25]:
BATCH_SIZE = 32
EPOCHS = 20

## RNN

Adapted from [TensorFlow Text Classification RNN Tutorial](https://www.tensorflow.org/text/tutorials/text_classification_rnn)

In [19]:
DIM_NUM = 64

In [23]:
rnn_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=X_train.shape[1],
        output_dim=DIM_NUM,
        mask_zero=True), # Masking to handel variable sequence lengths? Is this even necessary?
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(DIM_NUM)),
    tf.keras.layers.Dense(DIM_NUM, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [24]:
rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
single_layer_history = rnn_model.fit(
    X_train, y_train, validation_data=(X_test, y_test),
    epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=callback
)

RuntimeError: Data adapters should be mutually exclusive for handling inputs. Found multiple adapters [<class 'keras.src.engine.data_adapter.TensorLikeDataAdapter'>, <class 'keras.src.engine.data_adapter.GeneratorDataAdapter'>] to handle input: <class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.series.Series'>

## LSTM