In [1]:
import gc
import pandas as pd
from random import randint
import re

import utils

# Emotion Data Prep

Source: [Kaggle Emotions Dataset for NLP](https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp/data)

## Shuffle and Set Aside 20% of Training Data as New Validation Set

In [37]:
edf = pd.read_csv('data/emotions/train.txt', sep=';', header=0, names=['text', 'emotion'])
edf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     15999 non-null  object
 1   emotion  15999 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


In [None]:
# Either run this cell or the next, NOT both - this is for reproducible shuffling of train/test data
random_seed = randint(0, 50)
random_seed

In [14]:
# Either run this cell or the above, NOT both - using same random seed as sklearn train-test thouogh results probably not the same
random_seed = 5

In [18]:
edf.head(2)

Unnamed: 0,text,emotion
0,i can go from feeling so hopeless to so damned...,sadness
1,im grabbing a minute to post i feel greedy wrong,anger


In [38]:
edf = edf.sample(frac=1, random_state=random_seed)
edf.head(3)

Unnamed: 0,text,emotion
1801,i still feel completely accepted,joy
11253,i still feel guilty to this day for taking a spot,sadness
13867,i am however caught by the feeling that i miss...,sadness


In [27]:
val_size = round(len(edf) * 0.2)

In [39]:
validation_df = edf[:val_size].copy()
validation_df.head(3)

Unnamed: 0,text,emotion
1801,i still feel completely accepted,joy
11253,i still feel guilty to this day for taking a spot,sadness
13867,i am however caught by the feeling that i miss...,sadness


In [40]:
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3200 entries, 1801 to 6589
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     3200 non-null   object
 1   emotion  3200 non-null   object
dtypes: object(2)
memory usage: 75.0+ KB


In [41]:
validation_df.tail(2)

Unnamed: 0,text,emotion
5880,i actually read it im left feeling disillusion...,sadness
6589,i feel a strong sense of relief,joy


In [42]:
train_df = edf[val_size:].copy()
train_df.head(2)

Unnamed: 0,text,emotion
3980,i feel drastically inadequate for the needs i ...,sadness
6351,i dont want to put to much pressure on myself ...,surprise


In [43]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12799 entries, 3980 to 2915
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     12799 non-null  object
 1   emotion  12799 non-null  object
dtypes: object(2)
memory usage: 300.0+ KB


In [44]:
len(validation_df) + len(train_df) == len(edf)

True

In [45]:
# DO NOT run this cell if doing the ML algorithms - keep edf since sklearn takes care of train-vavlidation split
del edf 

gc.collect()

105

## Combine Provided Validation and Test Sets into New Test Set with Almost 4K Elements

In [3]:
vdf = pd.read_csv('data/emotions/val.txt', sep=';', header=0, names=['text', 'emotion'])
vdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     1999 non-null   object
 1   emotion  1999 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [10]:
tdf = pd.read_csv('data/emotions/test.txt', sep=';', header=0, names=['text', 'emotion'])
tdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     1999 non-null   object
 1   emotion  1999 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [11]:
tdf = pd.concat([tdf, vdf]).reset_index()
tdf.drop(columns=['index'], inplace=True)
tdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3998 entries, 0 to 3997
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   index    3998 non-null   int64 
 1   text     3998 non-null   object
 2   emotion  3998 non-null   object
dtypes: int64(1), object(2)
memory usage: 93.8+ KB


In [46]:
tdf.tail(3)

Unnamed: 0,text,emotion,label
3995,i feel its important to share this info for th...,joy,0
3996,i truly feel that if you are passionate enough...,joy,0
3997,i feel like i just wanna buy any cute make up ...,joy,0


In [34]:
del vdf
gc.collect()

0

## Optional: Deal with spellings like "don t"

In [9]:
# Here if we want to use it. Leaving it off for now.
def remove_contracted_terms(raw_text: str) -> str:
    sep_patterns = [
        r"[\s][^\s]+n[\s]+t[\s]",
        r"[\s][Ii][\s]+m[\s]",
        r"you[\s]+re[\s]",
        r"they[\s]+re[\s]",
        r"she[\s]+s[\s]",
        r"[\s]he[\s]+s[\s]",
        r"[\s][^\s]+[\s]+d[\s]",
        r"[\s][^\s]+[\s]+ve[\s]"
    ]
    combo_pattern = re.compile('|'.join(sep_patterns))
    return re.sub(combo_pattern, ' ', raw_text)


In [10]:
edf['contractions_removed'] = edf['text'].apply(remove_contracted_terms)
edf.sample(n=5)

Unnamed: 0,text,emotion,contractions_removed
7680,i dont recall just now yet vividly recall look...,love,i dont recall just now yet vividly recall look...
5747,i feel so embarrassed of myself for even havin...,sadness,i feel so embarrassed of myself for even havin...
4466,i would feel so excited waiting for the mailma...,joy,i would feel so excited waiting for the mailma...
14576,i growled at her i began to feel extremely ann...,anger,i growled at her i began to feel extremely ann...
4460,i am right now i feel amused the sounds i hear...,joy,i am right now i feel amused the sounds i hear...


In [14]:
edf.sample(n=5)

Unnamed: 0,text,emotion,contractions_removed
4198,i feel that learning more about animals and th...,joy,i feel that learning more about animals and th...
9552,i am feeling a bit apprehensive about carrying...,fear,i am feeling a bit apprehensive about carrying...
7054,im feeling very bitter against knight in shini...,anger,im feeling very bitter against knight in shini...
10405,i don t always remember to do this but when i ...,anger,i always remember to do this but when feeling ...
6786,i feel shes friendly and nice,joy,i feel shes friendly and nice


In [15]:
edf.drop(columns=['contractions_removed'], inplace=True)
edf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     15999 non-null  object
 1   emotion  15999 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


## Binary Label: Negative Emotion ("Stress") as 1

In [3]:
edf['emotion'].value_counts()

emotion
joy         5362
sadness     4665
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [4]:
vdf['emotion'].value_counts()

emotion
joy         704
sadness     549
anger       275
fear        212
love        178
surprise     81
Name: count, dtype: int64

Just going to roughly group "sadness", "anger", and "fear" into "stressed"

In [35]:
possible_stress = {'sadness', 'anger', 'fear'}
datasets = [train_df, validation_df, tdf]

In [47]:
for df in datasets:
    df['label'] = df['emotion'].apply(lambda emotion: 1 if emotion.strip() in possible_stress else 0)
    print(df.head(2))

                                                   text   emotion  label
3980  i feel drastically inadequate for the needs i ...   sadness      1
6351  i dont want to put to much pressure on myself ...  surprise      0
                                                    text  emotion  label
1801                    i still feel completely accepted      joy      0
11253  i still feel guilty to this day for taking a spot  sadness      1


## Text Processing

In [48]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from urllib.parse import urlparse

In [49]:
lemmatizer = WordNetLemmatizer()
stop_words = list(stopwords.words('english'))

In [50]:
print(len(stop_words))
utils.add_stopwords_missing_apostrophe(stop_words)
print(len(stop_words))

179
205


In [52]:
train_df['processed_text'] = train_df['text'].apply(lambda raw_text: utils.process_text(
    text_chunk=raw_text, stopwords=stop_words, lemmatizer_obj=lemmatizer
))

train_df.sample(n=3)

Unnamed: 0,text,emotion,label,processed_text
639,i feel so carefree nowwwwww,joy,0,feel carefree nowwwwww
15963,i was more annoyed with the info dump because ...,anger,1,annoyed info dump made book long feel miss som...
12712,i lied about my feelings and thats why im now ...,sadness,1,lied feeling thats im hated one person thought...


In [53]:
validation_df['processed_text'] = validation_df['text'].apply(lambda raw_text: utils.process_text(
    text_chunk=raw_text, stopwords=stop_words, lemmatizer_obj=lemmatizer
))
validation_df.sample(n=3)

Unnamed: 0,text,emotion,label,processed_text
7911,i feel intimidated by your question,fear,1,feel intimidated question
8174,i havent gotten them yet because i still resen...,anger,1,gotten yet still resent paying dollar procedur...
15071,im feeling generous this morning i will share ...,love,0,im feeling generous morning share


In [54]:
tdf['processed_text'] = tdf['text'].apply(lambda raw_text: utils.process_text(
    text_chunk=raw_text, stopwords=stop_words, lemmatizer_obj=lemmatizer
))
tdf.sample(n=3)

Unnamed: 0,text,emotion,label,processed_text
220,i feel alarmed,fear,1,feel alarmed
1824,i still feel like there are more than enough t...,joy,0,still feel like enough keep entertained still ...
3099,im feeling good i increase,joy,0,im feeling good increase


In [66]:
train_df['post_len'] = train_df['processed_text'].apply(lambda text: len(text))
train_df['post_len'].describe()

count    12799.000000
mean        60.596844
std         34.365817
min          4.000000
25%         34.000000
50%         54.000000
75%         79.000000
max        229.000000
Name: post_len, dtype: float64

# ML Models

In [33]:
edf['processed_text'] = edf['text'].apply(lambda raw_text: utils.process_text(
    text_chunk=raw_text, stopwords=stop_words, lemmatizer_obj=lemmatizer
))
edf.sample(n=5)

Unnamed: 0,text,emotion,label,processed_text
9496,ive also begun going through a round of self e...,joy,0,ive also begun going round self edits first ep...
8745,i cant be bothered as coming and doing is a pl...,sadness,1,cant bothered coming pleasant pas time followe...
7461,im not sure if all my stuff with andy as in me...,anger,1,im sure stuff andy feeling annoyed messed chem...
10906,i would feel disheartened so i would then go a...,sadness,1,would feel disheartened would go cardio anothe...
116,i feel all of this just from her eyes not from...,sadness,1,feel eye touch word eye know assuredly return ...


## Vectorize (Word Embedding)

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

### Experiment with Max and Min Document Frequencies for a Reasonable Vocabulary Size

In [24]:
doc_freq_maxes = [0.9, 0.8, 0.7, 0.6, 0.5]

In [27]:
for ceiling in doc_freq_maxes:
    tfidf = TfidfVectorizer(max_df=ceiling)
    tf_df = tfidf.fit_transform(edf['processed_text'])
    tf_df.toarray()
    print(f'Max doc freq: {ceiling}\nTerms: {tf_df.shape[1]}\n')

Max doc freq: 0.9
Terms: 13435

Max doc freq: 0.8
Terms: 13435

Max doc freq: 0.7
Terms: 13435

Max doc freq: 0.6
Terms: 13434

Max doc freq: 0.5
Terms: 13434



In [32]:
doc_freq_mins = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.]

In [33]:
for floor in doc_freq_mins:
    tfidf = TfidfVectorizer(min_df=floor)
    tf_df = tfidf.fit_transform(edf['processed_text'])
    tf_df.toarray()
    print(f'Min doc freq: {floor}\nTerms: {tf_df.shape[1]}\n')

Min doc freq: 0.1
Terms: 4

Min doc freq: 0.01
Terms: 101

Min doc freq: 0.001
Terms: 1363

Min doc freq: 0.0001
Terms: 6496

Min doc freq: 1e-05
Terms: 13435

Min doc freq: 0.0
Terms: 13435



### Actual Embedding

In [35]:
# Try max_df = 0.6 and min_df = 0.0001
MAX_DF = 0.6
MIN_DF = 0.0001

In [36]:
tfidf = TfidfVectorizer(min_df=MIN_DF, max_df=MAX_DF)
tf_df = tfidf.fit_transform(edf['processed_text'])
tf_df.toarray()
tf_df.shape

(15999, 6495)

In [48]:
tf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15999 entries, 0 to 15998
Columns: 6495 entries, aa to zumba
dtypes: float64(6495)
memory usage: 792.8 MB


In [37]:
tf_df = pd.DataFrame(tf_df.toarray(), columns=tfidf.get_feature_names_out())
tf_df.sample(n=3)

Unnamed: 0,aa,abandon,abandoned,abandoning,abandonment,abc,abdomen,abide,ability,abit,...,zach,zealand,zen,zero,zest,zombie,zone,zoom,zooming,zumba
10832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
tf_df.describe()

Unnamed: 0,aa,abandon,abandoned,abandoning,abandonment,abc,abdomen,abide,ability,abit,...,zach,zealand,zen,zero,zest,zombie,zone,zoom,zooming,zumba
count,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,...,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0,15999.0
mean,8.8e-05,8e-05,0.00025,7.6e-05,9.2e-05,0.000111,0.000112,8.4e-05,0.000784,0.000138,...,7.2e-05,9.6e-05,5.3e-05,0.000137,4.9e-05,0.000127,0.000239,5.1e-05,5.4e-05,6.7e-05
std,0.006486,0.006044,0.012614,0.006844,0.006779,0.007231,0.007563,0.006296,0.016766,0.009148,...,0.006537,0.006198,0.004726,0.007408,0.004508,0.007301,0.00974,0.004746,0.005011,0.004937
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.521035,0.581781,0.818975,0.66778,0.586314,0.610155,0.729999,0.594738,0.541364,0.77192,...,0.676502,0.491465,0.430916,0.594343,0.48805,0.492643,0.513285,0.531097,0.552705,0.384099


## Train/Test Split

In [41]:
# Either run this cell or the next, NOT both - this is for reproducible shuffling of train/test data
random_seed = randint(0, 50)
random_seed

5

In [38]:
# Either run this cell or the above, NOT both
random_seed = 5

In [39]:
from sklearn.model_selection import train_test_split

In [47]:
tf_df.head(2)

Unnamed: 0,aa,abandon,abandoned,abandoning,abandonment,abc,abdomen,abide,ability,abit,...,zach,zealand,zen,zero,zest,zombie,zone,zoom,zooming,zumba
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    tf_df, edf['label'], test_size=0.2, random_state=random_seed, stratify=edf['label']
)
X_train.shape, y_train.shape

((12799, 6495), (12799,))

In [41]:
X_test.shape, y_test.shape

((3200, 6495), (3200,))

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

## Logistic Regression

In [46]:
lr_model = LogisticRegression().fit(X_train, y_train)
lr_model.score(X_test, y_test)

0.95375

## Naive Bayes

In [47]:
nb_model = MultinomialNB().fit(X_train, y_train)
nb_model.score(X_test, y_test)

0.933125

## Random Forest

In [48]:
rf_model = RandomForestClassifier().fit(X_train, y_train)
rf_model.score(X_test, y_test)

0.940625

# DL Models

In [55]:
import tensorflow as tf

## Text Processing

Default settings:
+ lowercase
+ remove punctuation
+ tokenize by splitting on whitespace
+ CANNOT run on GPU, only CPU - use in tf.data pipeline, NOT as part of the model, for less of a bottleneck

In [56]:
train_df.columns

Index(['text', 'emotion', 'label', 'processed_text'], dtype='object')

In [57]:
y_train = train_df['label']
y_train = tf.convert_to_tensor(y_train)

2024-03-09 20:25:18.571040: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-09 20:25:18.590233: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-09 20:25:18.590646: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [61]:
y_val = validation_df['label']
y_val = tf.convert_to_tensor(y_val)

y_test = tdf['label']
y_test = tf.convert_to_tensor(y_test)

In [62]:
X_train = tf.convert_to_tensor(train_df['processed_text'])
X_val = tf.convert_to_tensor(validation_df['processed_text'])
X_test = tf.convert_to_tensor(tdf['processed_text'])

In [63]:
X_train

<tf.Tensor: shape=(12799,), dtype=string, numpy=
array([b'feel drastically inadequate need feel swirling around',
       b'want put much pressure feel like could make amazing year ever',
       b'love latter smooth feel delicious flavour mention awesome glossy appearance',
       ..., b'cry front child feel overwhelmed without moment notice',
       b'ive heard feeling many time discussed uncertain future many people conversation blending together',
       b'feel idiotic since im going bring completely separate issue'],
      dtype=object)>

In [67]:
MAX_TOKENS = 20_000

In [68]:
from tensorflow.keras.layers import TextVectorization
text_vect = TextVectorization(
    output_mode='int',
    max_tokens=MAX_TOKENS,
)
text_vect.adapt(X_train)

In [71]:
int_X_train = text_vect(X_train)
int_X_train

<tf.Tensor: shape=(12799, 35), dtype=int64, numpy=
array([[   2, 5381,  604, ...,    0,    0,    0],
       [  14,  132,   28, ...,    0,    0,    0],
       [  25, 3060, 2906, ...,    0,    0,    0],
       ...,
       [ 229,  499,  102, ...,    0,    0,    0],
       [  22,  862,    3, ...,    0,    0,    0],
       [   2,  859,  111, ...,    0,    0,    0]])>

In [74]:
int_X_train.shape[0]

12799

In [72]:
int_X_val = text_vect(X_val)
int_X_test = text_vect(X_test)

In [73]:
BUFFER_SIZE = 5000 # Not sure why tutorial has 10K
BATCH_SIZE = 32
EPOCHS = 50

## RNN

Adapted from [TensorFlow Text Classification RNN Tutorial](https://www.tensorflow.org/text/tutorials/text_classification_rnn)

"RNNs (and Transformers) can learn representations for groups of words...without being explicitly told about the existence of such groups, by looking at continuous word...sequences" (_Deep Learning with Python: 2nd Ed_, Ch. 11)
+ Typically restrict vocab to top 20K-30K most common tokens

In [75]:
from tensorflow.keras import layers

In [78]:
inputs = tf.keras.Input(shape=(None,), dtype='int64')
embedded = layers.Embedding(
    input_dim=int_X_train.shape[0], output_dim=256, mask_zero=True
)(inputs)

# Bidirectional LSTM
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)

# Classification layer
outputs = layers.Dense(1, activation='sigmoid')(x)

lstm_model = tf.keras.Model(inputs, outputs)

In [79]:
lstm_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 256)         3276544   
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                73984     
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3350593 (12.78 MB)
Trainable params: 3350593 (12.78 MB)
Non-trainable params: 0 (0.00 Byte)
___________________

In [80]:
callbacks = [
    tf.keras.callbacks.ModelCheckpoint('embeddings_bidir_gru_with_masking.keras', save_best_only=True)
]

In [81]:
PATIENCE = 3

In [84]:
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=PATIENCE
)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    'lstm.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min'
)

In [85]:
lstm_history = lstm_model.fit(
    int_X_train, y_train, validation_data=(int_X_val, y_val), 
    batch_size=BATCH_SIZE, epochs=EPOCHS, 
    callbacks=[early_stop_callback, checkpoint_callback]
)

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.11997, saving model to lstm.h5
Epoch 2/50
  5/400 [..............................] - ETA: 6s - loss: 0.0377 - accuracy: 0.9937

  saving_api.save_model(


Epoch 2: val_loss improved from 0.11997 to 0.10556, saving model to lstm.h5
Epoch 3/50
Epoch 3: val_loss did not improve from 0.10556
Epoch 4/50
Epoch 4: val_loss did not improve from 0.10556
Epoch 5/50
Epoch 5: val_loss did not improve from 0.10556


In [87]:
model = tf.keras.models.load_model('lstm.h5')
print(f'Test accuracy: {model.evaluate(int_X_test, y_test)}')

Test accuracy: [0.10277608036994934, 0.9677338600158691]


## Try with Original Stress Detection Data??

In [88]:
og_data = pd.read_csv('data/Stress.csv')
og_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2838 entries, 0 to 2837
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   subreddit         2838 non-null   object 
 1   post_id           2838 non-null   object 
 2   sentence_range    2838 non-null   object 
 3   text              2838 non-null   object 
 4   label             2838 non-null   int64  
 5   confidence        2838 non-null   float64
 6   social_timestamp  2838 non-null   int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 155.3+ KB


In [89]:
og_data = og_data[['text', 'label']].copy()
og_data.sample(n=2)

Unnamed: 0,text,label
1420,I grew up my country’s equivalent of the bible...,1
1921,Obviously this was a horrible idea--I would ch...,0


In [91]:
og_data['processed_text'] = og_data['text'].apply(lambda raw: utils.process_text(text_chunk=raw, stopwords=stop_words, lemmatizer_obj=lemmatizer))
og_data.sample(n=3)

Unnamed: 0,text,label,processed_text
2350,"If you want to stay in and watch Netflix, STAY...",0,want stay watch netflix stay mental health men...
521,Can you contribute your story alongside others...,0,contribute story alongside others going recove...
2081,I just told them I had too much anxiety and br...,1,told much anxiety breaking boyfriend made real...


In [92]:
X_og = tf.convert_to_tensor(og_data['processed_text'])
y_og = tf.convert_to_tensor(og_data['label'])

In [93]:
int_X_og = text_vect(X_og)

In [94]:
print(f'Test accuracy: {model.evaluate(int_X_og, y_og)}')

Test accuracy: [1.4716436862945557, 0.6402395963668823]
