In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [2]:
# Loading the dataset
df = pd.read_csv('stress_dataset_550.csv')
df

Unnamed: 0,response,label
0,Hosted a movie marathon at home with friends. ...,Normal
1,Today was magical â€“ had my first kiss! The r...,Normal
2,I feel like I'm always one step away from fina...,Stress
3,I'm so tired of feeling like this. It's like I...,Stress
4,DIY craft session â€“ creativity and good vibes,Normal
...,...,...
544,Successfully completed a challenging puzzle. T...,Normal
545,It's like I'm living in a pressure cooker at h...,Stress
546,Surprise reunion with old friends today. Laugh...,Normal
547,Tight deadlines at work are making me short-te...,Stress


In [3]:
# remove punctuations
import re
import string

def remove_punct (response):
    translator = str.maketrans("","",string.punctuation)
    return response.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
# removing punctuations from the response column in the dataset by mapping the function
df["response"]=df.response.map(remove_punct)

In [5]:
# removing stop words
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))

def remove_stopwords(response):
    filtered_words = [word.lower() for word in response.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [7]:
df["response"] = df.response.map(remove_stopwords)

In [8]:
df.response

0      hosted movie marathon home friends watching se...
1      today magical â€“ first kiss rush emotions swe...
2      feel like im always one step away financial di...
3      im tired feeling like like im trapped neverend...
4            diy craft session â€“ creativity good vibes
                             ...                        
544    successfully completed challenging puzzle sens...
545    like im living pressure cooker home parents co...
546    surprise reunion old friends today laughter sh...
547    tight deadlines work making shorttempered cant...
548    im absolutely thrilled upcoming family reunion...
Name: response, Length: 549, dtype: object

In [9]:
# preparing the dataset into a mode where we can use these in a rnn model
from collections import Counter

# count unique words
def counter_word(response_col):
    count = Counter()
    for text in response_col.values:
        for word in text.split():
            count [word] += 1
    return count

counter = counter_word(df.response)


In [10]:
len(counter)

2057

In [11]:
counter

Counter({'im': 279,
         'like': 162,
         'joy': 83,
         'friends': 81,
         'constant': 76,
         'pressure': 72,
         'feeling': 66,
         'cant': 56,
         'day': 56,
         'feel': 55,
         'feels': 54,
         'â€“': 51,
         'today': 49,
         'new': 45,
         'laughter': 44,
         'causing': 43,
         'financial': 39,
         'sense': 38,
         'making': 37,
         'time': 36,
         'night': 35,
         'stress': 34,
         'work': 33,
         'always': 32,
         'life': 32,
         'job': 31,
         'made': 31,
         'constantly': 31,
         'good': 29,
         'fear': 29,
         'hard': 28,
         'shared': 27,
         'experience': 26,
         'anxiety': 25,
         'know': 25,
         'spent': 25,
         'find': 25,
         'one': 24,
         'make': 24,
         'keep': 24,
         'expectations': 23,
         'much': 23,
         'happiness': 23,
         'home': 22,
         'overw

In [12]:
counter.most_common(20)

[('im', 279),
 ('like', 162),
 ('joy', 83),
 ('friends', 81),
 ('constant', 76),
 ('pressure', 72),
 ('feeling', 66),
 ('cant', 56),
 ('day', 56),
 ('feel', 55),
 ('feels', 54),
 ('â€“', 51),
 ('today', 49),
 ('new', 45),
 ('laughter', 44),
 ('causing', 43),
 ('financial', 39),
 ('sense', 38),
 ('making', 37),
 ('time', 36)]

In [13]:
# passing the length of the word count to a variable
unique_word_count=len(counter)

In [14]:
# splitting dataset into training and validation sets
train_size = int(df.shape[0] * 0.8)

train_df = df [:train_size]
val_df = df [train_size:]

# split responses and labels
train_responses = train_df.response.to_numpy()
train_labels = train_df.label.to_numpy()
val_responses = val_df.response.to_numpy()
val_labels = val_df.label.to_numpy()

In [15]:
train_responses.shape , val_responses.shape

((439,), (110,))

In [16]:
import numpy as np

# Assuming train_labels and val_labels are NumPy arrays of strings
# Convert string labels to numerical values

for i in range(len(df)):
    if df.loc[i, 'label'] == "Stress":
        df.loc[i, 'label'] = 1
    elif df.loc[i, 'label'] == "Normal":
        df.loc[i, 'label'] = 0
    
# Convert string labels to numerical values
#train_labels_numeric = np.where(train_labels == "Normal", 0, 1)
#val_labels_numeric = np.where(val_labels == "Stress", 1, 0)


In [17]:
df

Unnamed: 0,response,label
0,hosted movie marathon home friends watching se...,0
1,today magical â€“ first kiss rush emotions swe...,0
2,feel like im always one step away financial di...,1
3,im tired feeling like like im trapped neverend...,1
4,diy craft session â€“ creativity good vibes,0
...,...,...
544,successfully completed challenging puzzle sens...,0
545,like im living pressure cooker home parents co...,1
546,surprise reunion old friends today laughter sh...,0
547,tight deadlines work making shorttempered cant...,1


In [18]:
# tokenization
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer 

# vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words = unique_word_count)
tokenizer.fit_on_texts(train_responses) # fit only for training




In [19]:
# each word has unique index
word_index = tokenizer.word_index

In [20]:
word_index

{'im': 1,
 'like': 2,
 'joy': 3,
 'constant': 4,
 'friends': 5,
 'pressure': 6,
 'feeling': 7,
 'day': 8,
 'today': 9,
 'feel': 10,
 'â€“': 11,
 'cant': 12,
 'new': 13,
 'feels': 14,
 'causing': 15,
 'laughter': 16,
 'financial': 17,
 'time': 18,
 'sense': 19,
 'making': 20,
 'stress': 21,
 'always': 22,
 'night': 23,
 'job': 24,
 'hard': 25,
 'life': 26,
 'work': 27,
 'fear': 28,
 'good': 29,
 'constantly': 30,
 'made': 31,
 'find': 32,
 'make': 33,
 'shared': 34,
 'anxiety': 35,
 'know': 36,
 'experience': 37,
 'expectations': 38,
 'nature': 39,
 'much': 40,
 'home': 41,
 'keep': 42,
 'every': 43,
 'overwhelming': 44,
 'one': 45,
 'tired': 46,
 'dont': 47,
 'overwhelmed': 48,
 'trying': 49,
 'spent': 50,
 'happiness': 51,
 'going': 52,
 'local': 53,
 'future': 54,
 'things': 55,
 'completed': 56,
 'family': 57,
 'matter': 58,
 'everything': 59,
 'friend': 60,
 'music': 61,
 'even': 62,
 'living': 63,
 'affecting': 64,
 'project': 65,
 'never': 66,
 'love': 67,
 'meet': 68,
 'happy': 

In [21]:
train_sequences = tokenizer.texts_to_sequences(train_responses)
val_sequences = tokenizer.texts_to_sequences(val_responses)

In [22]:
print(train_responses[10])
print(train_sequences[10])

joined mindfulness meditation class peace selfawareness practice living present moment become valuable tools maintaining happy inner balance
[169, 323, 324, 170, 95, 606, 429, 63, 315, 85, 325, 607, 608, 609, 69, 430, 431]


In [23]:
# pad the sequence to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# max number of words in a sequence
max_length = 15

# the use of using padding and truncating here as "post" , is to use 0s for the rest of the sequence 
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
train_padded.shape , val_padded.shape

((439, 15), (110, 15))

In [24]:
train_padded[10]

array([169, 323, 324, 170,  95, 606, 429,  63, 315,  85, 325, 607, 608,
       609,  69])

In [25]:
print(train_responses[10])
print(train_sequences[10])
print(train_padded[10])

joined mindfulness meditation class peace selfawareness practice living present moment become valuable tools maintaining happy inner balance
[169, 323, 324, 170, 95, 606, 429, 63, 315, 85, 325, 607, 608, 609, 69, 430, 431]
[169 323 324 170  95 606 429  63 315  85 325 607 608 609  69]


In [26]:
# check reversing the indices
# flip (key , value)

reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [46]:
reverse_word_index

{1: 'im',
 2: 'like',
 3: 'joy',
 4: 'constant',
 5: 'friends',
 6: 'pressure',
 7: 'feeling',
 8: 'day',
 9: 'today',
 10: 'feel',
 11: 'â€“',
 12: 'cant',
 13: 'new',
 14: 'feels',
 15: 'causing',
 16: 'laughter',
 17: 'financial',
 18: 'time',
 19: 'sense',
 20: 'making',
 21: 'stress',
 22: 'always',
 23: 'night',
 24: 'job',
 25: 'hard',
 26: 'life',
 27: 'work',
 28: 'fear',
 29: 'good',
 30: 'constantly',
 31: 'made',
 32: 'find',
 33: 'make',
 34: 'shared',
 35: 'anxiety',
 36: 'know',
 37: 'experience',
 38: 'expectations',
 39: 'nature',
 40: 'much',
 41: 'home',
 42: 'keep',
 43: 'every',
 44: 'overwhelming',
 45: 'one',
 46: 'tired',
 47: 'dont',
 48: 'overwhelmed',
 49: 'trying',
 50: 'spent',
 51: 'happiness',
 52: 'going',
 53: 'local',
 54: 'future',
 55: 'things',
 56: 'completed',
 57: 'family',
 58: 'matter',
 59: 'everything',
 60: 'friend',
 61: 'music',
 62: 'even',
 63: 'living',
 64: 'affecting',
 65: 'project',
 66: 'never',
 67: 'love',
 68: 'meet',
 69: 'happ

In [28]:
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [29]:
decoded_response = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_response)

[169, 323, 324, 170, 95, 606, 429, 63, 315, 85, 325, 607, 608, 609, 69, 430, 431]
joined mindfulness meditation class peace selfawareness practice living present moment become valuable tools maintaining happy inner balance


In [30]:
#create LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# embedding: turns positive integers(indexes) into dense vectors of fixed size

model = Sequential()
model.add(layers.Embedding(unique_word_count, 42, input_length=max_length))

# the layer will take as input an integer matrix of size (batch, input_length)
# now model.output_shape os (None, input_length, 32), where 'None' is the batch dimension

model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 42)            86394     
                                                                 
 lstm (LSTM)                 (None, 64)                27392     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 113851 (444.73 KB)
Trainable params: 113851 (444.73 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [31]:
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy

loss = BinaryCrossentropy(from_logits=False)
optim = Adam(learning_rate = 0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [32]:
val_padded

array([[   1,   45, 1491, ..., 1166,  617,    0],
       [  40,  258,   47, ...,    0,    0,    0],
       [ 679,   17,   97, ...,    0,    0,    0],
       ...,
       [ 285,  366,    5, ...,    0,    0,    0],
       [ 425,  256,   27, ...,   38,  133,  134],
       [   1,  548, 1377, ...,    0,    0,    0]])

In [33]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the training labels
train_labels_numeric = label_encoder.fit_transform(train_labels)

# Transform the validation labels
val_labels_numeric = label_encoder.fit_transform(val_labels)


In [34]:
model.fit(train_padded, train_labels_numeric, epochs=10, validation_data=(val_padded, val_labels_numeric), verbose=2)


Epoch 1/10


14/14 - 7s - loss: 0.6878 - accuracy: 0.5558 - val_loss: 0.6705 - val_accuracy: 0.6091 - 7s/epoch - 497ms/step
Epoch 2/10
14/14 - 0s - loss: 0.6018 - accuracy: 0.7016 - val_loss: 0.3841 - val_accuracy: 0.9727 - 199ms/epoch - 14ms/step
Epoch 3/10
14/14 - 0s - loss: 0.1672 - accuracy: 0.9727 - val_loss: 0.1636 - val_accuracy: 0.9364 - 222ms/epoch - 16ms/step
Epoch 4/10
14/14 - 0s - loss: 0.0721 - accuracy: 0.9795 - val_loss: 0.2151 - val_accuracy: 0.9364 - 186ms/epoch - 13ms/step
Epoch 5/10
14/14 - 0s - loss: 0.0470 - accuracy: 0.9886 - val_loss: 0.1699 - val_accuracy: 0.9455 - 192ms/epoch - 14ms/step
Epoch 6/10
14/14 - 0s - loss: 0.0087 - accuracy: 1.0000 - val_loss: 0.1398 - val_accuracy: 0.9636 - 206ms/epoch - 15ms/step
Epoch 7/10
14/14 - 0s - loss: 0.0059 - accuracy: 0.9977 - val_loss: 0.1514 - val_accuracy: 0.9636 - 202ms/epoch - 14ms/step
Epoch 8/10
14/14 - 0s - loss: 0.0028 - accuracy: 1.0000 - val_loss: 0.1779 - val_accuracy: 0.9455 - 228ms/epoch - 16ms/step
Epoch 9/

<keras.src.callbacks.History at 0x27733e1de50>

In [35]:
predictions = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]

predictions[20]



1

In [36]:
print(train_responses[20:25])
print(train_labels[20:25])
print(predictions[20:25])

['feel like im trying reason situation though im listing things person would associate success im seeing things im failing'
 'went camping trip friends sitting around campfire sharing stories enjoying nature stressfree happy environment'
 'spent day exploring historical museum knowledge gained connection past appreciation cultural heritage made intellectually stimulating enjoyable outing'
 'feel like im drowning work matter hard try cant seem keep causing much stress frustration'
 'im juggling multiple responsibilities fatigue setting burnout real im feeling exhausted mentally physically like cant catch break']
[1 0 0 1 1]
[1, 0, 0, 1, 1]


In [37]:
# Preprocess custom data
custom_text = "this is too tiring"
custom_text = remove_punct(custom_text)
custom_text = remove_stopwords(custom_text)
custom_sequence = tokenizer.texts_to_sequences([custom_text])
custom_padded = pad_sequences(custom_sequence, maxlen=max_length, padding="post", truncating="post")

# Make predictions
custom_prediction = model.predict(custom_padded)

# Post-process predictions
if custom_prediction > 0.5:
    print("Predicted class: Stress")
else:
    print("Predicted class: Normal")


Predicted class: Stress


In [38]:
loss, accuracy = model.evaluate(val_padded, val_labels_numeric)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.9545454382896423


In [43]:
print(custom_text)
print(custom_padded)
print(reverse_word_index[45])

tiring
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
one


In [40]:
#pickle.dump(model, open('model.pkl', 'wb'))
with open ('model.pkl','wb') as file:
    pickle.dump(model,file)
    
model.save('./model.keras')