In [3]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, GlobalMaxPooling1D, Input, Concatenate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
df = pd.read_csv("/content/stress_dataset.csv")

In [5]:
df.head()

Unnamed: 0,Employee_ID,Message,Word_Count,Sentiment_Score,Employee_Role,Department,Stress_Level
0,1,Looking forward to the weekend!,5,0.0,Data Analyst,IT,Low
1,2,Enjoying my work today!,4,0.62,HR Specialist,HR,Low
2,3,Workload is manageable but challenging.,5,0.5,Data Analyst,IT,Medium
3,4,Feeling great about my progress.,5,0.8,Sales Executive,Sales,Low
4,5,Loving the collaborative environment.,4,0.6,Manager,Sales,Low


In [6]:
# mapping stress_level as high -> 1 and low -> 0, removing medium
df = df[df['Stress_Level'] != 'Medium']
df['label'] = df['Stress_Level'].str.lower().map({'high': 1, 'low': 0})
df = df.drop(columns=['Stress_Level'])
df = df.reset_index(drop=True)
print(df.head())
df.to_csv("stress_data_cleaned.csv", index=False)

   Employee_ID                                  Message  Word_Count  \
0            1          Looking forward to the weekend!           5   
1            2                  Enjoying my work today!           4   
2            4         Feeling great about my progress.           5   
3            5    Loving the collaborative environment.           4   
4            8  Overloaded with work, this is too much!           7   

   Sentiment_Score    Employee_Role Department  label  
0             0.00     Data Analyst         IT      0  
1             0.62    HR Specialist         HR      0  
2             0.80  Sales Executive      Sales      0  
3             0.60          Manager      Sales      0  
4             0.25    HR Specialist      Sales      1  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['Stress_Level'].str.lower().map({'high': 1, 'low': 0})


In [7]:
from google.colab import files
files.download('stress_data_cleaned.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
df.head()

Unnamed: 0,Employee_ID,Message,Word_Count,Sentiment_Score,Employee_Role,Department,label
0,1,Looking forward to the weekend!,5,0.0,Data Analyst,IT,0
1,2,Enjoying my work today!,4,0.62,HR Specialist,HR,0
2,4,Feeling great about my progress.,5,0.8,Sales Executive,Sales,0
3,5,Loving the collaborative environment.,4,0.6,Manager,Sales,0
4,8,"Overloaded with work, this is too much!",7,0.25,HR Specialist,Sales,1


In [9]:
df1 = pd.read_csv("stress_data_cleaned.csv")
df2 = pd.read_csv("/content/Stress.csv")

df1.columns = df1.columns.str.lower()
df2.columns = df2.columns.str.lower()

# renaming 'message' column in df1 to 'text'
df1.rename(columns={'message': 'text'}, inplace=True)

# keeping only the relevant columns
df1 = df1[['text', 'label']]
df2 = df2[['text', 'label']]

# combining datasets and shuffling them
df_combined = pd.concat([df1, df2], ignore_index=True).dropna()
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# saving combined dataset
df_combined.to_csv("combined_stress_dataset.csv", index=False)

print("Combined dataset shape:", df_combined.shape)
print(df_combined.head())


Combined dataset shape: (9804, 2)
                                                text  label
0                       Excited for the team outing!      0
1  I don't really know if I'm doing this right bu...      1
2                   Feeling great about my progress.      0
3                            Work is going smoothly.      0
4                              I feel like quitting.      1


In [10]:
def clean_text(text):
    text = text.lower()  # converting to lowercase
    text = re.sub(r'\W', ' ', text)  # removing special characters
    text = re.sub(r'\s+', ' ', text).strip()  # removing extra spaces
    return text

In [11]:
# text preprocessing
df_combined['clean_text'] = df_combined['text'].astype(str).apply(clean_text)

In [12]:
label_encoder = LabelEncoder()
df_combined['label'] = label_encoder.fit_transform(df_combined['label'])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df_combined['clean_text'], df_combined['label'], test_size=0.2, random_state=42)

In [14]:
tokenizer = Tokenizer(num_words=5000)  # uses the top 5000 words
tokenizer.fit_on_texts(X_train)

In [15]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [16]:
max_len = max(len(seq) for seq in X_train_seq)  # finding max length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [None]:
# model 1 -> pure bilstm
model1 = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])



In [None]:
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model1.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m426s[0m 2s/step - accuracy: 0.8023 - loss: 0.3646 - val_accuracy: 0.8995 - val_loss: 0.2368
Epoch 2/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m453s[0m 2s/step - accuracy: 0.9275 - loss: 0.1542 - val_accuracy: 0.9097 - val_loss: 0.1843
Epoch 3/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m448s[0m 2s/step - accuracy: 0.9623 - loss: 0.0912 - val_accuracy: 0.9001 - val_loss: 0.2212
Epoch 4/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m433s[0m 2s/step - accuracy: 0.9815 - loss: 0.0540 - val_accuracy: 0.9052 - val_loss: 0.2837
Epoch 5/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m435s[0m 2s/step - accuracy: 0.9911 - loss: 0.0255 - val_accuracy: 0.9011 - val_loss: 0.2834


<keras.src.callbacks.history.History at 0x7946c6e82950>

In [None]:
loss, accuracy = model1.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 469ms/step - accuracy: 0.9007 - loss: 0.2754
Test Accuracy: 0.90


In [None]:
# model 2 -> pure cnn
model2 = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_len),  # embedding layer
    Conv1D(filters=128, kernel_size=5, activation='relu'),  # convolution layer
    MaxPooling1D(pool_size=2),  # max pooling
    Dropout(0.3),
    Flatten(),  # flatten to feed into dense layers
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # output layer (binary)
])

In [None]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model2.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 143ms/step - accuracy: 0.7865 - loss: 0.3866 - val_accuracy: 0.9291 - val_loss: 0.1575
Epoch 2/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 140ms/step - accuracy: 0.9663 - loss: 0.0941 - val_accuracy: 0.9118 - val_loss: 0.1945
Epoch 3/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 138ms/step - accuracy: 0.9937 - loss: 0.0206 - val_accuracy: 0.9194 - val_loss: 0.2731
Epoch 4/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 146ms/step - accuracy: 0.9982 - loss: 0.0056 - val_accuracy: 0.9215 - val_loss: 0.3198
Epoch 5/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 138ms/step - accuracy: 0.9999 - loss: 0.0011 - val_accuracy: 0.9153 - val_loss: 0.3453


<keras.src.callbacks.history.History at 0x7946c6d338d0>

In [None]:
loss, accuracy = model2.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.9244 - loss: 0.3070
Test Accuracy: 0.92


In [None]:
# model 3 =>  cnn -> bilstm
model3 = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_len),
    Conv1D(128, 5, activation='relu'),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [None]:
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model3.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 712ms/step - accuracy: 0.7932 - loss: 0.3839 - val_accuracy: 0.8914 - val_loss: 0.1992
Epoch 2/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 705ms/step - accuracy: 0.9261 - loss: 0.1492 - val_accuracy: 0.9077 - val_loss: 0.1792
Epoch 3/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 709ms/step - accuracy: 0.9726 - loss: 0.0770 - val_accuracy: 0.9082 - val_loss: 0.2044
Epoch 4/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 710ms/step - accuracy: 0.9864 - loss: 0.0456 - val_accuracy: 0.9006 - val_loss: 0.2771
Epoch 5/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 710ms/step - accuracy: 0.9919 - loss: 0.0286 - val_accuracy: 0.8934 - val_loss: 0.3834


<keras.src.callbacks.history.History at 0x7946c697abd0>

In [None]:
loss, accuracy = model3.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 189ms/step - accuracy: 0.8949 - loss: 0.3586
Test Accuracy: 0.89


In [None]:
# model 4 =>  bilstm -> cnn
model4 = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(128, return_sequences=True)),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [None]:
model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model4.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m298s[0m 1s/step - accuracy: 0.8191 - loss: 0.3616 - val_accuracy: 0.9092 - val_loss: 0.1636
Epoch 2/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 1s/step - accuracy: 0.9391 - loss: 0.1354 - val_accuracy: 0.9062 - val_loss: 0.1745
Epoch 3/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m291s[0m 1s/step - accuracy: 0.9685 - loss: 0.0734 - val_accuracy: 0.9036 - val_loss: 0.2108
Epoch 4/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 1s/step - accuracy: 0.9858 - loss: 0.0429 - val_accuracy: 0.9052 - val_loss: 0.3176
Epoch 5/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 1s/step - accuracy: 0.9929 - loss: 0.0208 - val_accuracy: 0.9021 - val_loss: 0.3712


<keras.src.callbacks.history.History at 0x7946b58587d0>

In [None]:
loss, accuracy = model4.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 332ms/step - accuracy: 0.9114 - loss: 0.3528
Test Accuracy: 0.90


In [18]:
# model 5 =>  parallel cnn and bilstm
input_layer = Input(shape=(max_len,))
embedding = Embedding(input_dim=5000, output_dim=128, input_length=max_len)(input_layer)

# CNN Branch
cnn_branch = Conv1D(128, 5, activation='relu')(embedding)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# BiLSTM Branch
lstm_branch = Bidirectional(LSTM(128, return_sequences=False))(embedding)

# Concatenate both branches
merged = Concatenate()([cnn_branch, lstm_branch])
dense = Dense(64, activation='relu')(merged)
dropout = Dropout(0.5)(dense)
output = Dense(1, activation='sigmoid')(dropout)

model5 = Model(inputs=input_layer, outputs=output)



In [19]:
model5.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
model5.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 1000ms/step - accuracy: 0.8122 - loss: 0.3462 - val_accuracy: 0.9001 - val_loss: 0.1861
Epoch 2/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 974ms/step - accuracy: 0.9440 - loss: 0.1276 - val_accuracy: 0.9159 - val_loss: 0.1715
Epoch 3/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 963ms/step - accuracy: 0.9863 - loss: 0.0434 - val_accuracy: 0.9194 - val_loss: 0.2028
Epoch 4/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 956ms/step - accuracy: 0.9984 - loss: 0.0098 - val_accuracy: 0.9179 - val_loss: 0.2765
Epoch 5/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 956ms/step - accuracy: 0.9999 - loss: 0.0026 - val_accuracy: 0.9133 - val_loss: 0.3289


<keras.src.callbacks.history.History at 0x7b8b2be122d0>

In [21]:
loss, accuracy = model5.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 273ms/step - accuracy: 0.9116 - loss: 0.3243
Test Accuracy: 0.91


In [None]:
# model 6 => stacked cnn and bilstm
model6 = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_len),
    Conv1D(128, 5, activation='relu'),
    Conv1D(64, 3, activation='relu'),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [None]:
model6.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model6.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 709ms/step - accuracy: 0.7966 - loss: 0.3886 - val_accuracy: 0.9041 - val_loss: 0.2029
Epoch 2/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 707ms/step - accuracy: 0.9270 - loss: 0.1481 - val_accuracy: 0.8904 - val_loss: 0.1955
Epoch 3/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 701ms/step - accuracy: 0.9603 - loss: 0.1003 - val_accuracy: 0.9133 - val_loss: 0.1951
Epoch 4/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 703ms/step - accuracy: 0.9797 - loss: 0.0594 - val_accuracy: 0.9046 - val_loss: 0.3140
Epoch 5/5
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 703ms/step - accuracy: 0.9878 - loss: 0.0371 - val_accuracy: 0.8980 - val_loss: 0.4366


<keras.src.callbacks.history.History at 0x7946c7ec8cd0>

In [None]:
loss, accuracy = model6.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 138ms/step - accuracy: 0.8964 - loss: 0.4291
Test Accuracy: 0.90


In [22]:
def predict_stress(sentence):
    sentence = clean_text(sentence)
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_seq = pad_sequences(sequence, maxlen=max_len, padding='post')
    # change the model name accordingly.
    prediction = model5.predict(padded_seq)[0][0]
    return "Stressful" if prediction > 0.5 else "Not Stressful"

In [23]:
new_sentence = "I can’t sleep at night and my mind feels constantly overwhelmed."
print(f"Prediction: {predict_stress(new_sentence)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 492ms/step
Prediction: Stressful
