In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
file_path = "/content/Stress.csv"
df = pd.read_csv(file_path)

In [3]:
def clean_text(text):
    text = text.lower()  # converting to lowercase
    text = re.sub(r'\W', ' ', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

In [4]:
# text preprocessing
df['clean_text'] = df['text'].astype(str).apply(clean_text)

In [5]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

In [7]:
tokenizer = Tokenizer(num_words=5000)  # uses the top 5000 words
tokenizer.fit_on_texts(X_train)

In [8]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [9]:
max_len = max(len(seq) for seq in X_train_seq)  # find max length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [10]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=max_len),  # embedding layer
    Conv1D(filters=128, kernel_size=5, activation='relu'),  # convolution layer
    MaxPooling1D(pool_size=2),  # max pooling
    Dropout(0.3),
    Flatten(),  # flatten to feed into dense layers
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # output layer (binary)
])



In [11]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

Epoch 1/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 176ms/step - accuracy: 0.5291 - loss: 0.6843 - val_accuracy: 0.7130 - val_loss: 0.5653
Epoch 2/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 155ms/step - accuracy: 0.7894 - loss: 0.4660 - val_accuracy: 0.7377 - val_loss: 0.5456
Epoch 3/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 151ms/step - accuracy: 0.9326 - loss: 0.2158 - val_accuracy: 0.7306 - val_loss: 0.6648
Epoch 4/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 151ms/step - accuracy: 0.9876 - loss: 0.0617 - val_accuracy: 0.7113 - val_loss: 0.9642
Epoch 5/5
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 153ms/step - accuracy: 0.9960 - loss: 0.0272 - val_accuracy: 0.7342 - val_loss: 0.9954


<keras.src.callbacks.history.History at 0x7b1f21ec8710>

In [13]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - accuracy: 0.7313 - loss: 1.0229
Test Accuracy: 0.73


In [14]:
def predict_stress(sentence):
    sentence = clean_text(sentence)
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_seq = pad_sequences(sequence, maxlen=max_len, padding='post')
    prediction = model.predict(padded_seq)[0][0]
    return "Stressful" if prediction > 0.5 else "Not Stressful"

In [15]:
new_sentence = "I can’t sleep at night and my mind feels constantly overwhelmed."
print(f"Prediction: {predict_stress(new_sentence)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
Prediction: Stressful
