### Importing Required Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

### Dataset

In [4]:
df=pd.read_csv("C:/Users/ASUS/Downloads/Datasets/text.csv")
df

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4
...,...,...,...
416804,416804,i feel like telling these horny devils to find...,2
416805,416805,i began to realize that when i was feeling agi...,3
416806,416806,i feel very curious be why previous early dawn...,5
416807,416807,i feel that becuase of the tyranical nature of...,3


### Preprocessing

In [6]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [7]:
stemmer = PorterStemmer()

def stem_text(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [8]:
def clean_and_preprocess(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = remove_stopwords(text)
    text = stem_text(text)
    return text

In [9]:
df['cleaned_text'] = df['text'].apply(clean_and_preprocess)
df

Unnamed: 0.1,Unnamed: 0,text,label,cleaned_text
0,0,i just feel really helpless and heavy hearted,4,feel realli helpless heavi heart
1,1,ive enjoyed being able to slouch about relax a...,0,ive enjoy abl slouch relax unwind frankli need...
2,2,i gave up my internship with the dmrg and am f...,4,gave internship dmrg feel distraught
3,3,i dont know i feel so lost,0,dont know feel lost
4,4,i am a kindergarten teacher and i am thoroughl...,4,kindergarten teacher thoroughli weari job take...
...,...,...,...,...
416804,416804,i feel like telling these horny devils to find...,2,feel like tell horni devil find site suit sort...
416805,416805,i began to realize that when i was feeling agi...,3,began realiz feel agit restless would thought ...
416806,416806,i feel very curious be why previous early dawn...,5,feel curiou previou earli dawn time seek troubl
416807,416807,i feel that becuase of the tyranical nature of...,3,feel becuas tyran natur govern el salvador sav...


### Tokenization & Padding 

In [11]:
tokenizer = Tokenizer(num_words=50000 , oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_text'])

sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
sequences = [[token if token < 416809  else 1 for token in seq] for seq in sequences]

X = pad_sequences(sequences, maxlen=200, padding='post')

###  Label Encoding

In [13]:
y = LabelEncoder().fit_transform(df['label']) 

### Train-Test Split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)

### BI-LSTM Model

In [17]:
vocab_size = min(len(tokenizer.word_index) + 1, 50000)
embedding_dim = 128 
maxlen = 300 
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    Bidirectional(LSTM(64, return_sequences=False)),  
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(6, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()



### Train the Model

In [19]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=16)

Epoch 1/5
[1m18236/18236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3444s[0m 189ms/step - accuracy: 0.8988 - loss: 0.2376 - val_accuracy: 0.9201 - val_loss: 0.1493
Epoch 2/5
[1m18236/18236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3240s[0m 178ms/step - accuracy: 0.9237 - loss: 0.1499 - val_accuracy: 0.9227 - val_loss: 0.1417
Epoch 3/5
[1m18236/18236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3270s[0m 179ms/step - accuracy: 0.9281 - loss: 0.1370 - val_accuracy: 0.9226 - val_loss: 0.1410
Epoch 4/5
[1m18236/18236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3344s[0m 183ms/step - accuracy: 0.9308 - loss: 0.1274 - val_accuracy: 0.9227 - val_loss: 0.1468
Epoch 5/5
[1m18236/18236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3356s[0m 184ms/step - accuracy: 0.9332 - loss: 0.1208 - val_accuracy: 0.9207 - val_loss: 0.1512


### Evaluate the Model

In [21]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 41ms/step - accuracy: 0.9207 - loss: 0.1512
Test Accuracy: 0.92
