In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
le=LabelEncoder()

In [3]:
df = pd.read_csv("train-00000-of-00001.csv")

print(df['label'].value_counts())
df.head()

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64


Unnamed: 0,text,label
0,i feel like theres way too much im trying to a...,3
1,i have been feeling gloomy since monday,0
2,i feel that anger toward someone else not cari...,2
3,i try not to think about it because identifyin...,0
4,i feel so at peace and less stressed now that ...,0


In [5]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Aman
[nltk_data]     Jha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aman
[nltk_data]     Jha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:

stop = set(stopwords.words('english'))
negations = {"no", "not", "nor", "never","didnt","don't","didnt", "doesn't", "didn't",
             "hadn't", "won't", "can't", "couldn't", "shouldn't" ,
             "wouldn't", "ain't", "left"}
stop = stop - negations

def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop])
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text



In [7]:
df['text'] = df['text'].apply(clean_text)
X = df['text']
y = df['label']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
num_classes = len(le.classes_)


X_train shape: (333447,)
X_test shape: (83362,)
y_train shape: (333447,)
y_test shape: (83362,)


In [8]:
MAX_NUM_WORDS = 25000
MAX_LEN = 70

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post", truncating="post")


In [9]:
class EmotionDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = EmotionDataset(X_train_pad, y_train_enc)
test_ds = EmotionDataset(X_test_pad, y_test_enc)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=128)

In [10]:
from torch.utils.data import WeightedRandomSampler

class_counts = df['label'].value_counts().sort_index()
class_weights = 1. / class_counts
sample_weights = [class_weights[y] for y in y_train]

sampler = WeightedRandomSampler(sample_weights, len(sample_weights))
train_loader = DataLoader(train_ds, batch_size=64, sampler=sampler)
embedding_index = {}
vocab_size = min(MAX_NUM_WORDS, len(tokenizer.word_index) + 1)

with open("glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = coefs

embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        vector = embedding_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [11]:
import torch
import torch.nn as nn
class SentimentBiLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float),
            freeze=False, 
            padding_idx=0
        )
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim*2, 1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, x):
        emb = self.embedding(x)
        out, _ = self.lstm(emb)
        attn_weights = torch.softmax(self.attention(out), dim=1)
        context = torch.sum(attn_weights * out, dim=1)
        context = self.dropout(context)
        return self.fc(context)

model = SentimentBiLSTM(vocab_size, embed_dim=100, hidden_dim=128, num_classes=num_classes).to(DEVICE)
criterion = nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [12]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = min(MAX_NUM_WORDS, len(tokenizer.word_index)+1)
model = SentimentBiLSTM(vocab_size, embed_dim=100, hidden_dim=128, num_classes=num_classes).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [13]:
class EarlyStopping:
    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [14]:
epochs = 30
early_stopper = EarlyStopping(patience=5)  
for epoch in range(epochs):

    model.train()
    total_loss, total_correct = 0, 0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_correct += (logits.argmax(1) == yb).sum().item()
    train_acc = total_correct / len(train_ds)

    model.eval()
    val_loss, val_correct = 0, 0
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            logits = model(xb)
            loss = criterion(logits, yb)
            val_loss += loss.item()
            val_correct += (logits.argmax(1) == yb).sum().item()
    val_acc = val_correct / len(test_ds)
            
    print(f"Epoch {epoch+1}: train_acc={train_acc:.4f}, val_acc={val_acc:.4f}, val_loss={val_loss/len(test_loader):.4f}")

    early_stopper(val_loss)
    if early_stopper.early_stop:
        print("⏹ Early stopping triggered")
        break

Epoch 1: train_acc=0.9322, val_acc=0.9388, val_loss=0.1221
Epoch 2: train_acc=0.9554, val_acc=0.9390, val_loss=0.1158
Epoch 3: train_acc=0.9568, val_acc=0.9395, val_loss=0.1125
Epoch 4: train_acc=0.9570, val_acc=0.9384, val_loss=0.1233
Epoch 5: train_acc=0.9573, val_acc=0.9366, val_loss=0.1227
Epoch 6: train_acc=0.9583, val_acc=0.9379, val_loss=0.1275
Epoch 7: train_acc=0.9591, val_acc=0.9353, val_loss=0.1335
Epoch 8: train_acc=0.9596, val_acc=0.9343, val_loss=0.1288
⏹ Early stopping triggered


In [15]:
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        preds = logits.argmax(1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(
    all_labels,
    all_preds,
    target_names=[str(c) for c in le.classes_]
))

Accuracy: 0.9342746095343202
              precision    recall  f1-score   support

           0       1.00      0.96      0.97     24414
           1       1.00      0.91      0.95     28058
           2       0.77      1.00      0.87      6863
           3       0.92      0.96      0.94     11454
           4       0.88      0.88      0.88      9562
           5       0.71      0.90      0.79      3011

    accuracy                           0.93     83362
   macro avg       0.88      0.93      0.90     83362
weighted avg       0.94      0.93      0.94     83362

