In [72]:
###########################################
# ✅ EMOTION DETECTION MODEL (GloVe + BiLSTM)
# Dataset: dair-ai/emotion (Hugging Face)
# Model: BiLSTM + GloVe (100D)
###########################################

# ✅ STEP 0: Install if not already
# pip install datasets tensorflow numpy matplotlib rapidfuzz

from datasets import load_dataset
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
import matplotlib.pyplot as plt
from collections import Counter
import pickle

# ✅ STEP 1: Load Dataset
# Fetch the dataset and pull out its three splits plus the class names.
print("\n📥 Loading dataset from Hugging Face...")
dataset = load_dataset("dair-ai/emotion")
train_ds, val_ds, test_ds = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)
# Class names are read from the label feature of the training split.
label_names = train_ds.features["label"].names
num_classes = len(label_names)

# ✅ STEP 2: Fit the tokenizer on the training split only
# The vocabulary (and which words make the 10,000-word cut) must be derived
# from training text alone; including validation/test text would leak
# held-out data into preprocessing. Words that only occur in the held-out
# splits are mapped to the "<OOV>" token when sequences are built.
print("\n🔠 Tokenizing text...")
train_texts = list(train_ds["text"])

tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(train_texts)


📥 Loading dataset from Hugging Face...

🔠 Tokenizing text...


In [73]:
# ✅ Prepare sequences
max_len = 50


def to_seq(ds):
    """Turn the ``text`` field of every example in ``ds`` into padded token ids.

    Texts are encoded with the notebook-level ``tokenizer`` and then passed
    through ``pad_sequences`` with ``maxlen=max_len`` (all other padding
    options are left at their defaults).
    """
    texts = [example["text"] for example in ds]
    token_ids = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_ids, maxlen=max_len)

# Padded token-id matrices, one per split.
x_train, x_val, x_test = (
    to_seq(split) for split in (train_ds, val_ds, test_ds)
)

# Integer class labels, one array per split, in the same example order.
y_train, y_val, y_test = (
    np.array([example["label"] for example in split])
    for split in (train_ds, val_ds, test_ds)
)

In [74]:
# ✅ STEP 3: Load GloVe Embeddings
# Builds `embedding_matrix`, where row i holds the GloVe vector of the word
# whose tokenizer index is i. Rows for words missing from the GloVe file
# stay all-zero.
print("\n📚 Loading GloVe embeddings...")
embedding_dim = 100
# Derive the table size from the tokenizer's cap instead of repeating the
# literal, so the two cannot drift apart; fall back to the full vocabulary
# when no cap was configured.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

glove_hits = 0
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> v1 v2 ... vN", separated by single spaces.
        word, *values = line.rstrip().split(" ")
        idx = tokenizer.word_index.get(word)
        # Skip words we never index *before* converting the vector, so the
        # float parsing is only paid for rows that are actually stored.
        if idx is None or idx >= vocab_size:
            continue
        # Ignore malformed lines or vectors of the wrong width rather than
        # failing with an opaque shape/conversion error mid-file.
        if len(values) != embedding_dim:
            continue
        embedding_matrix[idx] = np.asarray(values, dtype="float32")
        glove_hits += 1

print(f"GloVe vectors loaded for {glove_hits} tokenizer words")


📚 Loading GloVe embeddings...


In [75]:
# ✅ STEP 4: Build BiLSTM Model
# Stack: GloVe-initialised embedding -> BiLSTM(128, full sequence)
# -> BiLSTM(64, final state) -> Dense(64) -> softmax over the emotion classes,
# with dropout (0.3) after each recurrent/dense stage.
print("\n🧠 Building model...")
model = Sequential([
    # An explicit Input layer replaces the deprecated `input_length=`
    # Embedding argument and lets the model be built (so `model.summary()`
    # can report shapes) before training starts.
    tf.keras.Input(shape=(max_len,)),
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              # Seed the layer with the GloVe matrix through the documented
              # initializer API rather than the legacy `weights=` kwarg.
              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
              # The pretrained vectors are fine-tuned during training.
              trainable=True),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])



🧠 Building model...


In [76]:
# ✅ STEP 5: Compile Model
# Labels are plain integers, hence the sparse variant of cross-entropy.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


In [77]:
# ✅ STEP 6: Train Model
print("\n🚀 Training model...")
# Stop once validation loss has not improved for 5 epochs and roll the
# model back to the weights from its best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=25,
    batch_size=128,
    callbacks=[early_stop],
)


🚀 Training model...
Epoch 1/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 359ms/step - accuracy: 0.2741 - loss: 1.7007 - val_accuracy: 0.3955 - val_loss: 1.5536
Epoch 2/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 336ms/step - accuracy: 0.4016 - loss: 1.5461 - val_accuracy: 0.5110 - val_loss: 1.3567
Epoch 3/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 341ms/step - accuracy: 0.4996 - loss: 1.3762 - val_accuracy: 0.5240 - val_loss: 1.2813
Epoch 4/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 337ms/step - accuracy: 0.5243 - loss: 1.2988 - val_accuracy: 0.5435 - val_loss: 1.2055
Epoch 5/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 340ms/step - accuracy: 0.5542 - loss: 1.2177 - val_accuracy: 0.5540 - val_loss: 1.1505
Epoch 6/25
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 338ms/step - accuracy: 0.5579 - loss: 1.1741 - val_accuracy: 0.5610 - val_lo

In [78]:
# ✅ STEP 7: Evaluate Model
# `evaluate` returns the loss followed by the compiled metrics (accuracy).
print("\n📊 Evaluating...")
test_scores = model.evaluate(x_test, y_test)
loss, acc = test_scores
print(f"Test Accuracy: {acc:.2f}")


📊 Evaluating...
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 59ms/step - accuracy: 0.8532 - loss: 0.4988
Test Accuracy: 0.85


In [79]:
# ✅ STEP 8: Save Artifacts
print("\n💾 Saving model & tokenizer...")
# NOTE(review): ".h5" is the legacy HDF5 format; newer Keras releases
# recommend ".keras". Path left unchanged in case other code loads it.
model.save("emotion_model.h5")

# The tokenizer and label names are pickled so inference code can rebuild
# the exact same text -> ids -> label-name mapping.
for artifact_path, artifact in (
    ("tokenizer.pkl", tokenizer),
    ("label_names.pkl", label_names),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)




💾 Saving model & tokenizer...


In [87]:
from rapidfuzz import fuzz
from rapidfuzz import process

# ✅ Create a set of all known words from tokenizer (fast membership checks)
known_words = set(tokenizer.word_index.keys())

# Candidates for fuzzy matching are kept as an ordered *list*, not a set:
#   * set iteration order can differ between kernel sessions (string hash
#     randomisation), so tied scores could resolve to different words on
#     re-run;
#   * `word_index` is built most-frequent-first by the Keras Tokenizer, and
#     `extractOne` returns the first of equally scored choices, so ties now
#     resolve to the more common word;
#   * only words the model can actually see are offered (index below the
#     `num_words` cap, and never the OOV placeholder itself).
_vocab_cap = tokenizer.num_words or len(tokenizer.word_index) + 1
_candidate_words = [
    vocab_word
    for vocab_word, vocab_idx in tokenizer.word_index.items()
    if vocab_idx < _vocab_cap and vocab_word != tokenizer.oov_token
]


# ✅ Function to correct each word using fuzzy matching
def correct_spelling(text, threshold=80):
    """Replace unknown words in ``text`` with their closest vocabulary word.

    The text is split on whitespace. A word whose lowercase form is already
    in ``known_words`` is kept unchanged. Any other word is fuzzy-matched
    (``fuzz.ratio``) against the in-vocabulary candidates and replaced by the
    best match when its score is at least ``threshold``; otherwise the word
    is kept as typed.

    Args:
        text: Input sentence.
        threshold: Minimum similarity score (0-100) required to accept a
            replacement.

    Returns:
        The words re-joined with single spaces; an empty or whitespace-only
        input yields an empty string.
    """
    corrected_words = []
    for word in text.split():
        lowered = word.lower()
        if lowered in known_words:
            corrected_words.append(word)
            continue
        # `score_cutoff` makes extractOne return None when nothing reaches
        # the threshold, and lets it skip hopeless candidates early.
        best_match = process.extractOne(
            lowered,
            _candidate_words,
            scorer=fuzz.ratio,
            score_cutoff=threshold,
        )
        # best_match is (choice, score, index) or None.
        corrected_words.append(best_match[0] if best_match else word)
    return " ".join(corrected_words)

# ✅ Enhanced Emotion Prediction with Fuzzy Correction
def predict_emotion(text):
    """Predict the emotion of ``text`` after fuzzy spelling correction.

    Prints the original text, the corrected text and the predicted label
    with its confidence, and also returns the result so callers can use it
    programmatically.

    Args:
        text: Raw input sentence.

    Returns:
        A ``(label, confidence)`` tuple: the predicted entry of
        ``label_names`` and the model's softmax score for it as a float.
    """
    print(f"\n📝 Original Text: {text}")

    corrected_text = correct_spelling(text)
    print(f"🔧 Corrected Text: {corrected_text}")

    seq = tokenizer.texts_to_sequences([corrected_text])
    padded = pad_sequences(seq, maxlen=max_len)

    # verbose=0 suppresses the per-call progress bar, which only adds noise
    # for a single-sample prediction.
    pred = model.predict(padded, verbose=0)[0]
    best_idx = int(np.argmax(pred))
    label = label_names[best_idx]
    confidence = float(pred[best_idx])

    print(f"🎭 Predicted Emotion: {label} (Confidence: {confidence:.2f})")
    return label, confidence

# ✅ Test Cases
# A clean sentence followed by sentences with deliberate misspellings.
sample_texts = [
    "I am so happy and excited today!",
    "I hate everything about this terible day.",
    "It is a grate dissapointment.",
    "I’m fellng so hopless nd alone",  # Test edge-case typos
]
for sample_text in sample_texts:
    predict_emotion(sample_text)



📝 Original Text: I am so happy and excited today!
🔧 Corrected Text: I am so happy and excited today
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
🎭 Predicted Emotion: joy (Confidence: 1.00)

📝 Original Text: I hate everything about this terible day.
🔧 Corrected Text: I hate everything about this terrible day
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step
🎭 Predicted Emotion: sadness (Confidence: 0.89)

📝 Original Text: It is a grate dissapointment.
🔧 Corrected Text: It is a gate disappointment
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
🎭 Predicted Emotion: sadness (Confidence: 0.33)

📝 Original Text: I’m fellng so hopless nd alone
🔧 Corrected Text: im felling so hopeless nd alone
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
🎭 Predicted Emotion: sadness (Confidence: 0.99)


In [None]:
# | Step | Description                           |
# | ---- | ------------------------------------- |
# | 1    | Load & explore labeled tweets dataset |
# | 2    | Tokenize words → integers             |
# | 3    | Load GloVe embeddings (100D)          |
# | 4    | Build embedding matrix                |
# | 5    | Create LSTM model using embeddings    |
# | 6    | Train & evaluate                      |
# | 7    | Predict emotion from any new text     |


In [57]:
#  Step 1: Load Dataset
# dataset = load_dataset("dair-ai/emotion")
# train_ds, val_ds, test_ds = dataset["train"], dataset["validation"], dataset["test"]
# We load a dataset of tweets labeled with 6 emotions:

# 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'

# It returns 3 parts: train_ds, val_ds, test_ds (for training, validation, and testing)

# 📌 Example sample:
# {'text': "i'm feeling quite sad and depressed", 'label': 0}

#     🔹 Step 2: Explore the Dataset
# labels = [label_names[ex["label"]] for ex in train_ds]
# Extracts the label names from all training samples.

# Then uses Counter() + matplotlib to plot the distribution of emotions.

# 📊 Helps you know how balanced the dataset is.

#     🔹 Step 3: Tokenize and Pad Texts
# tokenizer = Tokenizer(oov_token="<OOV>")
# tokenizer.fit_on_texts(texts)
# Creates a word-to-index mapping:
# e.g., "happy" → 57, "love" → 89

# <OOV> means "out of vocabulary" — used for unknown words

# Then we convert text to sequences of numbers:
# x_train_seq = tokenizer.texts_to_sequences(...)
# x_train = pad_sequences(x_train_seq, maxlen=50)
# Converts:
# "I am sad today" → [1, 6, 57, 22] → [0, 0, ..., 0, 1, 6, 57, 22] (padded to 50; pad_sequences adds the zeros at the front by default)
# This ensures all input texts are same length for training
# Step 4: Load GloVe Word Embeddings
# with open("glove.6B.100d.txt") as f:
#     glove_embeddings[word] = vector

# Loads 400,000 English words, each mapped to a 100-dimensional vector

# Example:
# "happy" → [0.12, -0.44, ..., 0.27]
# These vectors capture meaning and relationships between words
# Step 5: Build the Embedding Matrix
# embedding_matrix = np.zeros((vocab_size, 100))
# for word, i in tokenizer.word_index.items():
#     embedding_matrix[i] = glove_embeddings.get(word)

# Maps each word in our dataset to its GloVe vector

# If GloVe has no vector, it stays as zeros

# 📘 Why? So our model uses semantic meaning of words from the start, instead of learning them from scratch.

# 🔹 Step 6: Define the Model
# model = Sequential([
#     Embedding(..., weights=[embedding_matrix], trainable=False),
#     LSTM(64),
#     Dense(64, activation='relu'),
#     Dense(num_classes, activation='softmax')
# ])

# | Layer             | Purpose                                                            |
# | ----------------- | ------------------------------------------------------------------ |
# | **Embedding**     | Converts word indices into their 100D vector meanings (from GloVe) |
# | **LSTM(64)**      | Reads the word vectors as a sequence and remembers context         |
# | **Dense(64)**     | Learns deep patterns from LSTM output                              |
# | **Dense(output)** | Outputs 6 numbers = probability of each emotion (via softmax)      |


# Step 7: Compile and Train the Model
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# history = model.fit(...)

# sparse_categorical_crossentropy is used because your labels are integers (not one-hot)

# Trains for up to 25 epochs; early stopping on val_loss (patience 5) can end training sooner

# 🔹 Step 8: Evaluate the Model
# test_loss, test_acc = model.evaluate(x_test, np.array(y_test))

# 🔹 Step 9: Predict Emotion of New Text
# def predict_emotion(text):
#     seq = tokenizer.texts_to_sequences([text])
#     padded = pad_sequences(seq, maxlen=max_len)
#     pred = model.predict(padded)[0]
#     label = label_names[np.argmax(pred)]

#     Converts input text to token sequence

# Pads it to max length (50)

# Feeds it to the model

# Gets output like:
# [0.01, 0.92, 0.03, 0.01, 0.01, 0.02] → 'joy'   (index 1 in the label order sadness, joy, love, anger, fear, surprise)

# 🔮 Final Output Example
# predict_emotion("I am so happy and excited today!")

# 📝 Text: I am so happy and excited today!
# 🎭 Predicted Emotion: joy (Confidence: 0.93)


SyntaxError: invalid syntax (2473639437.py, line 1)

In [None]:
# | Layer No. | Layer Type              | Description                                                                              |
# | --------- | ----------------------- | ---------------------------------------------------------------------------------------- |
# | 1️⃣       | **Embedding**           | Maps each word (by index) to a 100D GloVe vector.                                        |
# | 2️⃣       | **LSTM(64)**            | Processes the word vector sequence to capture **temporal context** (word order, memory). |
# | 3️⃣       | **Dense(64)**           | Fully connected layer to learn complex features from LSTM output.                        |
# | 4️⃣       | **Dense(num\_classes)** | Final layer to predict probability for each emotion class using **softmax**.             |
