In [2]:
# ===== 1) Import Libraries =====
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

from sklearn.metrics import classification_report, confusion_matrix

# Fetch every NLTK resource this notebook reads so it also runs on a fresh
# environment: `stopwords` backs stopwords.words() just below, and
# nltk.word_tokenize() (used by the cleaning function) needs `punkt`
# (`punkt_tab` on newer NLTK releases). Packages that are already installed
# are reported as up-to-date and not downloaded again.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# English stop words as a set for O(1) membership tests during cleaning.
stop_words = set(stopwords.words("english"))
stemmer = LancasterStemmer()

# ===== 2) Load Data =====
# Local copies of the training / validation CSVs. Adjust these two paths
# when running on another machine.
TRAIN_CSV = r"D:\Nlp\twitter_training.csv"
VALID_CSV = r"D:\Nlp\twitter_validation.csv"

# Every column name is supplied here rather than taken from the files.
COLUMN_NAMES = ["Id", "Entity", "Emotions", "Comments"]

# header=None keeps the first line of each file as data. With the default
# header handling pandas would consume that first record as column labels,
# and renaming the columns afterwards would silently discard it.
# NOTE(review): assumes both CSVs ship without a header row - confirm; if
# they do carry one, use header=0 together with names=COLUMN_NAMES instead.
train = pd.read_csv(TRAIN_CSV, header=None, names=COLUMN_NAMES)
valid = pd.read_csv(VALID_CSV, header=None, names=COLUMN_NAMES)

# ===== 3) Text Cleaning Function =====
def Notify(text):
    """Normalise one raw comment into a space-joined string of stemmed tokens.

    Steps, in order:
      1. Anything that is not a ``str`` (e.g. a missing value) yields ``""``.
      2. The text is lower-cased.
      3. Whitespace-delimited runs starting with ``@``, ``http`` or ``.pic``
         are replaced by a space.
      4. Every character that is neither an ASCII letter nor whitespace is
         replaced by a space.
      5. The result is split with ``nltk.word_tokenize``.
      6. Tokens of two characters or fewer, and tokens found in the
         module-level ``stop_words`` set, are dropped.
      7. Surviving tokens are stemmed with the module-level ``stemmer``.

    Args:
        text: The raw comment; may be a non-string value.

    Returns:
        The cleaned tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_refs = re.sub(r'@\S+|http\S+|\.pic\S+', ' ', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', without_refs)

    stems = []
    for token in nltk.word_tokenize(letters_only):
        # Skip very short tokens and stop words before stemming.
        if len(token) <= 2 or token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Clean the raw "Comments" column of both splits with Notify and keep the
# result in a new "NotifyComment" column (the raw text is left untouched).
train["NotifyComment"], valid["NotifyComment"] = (
    split["Comments"].apply(Notify) for split in (train, valid)
)

# ===== 4) Label Encoding =====
# The label -> integer mapping is learned from the training labels only and
# then applied to both splits; the "Emotions" column is overwritten in place
# with the integer codes. `le.classes_` keeps the original label values.
le = LabelEncoder().fit(train["Emotions"])
train["Emotions"] = le.transform(train["Emotions"])
valid["Emotions"] = le.transform(valid["Emotions"])

# ===== 5) Text to Sequences for LSTM =====
MAX_WORDS = 10000  # vocabulary cap given to the Tokenizer (also the Embedding input_dim)
MAX_LEN = 100      # length passed to pad_sequences for every sample

# The vocabulary is fitted on the training text only.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train["NotifyComment"])

# Cleaned text -> lists of integer word ids, for both splits.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split["NotifyComment"]) for split in (train, valid)
)

# Bring every id list to the common length MAX_LEN.
X_train_pad, X_test_pad = (
    pad_sequences(id_lists, maxlen=MAX_LEN) for id_lists in (X_train_seq, X_test_seq)
)

# Integer-encoded targets; the "test" data is the validation split.
y_train, y_test = train["Emotions"], valid["Emotions"]

# ===== 6) Build LSTM Model =====
# Architecture: Embedding -> one LSTM layer -> softmax over the label classes.
#
# `input_length` is deliberately not passed to Embedding: Keras 3 treats the
# argument as deprecated and ignores it with a warning. The sequence length
# is supplied through an explicit build() call instead, so that
# model.summary() can report concrete output shapes and parameter counts.
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per class known to the fitted label encoder.
    Dense(len(le.classes_), activation='softmax'),
])
model.build(input_shape=(None, MAX_LEN))

# The sparse loss matches the integer (not one-hot) targets produced by the
# label encoder.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# ===== 7) Train Model =====
# The validation split is passed as validation_data, so per-epoch val_*
# metrics are computed on the same data that is evaluated afterwards.
history = model.fit(
    X_train_pad,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

# ===== 8) Evaluate Model =====
# Class probabilities for every validation sample; the predicted class is the
# index of the largest probability.
y_pred = model.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=1)

# Report rows are labelled with the original label values instead of the
# bare integer codes. Passing `labels` explicitly pins the row/column order
# to the encoder's class order and keeps it aligned with `target_names` even
# if some class never occurs in this split.
class_ids = list(range(len(le.classes_)))
class_names = [str(name) for name in le.classes_]

print("=== Classification Report ===")
print(classification_report(y_test, y_pred_classes, labels=class_ids, target_names=class_names))

# Rows are true classes and columns are predicted classes, both in the
# order of `class_names`.
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred_classes, labels=class_ids))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\E_Magic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/5
[1m1167/1167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 139ms/step - accuracy: 0.6352 - loss: 0.8992 - val_accuracy: 0.8248 - val_loss: 0.4867
Epoch 2/5
[1m1167/1167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 143ms/step - accuracy: 0.7828 - loss: 0.5629 - val_accuracy: 0.8829 - val_loss: 0.3546
Epoch 3/5
[1m1167/1167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 147ms/step - accuracy: 0.8252 - loss: 0.4484 - val_accuracy: 0.8979 - val_loss: 0.3211
Epoch 4/5
[1m1167/1167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 141ms/step - accuracy: 0.8510 - loss: 0.3809 - val_accuracy: 0.8979 - val_loss: 0.3244
Epoch 5/5
[1m1167/1167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 140ms/step - accuracy: 0.8673 - loss: 0.3362 - val_accuracy: 0.9219 - val_loss: 0.3047
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step
=== Classification Report ===
              precision    recall  f1-score   support

      

In [4]:
# Persist the trained model to "lstm_model.keras" (relative to the current
# working directory).
model.save(filepath="lstm_model.keras")


In [5]:
import pickle

# Persist the fitted tokenizer so the same word -> id mapping can be reused
# outside this session. NOTE: a later cell writes the same "tokenizer.pkl"
# again (together with the label encoder), so this cell is redundant.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [8]:
import pickle

# Persist both fitted preprocessing objects: the tokenizer (word -> id
# mapping) and the label encoder (label -> integer mapping). Each is pickled
# to its own file in the current working directory.
for artifact_path, artifact in (("tokenizer.pkl", tokenizer), ("labelencoder.pkl", le)):
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

