In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from joblib import dump

In [2]:
df = pd.read_csv(r"D:\BIA\Project 2 - Twitter Sentiment Analysis\Twitter_Data.csv", encoding="utf-8")
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [3]:
ds = df[['text', 'sentiment']].dropna()
ds.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [4]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^A-Za-z\s]", "", text)
    text = text.lower().strip()
    return text

ds['clean_text'] = ds['text'].astype(str).apply(clean_text)

In [5]:
ds.head()

Unnamed: 0,text,sentiment,clean_text
0,"I`d have responded, if I were going",neutral,id have responded if i were going
1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad i will miss you here in san diego
2,my boss is bullying me...,negative,my boss is bullying me
3,what interview! leave me alone,negative,what interview leave me alone
4,"Sons of ****, why couldn`t they put them on t...",negative,sons of why couldnt they put them on the rele...


In [6]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(ds['sentiment'])
print("Label order:", label_encoder.classes_)   # very important

Label order: ['negative' 'neutral' 'positive']


In [7]:
y_categorical = tf.keras.utils.to_categorical(y_encoded)

In [8]:
vocab_size = 20000
max_len = 50

In [9]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(ds['clean_text'])
X_seq = tokenizer.texts_to_sequences(ds['clean_text'])
X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post', truncating='post')

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_pad, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

In [11]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_encoded),
    y=y_encoded
)

In [12]:
class_weights = dict(enumerate(class_weights))
print("Class Weights:", class_weights)

Class Weights: {0: np.float64(1.177226577560725), 1: np.float64(0.8239632994512908), 2: np.float64(1.0673502680027966)}


In [15]:
embeddings_index = {}
with open("glove.6B.50d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [17]:
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [18]:
model = Sequential([
    Embedding(vocab_size, 50, weights=[embedding_matrix],
              input_length=max_len, trainable=False),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(y_categorical.shape[1], activation='softmax')
])



In [19]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [20]:
history = model.fit(
    X_train, y_train,
    epochs=12,
    batch_size=64,
    validation_split=0.2,
    class_weight=class_weights,
    verbose=1
)

Epoch 1/12
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 57ms/step - accuracy: 0.5323 - loss: 0.9475 - val_accuracy: 0.5981 - val_loss: 0.8716
Epoch 2/12
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 63ms/step - accuracy: 0.6097 - loss: 0.8463 - val_accuracy: 0.6118 - val_loss: 0.8380
Epoch 3/12
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 66ms/step - accuracy: 0.6393 - loss: 0.7988 - val_accuracy: 0.6498 - val_loss: 0.7970
Epoch 4/12
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 67ms/step - accuracy: 0.6530 - loss: 0.7719 - val_accuracy: 0.6575 - val_loss: 0.7730
Epoch 5/12
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 67ms/step - accuracy: 0.6687 - loss: 0.7481 - val_accuracy: 0.6609 - val_loss: 0.7615
Epoch 6/12
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 68ms/step - accuracy: 0.6793 - loss: 0.7285 - val_accuracy: 0.6639 - val_loss: 0.7786
Epoch 7/12
[1m2

In [21]:
loss, acc = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {acc*100:.2f}%")

[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.6930 - loss: 0.7374
Test Accuracy: 69.30%


In [22]:
model.save("sentiment_bilstm_glov.h5")
dump(tokenizer, "sentiment_tokenizer_glov.joblib")

print("✅ Model and tokenizer saved successfully!")
print("Use CLASS_LABELS =", list(label_encoder.classes_), "in your app.py file.")



✅ Model and tokenizer saved successfully!
Use CLASS_LABELS = ['negative', 'neutral', 'positive'] in your app.py file.
