In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

# ✅ Load dataset
file_path = "enhanced_event_reviews.csv"
df = pd.read_csv(file_path)

# ✅ Preprocessing Function
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic chars
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
        return text
    return ""

# ✅ Apply Preprocessing
df['clean_review'] = df['review'].apply(clean_text)

# ✅ Map sentiment labels to integers
sentiment_map = {'positive': 2, 'neutral': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(sentiment_map)

# ✅ Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'], df['sentiment'], test_size=0.2, random_state=42
)

# ✅ Tokenization and Padding
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 150
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)

# ✅ Load GloVe Embeddings
glove_file = "glove.6B.100d.txt"
embedding_index = {}

with open(glove_file, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs


word_index = tokenizer.word_index
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))  # 3 classes

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# ✅ Model Training
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_pad, y_test),
    callbacks=[early_stopping],
    verbose=1
)


loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f" Accuracy with GloVe: {accuracy:.4f}")


Epoch 1/20




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 179ms/step - accuracy: 0.4038 - loss: 1.0898 - val_accuracy: 0.5200 - val_loss: 0.9014
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 123ms/step - accuracy: 0.6291 - loss: 0.8318 - val_accuracy: 0.6350 - val_loss: 0.5546
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 125ms/step - accuracy: 0.6469 - loss: 0.5819 - val_accuracy: 0.6600 - val_loss: 0.4984
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 121ms/step - accuracy: 0.6977 - loss: 0.4918 - val_accuracy: 0.8400 - val_loss: 0.4216
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 129ms/step - accuracy: 0.7619 - loss: 0.4507 - val_accuracy: 0.8450 - val_loss: 0.3137
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 121ms/step - accuracy: 0.8056 - loss: 0.3765 - val_accuracy: 0.9950 - val_loss: 0.1044
Epoch 7/20
[1m25/25[0m [32m━━━━━━━━

In [2]:
import numpy as np

# ✅ Make Predictions
y_pred_probs = model.predict(X_test_pad)
y_pred = np.argmax(y_pred_probs, axis=1)  # Get the class with highest probability

# ✅ Map Sentiment Labels
reverse_sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# ✅ Display Predictions with Actual Labels
print("\n🔥 Test Set Predictions:")
for i in range(10):  # Display first 10 predictions
    print(f"Review: {X_test.iloc[i]}")
    print(f"Actual: {reverse_sentiment_map[y_test.iloc[i]]}")
    print(f"Predicted: {reverse_sentiment_map[y_pred[i]]}")
    print("-" * 50)

# ✅ Calculate Accuracy on Test Set
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print(f"\n🔥 Test Accuracy: {accuracy:.4f}")

# ✅ Classification Report
print("\n🔥 Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step

🔥 Test Set Predictions:
Review: it was just an ordinary experience the experience was frustrating due to poor organization
Actual: neutral
Predicted: neutral
--------------------------------------------------
Review: the event was poorly organized and chaotic the logistics were a mess which made the experience unpleasant
Actual: negative
Predicted: negative
--------------------------------------------------
Review: the decor was simple and clean nothing fancy i wouldnt recommend attending because of the lack of proper planning
Actual: neutral
Predicted: neutral
--------------------------------------------------
Review: great networking opportunities and friendly staff the entertainment and speakers were topnotch making the event worthwhile
Actual: positive
Predicted: positive
--------------------------------------------------
Review: the app for the event kept crashing the experience was frustrating due to poor o

In [3]:
# ✅ New Reviews
new_reviews = [
    "The waiter molested me in the washroom",
    "The waiter kissed me passionately in the washroom",
    "The experience was vey good"
]

# ✅ Preprocess New Reviews
new_reviews_clean = [clean_text(review) for review in new_reviews]

# ✅ Tokenize and Pad
new_reviews_seq = tokenizer.texts_to_sequences(new_reviews_clean)
new_reviews_pad = pad_sequences(new_reviews_seq, maxlen=MAX_SEQUENCE_LENGTH)

# ✅ Make Predictions
new_preds_probs = model.predict(new_reviews_pad)
new_preds = np.argmax(new_preds_probs, axis=1)

# ✅ Map Sentiment Labels
print("\n New Review Predictions:")
for i, review in enumerate(new_reviews):
    sentiment = reverse_sentiment_map[new_preds[i]]
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}")
    print("-" * 50)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step

 New Review Predictions:
Review: The waiter molested me in the washroom
Predicted Sentiment: negative
--------------------------------------------------
Review: The waiter kissed me passionately in the washroom
Predicted Sentiment: negative
--------------------------------------------------
Review: The experience was vey good
Predicted Sentiment: positive
--------------------------------------------------


In [9]:
tokenizer.word_index

{'the': 1,
 'was': 2,
 'and': 3,
 'but': 4,
 'event': 5,
 'not': 6,
 'very': 7,
 'sessions': 8,
 'experience': 9,
 'were': 10,
 'no': 11,
 'food': 12,
 'a': 13,
 'process': 14,
 'lacked': 15,
 'quality': 16,
 'great': 17,
 'speakers': 18,
 'waiting': 19,
 'boring': 20,
 'nothing': 21,
 'crowd': 22,
 'proper': 23,
 'seating': 24,
 'arrangement': 25,
 'uncomfortable': 26,
 'absolutely': 27,
 'fantastic': 28,
 'registration': 29,
 'mess': 30,
 'app': 31,
 'for': 32,
 'kept': 33,
 'crashing': 34,
 'highquality': 35,
 'workshops': 36,
 'interactive': 37,
 'excellent': 38,
 'beverage': 39,
 'service': 40,
 'subpar': 41,
 'disappointing': 42,
 'panel': 43,
 'discussion': 44,
 'okay': 45,
 'memorable': 46,
 'decent': 47,
 'engagement': 48,
 'average': 49,
 'bad': 50,
 'keynote': 51,
 'speech': 52,
 'inspiring': 53,
 'perfect': 54,
 'venue': 55,
 'with': 56,
 'ample': 57,
 'space': 58,
 'facilities': 59,
 'uninspiring': 60,
 'schedule': 61,
 'punctual': 62,
 'energy': 63,
 'session': 64,
 'info

In [4]:
model.save('sentiment_model_enhanced.h5')



In [5]:


import pickle

# Save the tokenizer
with open('tokenizer_enhanced.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
