# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from textblob import TextBlob

# Download necessary NLTK data


In [None]:
# Fetch the NLTK resources used later in the notebook: the 'stopwords' corpus
# (read via stopwords.words('english')) and 'wordnet' (presumably the data
# WordNetLemmatizer looks words up in -- confirm against the NLTK docs).
# The second call is the cell's last expression, so its return value is displayed.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load built-in IMDB dataset from TensorFlow

In [None]:
# The IMDB reviews arrive as sequences of integer word ids (they are decoded
# back to words and padded in later cells). num_words=5000 is the vocabulary
# cap; the same 5000 is used as the Embedding input_dim further down.
imdb = tf.keras.datasets.imdb
train_split, test_split = imdb.load_data(num_words=5000)
X_train, y_train = train_split
X_test, y_test = test_split

# Word Index


In [None]:
# Vocabulary lookup for the dataset: a dict whose keys are words and whose
# values are integer ids (it is inverted into id -> word in a later cell).
word_index = imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


# Preprocessing: Decode the reviews back into text (for TextBlob)

In [None]:
# Flip the vocabulary (word -> id) into an id -> word lookup used for decoding.
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

In [None]:
def decode_review(text):
    """Turn one encoded review (an iterable of integer word ids) into a string.

    Every id is shifted down by 3 before being looked up in
    ``reverse_word_index``; an id with no entry after the shift is rendered
    as '?'. The resulting words are joined with single spaces.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id - 3, '?'))
    return ' '.join(words)

# Text-cleaning helper (stop-word removal and lemmatization)

In [None]:
# Resources shared by preprocess_text, built once at cell level.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Clean a raw text string and return it as space-separated lemmas.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop tokens found in ``stop_words``, lemmatize the
    remaining tokens, and join them back together with single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Decode X_train and X_test back into text for TextBlob

In [None]:
# Rebuild readable text for every review by reusing decode_review instead of
# repeating its lookup inline. This must run while X_train / X_test still hold
# the full, unpadded id sequences, i.e. before the padding cell rebinds them.
X_train_text = [decode_review(review) for review in X_train]
X_test_text = [decode_review(review) for review in X_test]

# Padding the sequences for LSTM model

In [None]:
# Bring every review to exactly 100 token ids so the LSTM gets fixed-length input.
# Note: this rebinds X_train / X_test to the padded arrays.
X_train, X_test = [pad_sequences(split, maxlen=100) for split in (X_train, X_test)]

# LSTM Model

In [None]:
# Baseline classifier: embedding -> single LSTM -> one sigmoid unit.
model = Sequential()
# input_dim=5000 matches the num_words vocabulary cap used when loading the data.
# `input_length` is deliberately not passed: the argument is deprecated in
# Keras 3 (it only triggers a warning) and the sequence length is taken from
# the data at fit time.
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output for the 0/1 label.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



# Train Model

In [None]:

# Validate on a 20% hold-out of the training data rather than on the test set:
# early stopping chooses the stopping point from the validation loss, so
# validating on (X_test, y_test) would leak test information into the model
# that is scored on that same data afterwards.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss instead of keeping whatever the final epoch produced.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)],
)

Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 514ms/step - accuracy: 0.7074 - loss: 0.5449 - val_accuracy: 0.8045 - val_loss: 0.4307
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 419ms/step - accuracy: 0.8511 - loss: 0.3480 - val_accuracy: 0.8343 - val_loss: 0.3865
Epoch 3/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 357ms/step - accuracy: 0.8766 - loss: 0.3009 - val_accuracy: 0.8327 - val_loss: 0.3980
Epoch 4/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 358ms/step - accuracy: 0.8871 - loss: 0.2690 - val_accuracy: 0.8439 - val_loss: 0.3615
Epoch 5/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 342ms/step - accuracy: 0.9040 - loss: 0.2364 - val_accuracy: 0.8437 - val_loss: 0.3991


# Evaluate Model

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels, then report
# per-class precision / recall / F1 against the true test labels.
lstm_scores = model.predict(X_test)
y_pred = (lstm_scores > 0.5).astype("int32")
print("LSTM Classification Report:", classification_report(y_test, y_pred), sep="\n")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 60ms/step
LSTM Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85     12500
           1       0.86      0.82      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



# LSTM Hyperparameter Tuning (changing dropout rates and layers)

In [None]:
# Tuned variant: two stacked LSTM layers with standalone Dropout layers
# between them, instead of the baseline's single LSTM with built-in dropout.
model_tuned = Sequential()
# `input_length` is deliberately not passed: the argument is deprecated in
# Keras 3 and the sequence length is taken from the data at fit time.
model_tuned.add(Embedding(input_dim=5000, output_dim=128))
# return_sequences=True makes this layer emit its output at every timestep,
# which the second LSTM needs as its input sequence.
model_tuned.add(LSTM(units=128, return_sequences=True))
model_tuned.add(Dropout(0.4))
model_tuned.add(LSTM(units=64))
model_tuned.add(Dropout(0.3))
model_tuned.add(Dense(1, activation='sigmoid'))

model_tuned.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Validate on a 20% hold-out of the training data rather than on the test set,
# so that early stopping does not leak test information into the model that is
# evaluated on that same data afterwards. restore_best_weights=True rolls the
# model back to the epoch with the lowest val_loss once training ends.
history_tuned = model_tuned.fit(
    X_train,
    y_train,
    epochs=7,
    batch_size=64,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)],
)

Epoch 1/7




[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 608ms/step - accuracy: 0.7151 - loss: 0.5278 - val_accuracy: 0.8434 - val_loss: 0.3548
Epoch 2/7
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 581ms/step - accuracy: 0.8779 - loss: 0.2986 - val_accuracy: 0.8489 - val_loss: 0.3431
Epoch 3/7
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 496ms/step - accuracy: 0.9054 - loss: 0.2398 - val_accuracy: 0.8403 - val_loss: 0.3755
Epoch 4/7
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 586ms/step - accuracy: 0.9226 - loss: 0.1932 - val_accuracy: 0.8458 - val_loss: 0.4137


# Evaluate Tuned Model

In [None]:
# Same evaluation as for the baseline: threshold the sigmoid outputs at 0.5
# and report per-class metrics against the true test labels.
tuned_scores = model_tuned.predict(X_test)
y_pred_tuned = (tuned_scores > 0.5).astype("int32")
print("Tuned LSTM Classification Report:", classification_report(y_test, y_pred_tuned), sep="\n")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 82ms/step
Tuned LSTM Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85     12500
           1       0.87      0.82      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



# Compare LSTM with TextBlob

In [None]:
def textblob_sentiment(text):
    """Label a review 1 or 0 from its TextBlob polarity score.

    Returns 1 when the polarity is greater than or equal to zero (so a score
    of exactly 0 counts as positive) and 0 otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity >= 0:
        return 1
    return 0

# Score the decoded test reviews with TextBlob and compare against the same
# y_test labels used for the LSTM reports.
df_test = pd.DataFrame({'review': X_test_text})
df_test['textblob_prediction'] = df_test['review'].apply(textblob_sentiment)

print("TextBlob Classification Report:", classification_report(y_test, df_test['textblob_prediction']), sep="\n")

TextBlob Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.43      0.58     12500
           1       0.63      0.95      0.76     12500

    accuracy                           0.69     25000
   macro avg       0.76      0.69      0.67     25000
weighted avg       0.76      0.69      0.67     25000



### Applications of Sentiment Analysis:
1. **Product and service reviews**: Monitoring customer reviews and feedback.
2. **Market research**: Analyzing trends and public opinion on products or events.
3. **Social media monitoring**: Detecting public sentiment on platforms like Twitter.
4. **Customer service**: Understanding customer satisfaction and areas for improvement.

### Challenges in Sentiment Analysis:
1. **Sarcasm and irony**: Hard for models to detect such subtleties.
2. **Contextual interpretation**: Words may have different meanings depending on context.
3. **Ambiguity in neutral statements**: Hard to classify whether neutral statements contain any sentiment.
4. **Language nuances**: Variations in slang, idioms, and dialects can lead to inaccuracies.