# Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Loading The Dataset

In [2]:
data = pd.read_csv(r"C:\Users\abhis\Downloads\IMDB-Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data.shape

(50000, 2)

# Text Preprocessing

In [4]:
# Encode sentiment (positive -> 1, negative -> 0)
data.sentiment.replace({'positive': 1, 'negative': 0}, inplace=True)

In [5]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


## Removing HTML Tags

In [7]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)
data['review'] = data['review'].apply(remove_html_tags)

## Removing Special Characters

In [8]:
def remove_special_chars(text):
    return re.sub(r'[^a-zA-Z\s]', '', text)
data['review'] = data['review'].apply(remove_special_chars)

## Converting to Lower Case

In [9]:
def convert_to_lowercase(text):
    return text.lower()
data['review'] = data['review'].apply(convert_to_lowercase)

## Removing Stop Words

In [10]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)
data['review'] = data['review'].apply(remove_stopwords)

## Stemming

In [11]:
def apply_stemming(text):
    stemmer = SnowballStemmer('english')
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)
data['review'] = data['review'].apply(apply_stemming)

In [12]:
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch oz episod youll hook ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


## Tokenization

In [13]:
# Tokenization
tokenizer = Tokenizer(num_words=10000)  # Limit to 10,000 most frequent words
tokenizer.fit_on_texts(data['review'])
sequences = tokenizer.texts_to_sequences(data['review'])

In [14]:
# Padding sequences
X = pad_sequences(sequences, maxlen=200)  # Ensure all sequences are 200 words long
y = data['sentiment'].values

# Model Building

In [15]:
# Train-test split
trainx, testx, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train shapes: X={trainx.shape}, y={trainy.shape}")
print(f"Test shapes: X={testx.shape}, y={testy.shape}")

Train shapes: X=(40000, 200), y=(40000,)
Test shapes: X=(10000, 200), y=(10000,)


In [16]:
# Define LSTM model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=200),  # Embedding layer
    LSTM(128, return_sequences=True),  # First LSTM layer
    Dropout(0.2),  # Dropout for regularization
    LSTM(64),  # Second LSTM layer
    Dropout(0.2),  # Dropout for regularization
    Dense(1, activation='sigmoid')  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])





In [17]:
# Train the model
history = model.fit(trainx, trainy, epochs=5, batch_size=64, validation_data=(testx, testy))

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 257ms/step - accuracy: 0.7852 - loss: 0.4363 - val_accuracy: 0.8809 - val_loss: 0.2835
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 261ms/step - accuracy: 0.9095 - loss: 0.2347 - val_accuracy: 0.8773 - val_loss: 0.3038
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 264ms/step - accuracy: 0.9355 - loss: 0.1710 - val_accuracy: 0.8814 - val_loss: 0.3095
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 270ms/step - accuracy: 0.9571 - loss: 0.1233 - val_accuracy: 0.8777 - val_loss: 0.3455
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 271ms/step - accuracy: 0.9733 - loss: 0.0799 - val_accuracy: 0.8707 - val_loss: 0.4201


# Model Evaluation

In [18]:
# Evaluate on test data
loss, accuracy = model.evaluate(testx, testy)
print(f"Test Accuracy: {accuracy:.4f}")

# Classification report
y_pred = (model.predict(testx) > 0.5).astype("int32")
print(classification_report(testy, y_pred, target_names=['Negative', 'Positive']))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 53ms/step - accuracy: 0.8735 - loss: 0.4102
Test Accuracy: 0.8707
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 54ms/step
              precision    recall  f1-score   support

    Negative       0.90      0.83      0.86      4961
    Positive       0.85      0.91      0.88      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



# Saving the Model

In [19]:
import pickle

# Save the model
model.save('lstm_sentiment_model.h5')

# Save the tokenizer
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)



# Testing on New Data

In [20]:
# Load saved model and tokenizer
from tensorflow.keras.models import load_model

model = load_model('lstm_sentiment_model.h5')
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Preprocess and predict sentiment for a new review
def preprocess_review(review):
    review = remove_html_tags(review)
    review = remove_special_chars(review)
    review = convert_to_lowercase(review)
    review = remove_stopwords(review)
    review = apply_stemming(review)
    return review

new_review = """Terrible. Complete trash. Brainless tripe. Insulting to anyone who isn't an 8 year old fan boy."""
cleaned_review = preprocess_review(new_review)
sequence = tokenizer.texts_to_sequences([cleaned_review])
padded_sequence = pad_sequences(sequence, maxlen=200)

# Prediction
prediction = model.predict(padded_sequence)
print("Sentiment:", "Positive" if prediction > 0.5 else "Negative")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297ms/step
Sentiment: Negative
