### Importing Required Libraries

In [18]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

### Dataset

In [20]:
df=pd.read_csv("C:/Users/ASUS/Downloads/Datasets/binary_class.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### Data Preprocessing

In [22]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)
stemmer = PorterStemmer()

def stem_text(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)
def clean_and_preprocess(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = remove_stopwords(text)
    text = stem_text(text)
    return text
df['cleaned_review'] = df['review'].apply(clean_and_preprocess)
df

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod youll hook ...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movi right good job wasnt creativ orig...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogu bad act idiot direct anno...
49997,I am a Catholic taught in parochial elementary...,negative,cathol taught parochi elementari school nun ta...
49998,I'm going to have to disagree with the previou...,negative,im go disagre previou comment side maltin one ...


### Tokenization & Padding 

In [24]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_review'])

sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
sequences = [[token if token < 10000 else 1 for token in seq] for seq in sequences]

X = pad_sequences(sequences, maxlen=200, padding='post')

###  Label Encoding

In [26]:
y = LabelEncoder().fit_transform(df['sentiment']) 

### Train-Test Split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)

### LSTM Model

In [35]:
model = Sequential([
    Embedding(input_dim=50000, output_dim=64),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Train the Model

In [37]:
history = model.fit(
    X_train, y_train,
    epochs=15,             
    batch_size=128,
    validation_split=0.3,
    verbose=1)

Epoch 1/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 204ms/step - accuracy: 0.5118 - loss: 0.6920 - val_accuracy: 0.5230 - val_loss: 0.6878
Epoch 2/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 218ms/step - accuracy: 0.5448 - loss: 0.6745 - val_accuracy: 0.5453 - val_loss: 0.6641
Epoch 3/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 211ms/step - accuracy: 0.5533 - loss: 0.6379 - val_accuracy: 0.5323 - val_loss: 0.6681
Epoch 4/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 203ms/step - accuracy: 0.5666 - loss: 0.6138 - val_accuracy: 0.5410 - val_loss: 0.6865
Epoch 5/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 208ms/step - accuracy: 0.6893 - loss: 0.5394 - val_accuracy: 0.8198 - val_loss: 0.4478
Epoch 6/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 206ms/step - accuracy: 0.8729 - loss: 0.3327 - val_accuracy: 0.8653 - val_loss: 0.3352
Epoch 7/15

### Evaluate the Model

In [38]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 29ms/step - accuracy: 0.8611 - loss: 0.7094
Test Accuracy: 0.86
