## Objective:
#### Develop a deep learning model using LSTM architecture to classify sequential/text data into two classes, focusing on achieving robust performance and interpretability using neural network techniques.

### Importing Required Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

### Dataset

In [4]:
df=pd.read_csv("C:/Users/ASUS/Downloads/Datasets/IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### Preprocessing

In [6]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [7]:
stemmer = PorterStemmer()

def stem_text(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [8]:
def clean_and_preprocess(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = remove_stopwords(text)
    text = stem_text(text)
    return text

In [9]:
df['cleaned_review'] = df['review'].apply(clean_and_preprocess)
df

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod youll hook ...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movi right good job wasnt creativ orig...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogu bad act idiot direct anno...
49997,I am a Catholic taught in parochial elementary...,negative,cathol taught parochi elementari school nun ta...
49998,I'm going to have to disagree with the previou...,negative,im go disagre previou comment side maltin one ...


### Tokenization & Padding 

In [11]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_review'])

X = tokenizer.texts_to_sequences(df['cleaned_review'])
X = pad_sequences(X, maxlen=200, padding='post')

###  Label Encoding

In [13]:
y = LabelEncoder().fit_transform(df['sentiment']) 

### Train-Test Split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)

### LSTM Model

In [17]:
model = Sequential([
    Embedding(input_dim=50000, output_dim=64),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

### Train the Model

In [19]:
history = model.fit(
    X_train, y_train,
    epochs=15,             
    batch_size=128,
    validation_split=0.3,
    verbose=1)

Epoch 1/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 179ms/step - accuracy: 0.5091 - loss: 0.6944 - val_accuracy: 0.4947 - val_loss: 0.6947
Epoch 2/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 176ms/step - accuracy: 0.5245 - loss: 0.6921 - val_accuracy: 0.5142 - val_loss: 0.6877
Epoch 3/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 175ms/step - accuracy: 0.5457 - loss: 0.6760 - val_accuracy: 0.5437 - val_loss: 0.6646
Epoch 4/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 177ms/step - accuracy: 0.5595 - loss: 0.6300 - val_accuracy: 0.5427 - val_loss: 0.6799
Epoch 5/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 194ms/step - accuracy: 0.6158 - loss: 0.5832 - val_accuracy: 0.8099 - val_loss: 0.4624
Epoch 6/15
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 209ms/step - accuracy: 0.8652 - loss: 0.3683 - val_accuracy: 0.8596 - val_loss: 0.3689
Epoch 7/15

### Evaluate the Model

In [21]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.8550 - loss: 0.6402
Test Accuracy: 0.85


## Summary of Results:
#### - Model Architecture: Embedding → LSTM → Dropout → Dense layers
#### - Dataset Size: ~50,000 samples
#### - Data Split: 70% training, 30% testing
#### - Training Setup: 15 epochs, batch size = 128, validation_split = 0.3
#### - Best Validation Accuracy: ~86.9%
#### - Final Test Accuracy: **85.5%**
#### - Model showed strong generalization and consistent convergence across epochs