In [15]:
!pip install gradio



## **IMPORTING LIBRARIES**

In [16]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pickle
import gradio as gr
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [24]:
# Download stopwords and punkt for more refined tokenization
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [25]:

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/Datasets/IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [26]:
from nltk.stem import PorterStemmer

# Initialize the PorterStemmer
stemmer = PorterStemmer()

def clean_text(text):
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    words = nltk.word_tokenize(text.lower())
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    # Apply stemming
    words = [stemmer.stem(word) for word in words]
    # Join words back into a string and strip extra spaces
    return ' '.join(words).strip()


In [27]:
# Apply preprocessing to reviews
df['cleaned_review'] = df['review'].apply(clean_text)

In [28]:
# Tokenization and padding sequences
tokenizer = Tokenizer(num_words=20000,oov_token="<oov>")  # Increase vocabulary size
tokenizer.fit_on_texts(df['cleaned_review'])
sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
max_len = 250  # Increased max length for better context
X = pad_sequences(sequences, maxlen=max_len)

In [29]:
# Encode labels
y = np.where(df['sentiment'] == 'positive', 1, 0)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Save tokenizer for future use
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("Improved preprocessing and tokenization complete.")

Improved preprocessing and tokenization complete.


In [33]:
# Build the LSTM model without GloVe
model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=128, input_length=max_len))  # Trainable embedding layer
model.add(Bidirectional(LSTM(128, return_sequences=True)))  # Bidirectional LSTM for context understanding
model.add(Dropout(0.5))  # Regularization to prevent overfitting
model.add(Bidirectional(LSTM(64)))  # Additional LSTM layer
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))  # Fully connected layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification (positive/negative)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Save the trained model
model.save('lstm_sentiment_model.h5')

print("Model training complete. Model saved.")


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 60ms/step - accuracy: 0.7693 - loss: 0.4612 - val_accuracy: 0.8695 - val_loss: 0.3169
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 58ms/step - accuracy: 0.9209 - loss: 0.2164 - val_accuracy: 0.8849 - val_loss: 0.3220
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 58ms/step - accuracy: 0.9471 - loss: 0.1463 - val_accuracy: 0.8860 - val_loss: 0.3019
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 58ms/step - accuracy: 0.9689 - loss: 0.0952 - val_accuracy: 0.8712 - val_loss: 0.4007
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 55ms/step - accuracy: 0.9757 - loss: 0.0751 - val_accuracy: 0.8806 - val_loss: 0.4054
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 54ms/step - accuracy: 0.9833 - loss: 0.0529 - val_accuracy: 0.8735 - val_loss: 0.4626




Model training complete. Model saved.


In [34]:
# Load the saved model and tokenizer
model = load_model('lstm_sentiment_model.h5')
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Function for sentiment prediction
def predict_sentiment(review):
    review = clean_text(review)
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded_sequence)
    sentiment = "Positive" if prediction >= 0.5 else "Negative"
    return sentiment

# Gradio interface
def gradio_interface(review):
    sentiment = predict_sentiment(review)
    return f"Sentiment: {sentiment}"


# Create Gradio interface
gr_interface = gr.Interface(fn=gradio_interface, inputs="text", outputs="text", title="IMDb Review Sentiment Analysis")

gr_interface.launch()




It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8d7d2321dce0858c29.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [35]:
predict_sentiment("This movie was fantastic!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step


'Positive'

In [36]:
predict_sentiment("This movie was terrible!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


'Negative'