# **Importing Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tensorflow as tf
import nltk
plt.style.use('ggplot')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# **Importing Data from Kaggle**

In [2]:
# Installing the Kaggle API client
!pip install -q kaggle  # Install the Kaggle package quietly

In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Downloading a specific dataset (Amazon reviews) from Kaggle
!kaggle datasets download bittlingmayer/amazonreviews  # Download Amazon reviews dataset from Kaggle

Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown


In [6]:
# Unzipping the Amazon reviews dataset ZIP file
! unzip amazonreviews.zip

Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


# **Data preparation**


### Train Data

In [7]:
# Reading the bzip2-compressed training file as a pandas DataFrame.
# With a tab delimiter and no header, each line ends up whole in a single column
# named 0 (see the preview below): "__label__<n> <title>: <review text>".
# NOTE(review): assumes the reviews themselves contain no tab characters - confirm.
train_data = pd.read_csv('/content/train.ft.txt.bz2', compression='bz2', delimiter='\t', header=None)

# Displaying the first few rows of the DataFrame to inspect its structure
train_data.head()

Unnamed: 0,0
0,__label__2 Stuning even for the non-gamer: Thi...
1,__label__2 The best soundtrack ever to anythin...
2,__label__2 Amazing!: This soundtrack is my fav...
3,__label__2 Excellent Soundtrack: I truly like ...
4,"__label__2 Remember, Pull Your Jaw Off The Flo..."


In [8]:
# Taking a random sample of 250,000 rows from the DataFrame
train_sample = train_data.sample(n=250000, random_state=42)  # Sample 250,000 rows with a fixed seed for reproducibility

# Displaying the sample
train_sample.head()  # Show the first few rows of the random sample

Unnamed: 0,0
2079998,__label__1 Expensive Junk: This product consis...
1443106,__label__1 Toast too dark: Even on the lowest ...
3463669,__label__2 Excellent imagery...dumbed down sto...
2914699,__label__1 Are we pretending everyone is marri...
1603231,__label__1 Not worth your time: Might as well ...


### Test Data

In [9]:
# Reading a file compressed with bzip2 (.bz2) format as a pandas DataFrame
test_data = pd.read_csv('/content/test.ft.txt.bz2', compression='bz2', delimiter='\t', header=None)

# Displaying the first few rows of the DataFrame to inspect its structure
test_data.head()

Unnamed: 0,0
0,__label__2 Great CD: My lovely Pat has one of ...
1,__label__2 One of the best game music soundtra...
2,__label__1 Batteries died within a year ...: I...
3,"__label__2 works fine, but Maha Energy is bett..."
4,__label__2 Great for the non-audiophile: Revie...


In [10]:
# Taking a random sample of 100,000 rows from the DataFrame
test_sample = test_data.sample(n=100000, random_state=42)  # Sample 100,000 rows with a random seed for reproducibility

# Displaying the sample
test_sample.head()  # Show the first few rows of the random sample

Unnamed: 0,0
23218,__label__2 This is a great book: I must prefac...
20731,__label__1 Huge Disappointment.: As a big time...
39555,__label__2 Wayne is tight but cant hang with T...
147506,__label__2 Excellent: I read this book when I ...
314215,__label__1 Not about Anusara: Although this bo...


In [11]:
def data_view(file):
    """Split raw ``__label__<n> <text>`` lines into a two-column DataFrame.

    Args:
        file (pd.DataFrame): Frame whose first column holds one raw line per
            row, e.g. ``"__label__2 Great CD: My lovely Pat ..."``.

    Returns:
        pd.DataFrame: Columns ``label`` (the text before the first space with
        the ``__label__`` prefix removed, kept as a string) and ``review``
        (everything after the first space, stripped). The index is a fresh
        0..n-1 range, independent of the index of ``file``.
    """
    # Iterating the column directly avoids building a Series per row as iterrows() does.
    lines = file.iloc[:, 0] if file.shape[1] else []

    data = []
    for line in lines:
        # partition() never fails: a line that has a label but no text gives an
        # empty review instead of raising ValueError on tuple unpacking.
        label, _, text = line.partition(' ')
        data.append((label.replace('__label__', ''), text.strip()))

    return pd.DataFrame(data, columns=['label', 'review'])

In [12]:
# Applying the data_view function to the sampled DataFrame
train = data_view(train_sample)  # Process the sampled DataFrame using the data_view function

# Displaying the first few rows of the processed DataFrame
train.head()  # Show the first few rows of the processed train DataFrame

Unnamed: 0,label,review
0,1,Expensive Junk: This product consists of a pie...
1,1,"Toast too dark: Even on the lowest setting, th..."
2,2,Excellent imagery...dumbed down story: I enjoy...
3,1,Are we pretending everyone is married?: The au...
4,1,Not worth your time: Might as well just use a ...


In [13]:
# Applying the data_view function to the second sampled DataFrame
test = data_view(test_sample)  # Process the second sampled DataFrame using the data_view function

# Displaying the first few rows of the processed DataFrame
test.head()  # Show the first few rows of the processed test DataFrame

Unnamed: 0,label,review
0,2,This is a great book: I must preface this by s...
1,1,"Huge Disappointment.: As a big time, long term..."
2,2,Wayne is tight but cant hang with Turk.: This ...
3,2,Excellent: I read this book when I was in elem...
4,1,Not about Anusara: Although this book is toute...


In [14]:
# Re-map the label values: "1" -> "0" (bad review) and "2" -> "1" (good review).
# The labels are still strings here; they are cast to int in a later cell.
train['label'] = train['label'].replace({"2":"1", "1":"0"})
test['label'] = test['label'].replace({"2":"1","1":"0"})

In [15]:
# Show five random rows of each frame to check the re-mapped labels.
print(train.sample(5))
# NOTE(review): this prints the literal text "-*50" (see the output below);
# a 50-character rule, i.e. "-" * 50, was probably intended.
print("\n-*50\n")
print(test.sample(5))

       label                                             review
233191     1  The line and the circle: Karl Lowith looks to ...
45017      1  One of the best book series since LOTR: If you...
72182      1  It's the perfect gift!: I received the Twiligh...
172803     0  Great game but don't buy until DRM is fixed: G...
121763     0  Surprised: I have always purchased verbatim 4....

-*50

      label                                             review
22087     1  Great: Great book for young kids to learn abou...
47620     1  Karyn Parsons And Damon Wayans Make An Awsome ...
19549     0  Easy read and predicable: If you like to coast...
57838     1  Authority: I believe that the author's purpose...
82173     0  What's the point?: The notes are there, but ev...


In [16]:
train.shape

(250000, 2)

In [17]:
test.shape

(100000, 2)

In [18]:
train.isnull().sum()

Unnamed: 0,0
label,0
review,0


In [19]:
test.isnull().sum()

Unnamed: 0,0
label,0
review,0


In [20]:
train.duplicated().sum()

np.int64(0)

In [21]:
test.duplicated().sum()

np.int64(0)

# **Data preprocessing**

In [22]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the required nltk resources (needed the first time they are used)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from functools import lru_cache


@lru_cache(maxsize=1)
def _text_cleaning_resources():
    """Build the stop-word set and the lemmatizer once and reuse them on every call."""
    # Words that must not be removed because they change the meaning of a
    # review (negations and intensifiers).
    important_words = {
        'not', 'no', 'nor', 'never', 'dont', 'didnt', 'wont', 'cant', 'couldnt',
        'shouldnt', 'wouldnt', 'isnt', 'arent', 'wasnt', 'werent', 'doesnt',
        'hasnt', 'havent', 'hadnt', 'very', 'really', 'just', 'only', 'always',
        'almost', 'too', 'enough', 'so'
    }

    # Subtract them from the English stop-word list
    stop_words = frozenset(stopwords.words('english')) - important_words
    return stop_words, WordNetLemmatizer()


def preprocess_text(text):
    """
    Preprocesses the input text by converting to lowercase, removing punctuation and digits,
    splitting into words, removing stop words, and applying lemmatization.

    Punctuation is deleted rather than replaced by a space, so contractions become
    'dont', 'didnt', ... and words separated only by punctuation are fused
    (e.g. "imagery...dumbed" -> "imagerydumbed").

    Args:
        text (str): The input text to be preprocessed.

    Returns:
        str: The preprocessed text, words joined by single spaces.
    """
    # The stop-word set and lemmatizer are cached: this function is applied to
    # every review, and rebuilding them per call is pure overhead.
    stop_words, lemmatizer = _text_cleaning_resources()

    # Convert the text to lowercase
    text = text.lower()

    text = re.sub(r'[^\w\s]', '', text).strip()   # Remove punctuation
    text = re.sub(r'\d+', '', text).strip()       # Remove digits

    # Split into words, drop stop words (keeping the important ones), then lemmatize
    words = [word for word in text.split() if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [23]:
# Cleaning the text by applying the preprocess_text function to each review in the 'review' column
train['review'] = train['review'].apply(preprocess_text)  # Clean the text in the train DataFrame
train.head()

Unnamed: 0,label,review
0,0,expensive junk product consists piece thin fle...
1,0,toast too dark even lowest setting toast too d...
2,1,excellent imagerydumbed story enjoyed disc vid...
3,0,pretending everyone married author pretend par...
4,0,not worth time might well just use knife produ...


In [24]:
test['review'] = test['review'].apply(preprocess_text)    # Clean the text in the test DataFrame
test.head()

Unnamed: 0,label,review
0,1,great book must preface saying not religious l...
1,0,huge disappointment big time long term trevani...
2,1,wayne tight cant hang turk album hot want howe...
3,1,excellent read book elementary school probably...
4,0,not anusara although book touted several anusa...


In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Setting parameters for tokenization
max_words = 10000  # Vocabulary size limit passed to the tokenizer (num_words)
max_len = 100      # Length every sequence is padded / cut to

# Initializing and fitting the tokenizer on the training data
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")    # Limit the vocabulary size; "<OOV>" stands in for unknown words
tokenizer.fit_on_texts(train['review'])        # Fit the tokenizer on the cleaned training sample only

# Converting the cleaned texts into numerical sequences
X_train = tokenizer.texts_to_sequences(train['review'])  # Texts to numerical sequences for training
X_test = tokenizer.texts_to_sequences(test['review'])    # Texts to numerical sequences for testing

# Applying padding to the sequences to ensure they all have the same length.
# padding='post' puts the padding after the tokens.
# NOTE(review): `truncating` is left at its default - confirm which end of long reviews is cut.
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')  # Padding for training sequences
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')    # Padding for testing sequences

# Converting labels to numpy arrays (still the strings "0" / "1" at this point)
y_train = train['label'].values  # Training labels
y_test = test['label'].values    # Testing labels

# **Splitting Data**


In [26]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the sampled training data as a validation set (fixed seed).
# NOTE(review): X_train / y_train are rebound to the remaining 70%, so running this
# cell a second time shrinks the training set again. The tokenizer was fitted on
# all of `train` before this split, so validation text contributed to the vocabulary.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

In [27]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)
y_val = y_val.astype(int)

In [28]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
# Saving the tokenizer to a file using Pickle
import pickle

# Open a file in write-binary mode to save the tokenizer
# NOTE(review): this path uses "My Drive", while the cell that loads the tokenizer
# uses "MyDrive" - confirm both resolve to the same Drive folder.
with open('/content/drive/My Drive/Colab Notebooks/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)  # Save the tokenizer object to the file

print("Tokenizer saved successfully!")  # Confirmation message after saving

Tokenizer saved successfully!


# **Building Models**





In [33]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, Bidirectional


# Defining the configuration
vocab_size = 10000  # Embedding input size; same value as max_words given to the tokenizer
embedding_dim = 64  # Dimension of word embeddings
max_len = 100  # Length of the padded input sequences (same value as used for pad_sequences)

# EarlyStopping shared by all three training runs: stop once val_loss has not
# improved for 4 epochs and restore the weights of the best epoch.
# (Each model cell creates its own ModelCheckpoint.)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True
)

# 1. LSTM Model

In [None]:
# Building the LSTM model: embedding -> two stacked LSTM layers -> dense classifier head
model_lstm = Sequential([
    Input(shape=(max_len,), dtype='int32'),
    # The sequence length is already fixed by the Input layer, so the deprecated
    # `input_length` argument of Embedding is not passed.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    LSTM(128, return_sequences=True, dropout=0.3),   # full sequence out, feeds the next LSTM
    BatchNormalization(),
    LSTM(64, return_sequences=False, dropout=0.3),   # only the final state out
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')                   # probability of the positive class
])

# Display the LSTM model summary
print("LSTM Model Summary:")
model_lstm.summary()

# Compiling the LSTM model
model_lstm.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Model checkpoint: keep only the epoch with the lowest validation loss
save_best_model = ModelCheckpoint(
    filepath='/content/drive/MyDrive/Colab Notebooks/lstm_model.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Train the LSTM model
history_lstm = model_lstm.fit(X_train, y_train, batch_size=32, epochs=15, validation_data=(X_val, y_val), callbacks=[early_stopping, save_best_model])

LSTM Model Summary:


Epoch 1/15
[1m5469/5469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step - accuracy: 0.5020 - loss: 0.7463
Epoch 1: val_loss improved from inf to 0.69106, saving model to /content/drive/MyDrive/Colab Notebooks/lstm_model.keras
[1m5469/5469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1450s[0m 264ms/step - accuracy: 0.5020 - loss: 0.7463 - val_accuracy: 0.5206 - val_loss: 0.6911
Epoch 2/15
[1m3181/5469[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m8:58[0m 235ms/step - accuracy: 0.6187 - loss: 0.5961

# 2. GRU Model

In [None]:
# Building the GRU model: embedding -> two stacked GRU layers -> dense classifier head
model_gru = Sequential([
    Input(shape=(max_len,), dtype='int32'),
    # The sequence length is already fixed by the Input layer, so the deprecated
    # `input_length` argument of Embedding is not passed.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    GRU(128, return_sequences=True, dropout=0.3),    # full sequence out, feeds the next GRU
    BatchNormalization(),
    GRU(64, return_sequences=False, dropout=0.3),    # only the final state out
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')                   # probability of the positive class
])

# Display the GRU model summary
print("GRU Model Summary:")
model_gru.summary()

# Compiling the GRU model
model_gru.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Model checkpoint: keep only the epoch with the lowest validation loss
save_best_model = ModelCheckpoint(
    filepath='/content/drive/MyDrive/Colab Notebooks/gru_model.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Train the GRU model
history_gru = model_gru.fit(X_train, y_train, batch_size=32, epochs=15, validation_data=(X_val, y_val), callbacks=[early_stopping, save_best_model])

# 3. CNN Model

In [None]:
# Building the CNN model: embedding -> two Conv1D/MaxPooling1D stages -> global max pool -> dense head
model_cnn = Sequential([
    Input(shape=(max_len,), dtype='int32'),
    # The sequence length is already fixed by the Input layer, so the deprecated
    # `input_length` argument of Embedding is not passed.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    GlobalMaxPooling1D(),                            # collapse the time axis to one vector per review
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')                   # probability of the positive class
])

# Display the CNN model summary
print("CNN Model Summary:")
model_cnn.summary()

# Compiling the CNN model
model_cnn.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Model checkpoint: keep only the epoch with the lowest validation loss
save_best_model = ModelCheckpoint(
    filepath='/content/drive/MyDrive/Colab Notebooks/cnn_model.keras',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Train the CNN model
history_cnn = model_cnn.fit(X_train, y_train, batch_size=32, epochs=15, validation_data=(X_val, y_val), callbacks=[early_stopping, save_best_model])

## Evaluate The Models

In [None]:
from sklearn.metrics import classification_report

# Evaluate all models on the test set; evaluate() returns [loss, accuracy]
# because each model was compiled with metrics=['accuracy'].
test_loss_lstm, test_accuracy_lstm = model_lstm.evaluate(X_test, y_test)
test_loss_gru, test_accuracy_gru = model_gru.evaluate(X_test, y_test)
test_loss_cnn, test_accuracy_cnn = model_cnn.evaluate(X_test, y_test)

# Print loss and accuracy. The loss is binary cross-entropy, not a percentage,
# so it is printed as a plain value; only the accuracy is shown in percent.
print(f"LSTM Test Loss: {test_loss_lstm:.4f}, Test Accuracy: {test_accuracy_lstm * 100:.2f}%")
print(f"GRU Test Loss: {test_loss_gru:.4f}, Test Accuracy: {test_accuracy_gru * 100:.2f}%")
print(f"CNN Test Loss: {test_loss_cnn:.4f}, Test Accuracy: {test_accuracy_cnn * 100:.2f}%")

# Predictions on the test set: sigmoid output above 0.5 counts as class 1
y_pred_lstm = (model_lstm.predict(X_test) > 0.5).astype(int)
y_pred_gru = (model_gru.predict(X_test) > 0.5).astype(int)
y_pred_cnn = (model_cnn.predict(X_test) > 0.5).astype(int)

# Classification reports (precision / recall / F1 per class)
print("LSTM Classification Report:")
print(classification_report(y_test, y_pred_lstm))

print("GRU Classification Report:")
print(classification_report(y_test, y_pred_gru))

print("CNN Classification Report:")
print(classification_report(y_test, y_pred_cnn))

# Accuracy plot for every model

In [None]:
import matplotlib.pyplot as plt


def _plot_accuracy(history, model_name):
    """Draw training vs. validation accuracy per epoch for one fitted model."""
    plt.figure(figsize=(8, 5))
    for key, kind in (('accuracy', 'Training'), ('val_accuracy', 'Validation')):
        plt.plot(history.history[key], label=f'{model_name} {kind} Accuracy', marker='o')
    plt.title(f'{model_name} Model Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.show()


# One figure per model, in the order LSTM, GRU, CNN
_plot_accuracy(history_lstm, 'LSTM')
_plot_accuracy(history_gru, 'GRU')
_plot_accuracy(history_cnn, 'CNN')

## Apply 3 models on Real Examples

In [None]:
# Turn one raw review into the padded id sequence the models were trained on.
# This helper has its own name so that it does not overwrite the text-cleaning
# function `preprocess_text(text)` defined earlier in the notebook.
def encode_review(text, tokenizer, seq_len=None):
    """Clean, tokenize and pad a single review for prediction.

    Args:
        text (str): Raw review text.
        tokenizer: The fitted Keras tokenizer used for the training data.
        seq_len (int | None): Target sequence length; defaults to the
            notebook-level ``max_len`` that the models were built with.

    Returns:
        Array of shape (1, seq_len) ready to be passed to ``model.predict``.
    """
    if seq_len is None:
        seq_len = max_len
    cleaned = preprocess_text(text)                          # same cleaning as the training reviews
    sequences = tokenizer.texts_to_sequences([cleaned])      # convert text to sequences
    # Same padding side as used for X_train / X_test
    return pad_sequences(sequences, maxlen=seq_len, padding='post')

# Example real text data
real_examples = [
    "this is an amazing product i prefer buying it nowaday!",
    "product would not recommend to anyone and waste my money",
    "The tall design is completely impractical and doesn’t fit anywhere in my home. Very disappointed",
    "This product is way too tall and unstable. It feels like it could fall over at any moment",
    "The tall design looked good in pictures but feels cheaply made in person. Not worth the price.",
    "It’s too tall to be practical. It blocks views and doesn’t function as intended.",
    "This is by far the best tall product I’ve ever owned. It’s elegant, reliable, and exactly as described!",
    "I’ve been searching for a tall product like this for ages. It’s exactly what I needed, and the quality is excellent.",
    "This product exceeded all my expectations! Its tall design is sleek and modern, making it a perfect addition to my home.",
]

# Encode every example once and reuse the result in the loop below
preprocessed_examples = [encode_review(text, tokenizer) for text in real_examples]

# Predict using all three models
for example, preprocessed in zip(real_examples, preprocessed_examples):
    # LSTM Prediction
    lstm_pred = model_lstm.predict(preprocessed)[0][0]
    lstm_pred_label = 'Positive' if lstm_pred > 0.5 else 'Negative'

    # GRU Prediction
    gru_pred = model_gru.predict(preprocessed)[0][0]
    gru_pred_label = 'Positive' if gru_pred > 0.5 else 'Negative'

    # CNN Prediction
    cnn_pred = model_cnn.predict(preprocessed)[0][0]
    cnn_pred_label = 'Positive' if cnn_pred > 0.5 else 'Negative'

    # Printing the results
    print(f"Example: {example}")
    print(f"LSTM Prediction: {lstm_pred_label} (Probability: {lstm_pred:.4f})")
    print(f"GRU Prediction: {gru_pred_label} (Probability: {gru_pred:.4f})")
    print(f"CNN Prediction: {cnn_pred_label} (Probability: {cnn_pred:.4f})")
    print()

# Load the 3 saved models

In [None]:
from tensorflow.keras.models import load_model

# Load the pre-trained models
model_lstm = load_model("/content/drive/MyDrive/Colab Notebooks/lstm_model.keras")
model_gru = load_model("/content/drive/MyDrive/Colab Notebooks/gru_model.keras")
model_cnn = load_model("/content/drive/MyDrive/Colab Notebooks/cnn_model.keras")

# **Interface using Streamlit**

In [None]:

!pip install streamlit tensorflow nltk

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import shutil

# Source path of the image on the local device after downloading to Google Drive
source_path = "/content/img.jpg"  # Make sure this path is correct

# Destination path inside Google Drive in the 'Colab Notebooks' folder
destination_path = "/content/drive/MyDrive/Colab Notebooks/img.jpg"

# Copy the image to the new location
shutil.copy(source_path, destination_path)

print("The image has been successfully saved to Google Drive!")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import streamlit as st
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load the pre-trained models (the checkpoints written during training)
model_lstm = load_model("/content/drive/MyDrive/Colab Notebooks/lstm_model.keras")
model_gru = load_model("/content/drive/MyDrive/Colab Notebooks/gru_model.keras")
model_cnn = load_model("/content/drive/MyDrive/Colab Notebooks/cnn_model.keras")

# Load the tokenizer
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load a tokenizer file that this notebook wrote itself.
with open('/content/drive/MyDrive/Colab Notebooks/tokenizer.pkl', 'rb') as file:
    tokenizer = pickle.load(file)

# Evaluate all models on the test set and compare accuracy
# NOTE(review): X_test / y_test are not created in this cell; they must already
# exist in the running kernel (they are built in the tokenization cells).
test_loss_lstm, test_accuracy_lstm = model_lstm.evaluate(X_test, y_test)
test_loss_gru, test_accuracy_gru = model_gru.evaluate(X_test, y_test)
test_loss_cnn, test_accuracy_cnn = model_cnn.evaluate(X_test, y_test)

# Choose the best model based on accuracy.
# max() keeps the first candidate on a tie, so two models sharing the top
# accuracy can no longer cause a fall-through to a worse third model.
model_candidates = [
    ('LSTM', model_lstm, test_accuracy_lstm),
    ('GRU', model_gru, test_accuracy_gru),
    ('CNN', model_cnn, test_accuracy_cnn),
]
best_model_name, best_model, best_model_accuracy = max(
    model_candidates, key=lambda candidate: candidate[2]
)

# Set Streamlit page configuration
st.set_page_config(page_title="Sentiment Analysis", page_icon="💬", layout="wide")

# Default tab to "Home"
if "current_tab" not in st.session_state:
    st.session_state.current_tab = "Home"

# Sidebar navigation
with st.sidebar:
    st.title("Sentiment Analysis")
    st.title("📂")
    if st.button("🔍 Analyze Sentiment"):
        st.session_state.current_tab = "Analyze Sentiment"
    if st.button("📄 Documentation"):
        st.session_state.current_tab = "Documentation"

# Home Tab
if st.session_state.current_tab == "Home":
    st.image(r"/content/drive/MyDrive/Colab Notebooks/img.jpg", use_container_width=True)

    # f-string, so that {best_model_name} in "About the Model" is replaced by the
    # name of the selected model instead of being shown literally.
    st.markdown(f"""
    #### Project Overview in Sentiment Analysis 💬:
    This application leverages Deep Learning techniques to analyze text reviews and classify them as (*positive|negative*) sentiments.
    It uses an LSTM (Long Short-Term Memory) model that has been trained on a large dataset of text reviews. The application allows users to input a text review which will then be analyzed and classified based on sentiment.

    #### Features:
    - *Text Classification*: Classifies input text as **positive** or **negative** sentiment.
    - *User-Friendly Interface*: Simple text box to input reviews and receive immediate feedback.
    - *Real-time Sentiment Analysis*: Provides feedback based on the model's confidence level.

    #### How It Works:
    - The model processes the input text by tokenizing and padding it to match the format used during training.
    - The processed text is then passed through the model to predict the sentiment.
    - The result is displayed with a confidence level indicating the certainty of the prediction.

    ### About the Model:
    - *Model Type*: {best_model_name} (Best Model)
    - *Training Data*: A large dataset of labeled text reviews.
    - *Purpose*: Binary sentiment classification: **positive** or **negative**.

    #### Requirements:
    - Input text should be in English for optimal performance.
    """)

# Analyze Sentiment Tab
elif st.session_state.current_tab == "Analyze Sentiment":
    nltk.download('stopwords')
    nltk.download('wordnet')

    # Model and tokenizer settings
    max_len = 100  # Adjust based on your model

    # Function to preprocess text.
    # This repeats the cleaning applied to the training reviews earlier in the
    # notebook; the two copies have to stay identical so that the app feeds the
    # model text in the same form it was trained on.
    def preprocess_text(text):
        """
        Preprocesses the input text by converting to lowercase, removing punctuation and digits,
        splitting into words, removing stop words, and applying lemmatization.

        Args:
            text (str): The input text to be preprocessed.

        Returns:
            str: The preprocessed text, words joined by single spaces.
        """
        # Convert the text to lowercase
        text = text.lower()

        # Remove punctuation and digits (characters are deleted, not replaced by spaces)
        text = re.sub(r'[^\w\s]', '', text).strip()
        text = re.sub(r'\d+', '', text).strip()

        # Split the text into words
        words = text.split()

        # Important words that shouldn't be removed (negations and intensifiers)
        important_words = {
            'not', 'no', 'nor', 'never', 'dont', 'didnt', 'wont', 'cant', 'couldnt',
            'shouldnt', 'wouldnt', 'isnt', 'arent', 'wasnt', 'werent', 'doesnt',
            'hasnt', 'havent', 'hadnt', 'very', 'really', 'just', 'only', 'always',
            'almost', 'too', 'enough', 'so'
        }

        # Subtract them from stopwords
        stop_words = set(stopwords.words('english')) - important_words

        # Remove stopwords
        words = [word for word in words if word not in stop_words]

        # Lemmatize words
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

        return ' '.join(words)

    # Function to predict sentiment
    def predict_sentiment(text):
        """Classify one review with the selected model.

        Args:
            text (str): Raw review text entered by the user.

        Returns:
            tuple[str, float]: The sentiment label and the model's confidence
            in that label (the sigmoid output for "Positive", one minus it
            for "Negative"). The caller unpacks exactly these two values.
        """
        cleaned = preprocess_text(text)
        seq = tokenizer.texts_to_sequences([cleaned])
        padded = pad_sequences(seq, maxlen=max_len, padding='post')
        # predict() returns one sigmoid value per input row
        probability = float(best_model.predict(padded)[0][0])
        if probability > 0.5:
            return "😊 Positive", probability
        return "😞 Negative", 1.0 - probability

    # Streamlit Interface: text box + button; the result is shown below the button
    st.title("🔍 Analyze Sentiment")
    st.write("Use the input box below to analyze the sentiment of a text review.")

    input_text = st.text_area("Enter your review:")

    if st.button("Analyze Sentiment"):
        if not input_text.strip():
            st.warning("⚠ Please enter some text to analyze.")
        else:
            # predict_sentiment is expected to return a (label, confidence) pair
            sentiment, confidence = predict_sentiment(input_text)
            st.subheader("Analysis Result")
            st.markdown(f"""
            *Sentiment:* {sentiment}
            *Confidence:* {confidence:.2f}
            """)

    if st.button("Back to Home"):
        st.session_state.current_tab = "Home"
        # The tab for this run was chosen before the button was evaluated, so
        # rerun the script for the Home tab to appear immediately.
        st.rerun()

# Documentation Tab
elif st.session_state.current_tab == "Documentation":
    st.title("📄 Documentation")
    st.write("""
    ## Dataset URL:
    *Amazon Reviews dataset: Amazon Reviews Dataset*

    The Amazon Reviews dataset contains **3.6 million** reviews with binary sentiment classes: *positive* or *negative*.

    ## Project Goals:
    - *Perform EDA* to uncover insights from the text data.
    - *Preprocess the dataset*, including splitting it into training and testing sets, and transforming the text data into input vectors using techniques such as tokenization or word embeddings.
    - *Develop a model* using PyTorch or Tensorflow to perform sentiment analysis on the input data.
    - *Experiment with different neural network architectures*, activation functions, and learning rates to optimize the model.
    - *Evaluate the performance* of the model using metrics such as accuracy, precision, recall, and F1 score.
    - *Visualize the results* using confusion matrices or other visualization techniques to assess model performance.

    ## Steps:

    ### Data Exploration:
    - Start by downloading the chosen dataset. Familiarize yourself with its structure and content, looking at the distribution of sentiment classes and identifying any anomalies.

    ### Data Preprocessing:
    - Split the dataset into training and testing sets to ensure a fair evaluation.
    - Transform the text data into input vectors using techniques such as tokenization or word embeddings.

    ### Model Building:
    - Use PyTorch or Tensorflow to construct a model designed for sentiment analysis.
    - Experiment with different neural network architectures, activation functions, and learning rates to optimize performance.

    ### Model Evaluation:
    - Employ metrics such as accuracy, precision, recall, and F1 score to gauge the model's effectiveness.
    - Use visualization tools like confusion matrices to better understand the model’s performance across different sentiment classes.

    ### Experimentation:
    - Iterate on model design, testing various configurations and parameters to enhance accuracy and reliability. Document the impacts of different adjustments on model performance.

    ### Results Analysis:
    - Analyze the final model’s performance, focusing on its ability to accurately classify different sentiment classes. Discuss the implications of the model's accuracy and areas for potential improvement.
    """)

    if st.button("Back to Home"):
        st.session_state.current_tab = "Home"
        # The tab for this run was chosen before the button was evaluated, so
        # rerun the script for the Home tab to appear immediately.
        st.rerun()


# **Running Streamlit**

In [None]:
# For retrieving my public IP address

! wget -q -O - ipv4.icanhazip.com

In [None]:
# Start the Streamlit app in the background and expose local port 8501 through localtunnel
# NOTE(review): no visible cell writes app.py - the Streamlit code above has to be
# saved to that file first. Port 8501 is presumably Streamlit's default; confirm.
! streamlit run app.py & npx localtunnel --port 8501