In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import shutil

dataset_path = "/content/drive/MyDrive/Colab Notebooks/aclImdb_v1.tar.gz"
shutil.unpack_archive(dataset_path, "./")

In [None]:
def load_imdb_data(directory):
    texts, labels = [], []

    for label_type in ["pos", "neg"]:
        folder_path = os.path.join(directory, label_type)
        for filename in os.listdir(folder_path):
            with open(os.path.join(folder_path, filename), encoding="utf-8") as f:
                texts.append(f.read())
            labels.append(1 if label_type == "pos" else 0)

    return texts, labels

# Load training data
train_texts, train_labels = load_imdb_data("./aclImdb/train")

# Load testing data
test_texts, test_labels = load_imdb_data("./aclImdb/test")

# Merge train and test datasets
all_texts = train_texts + test_texts
all_labels = train_labels + test_labels

# Convert to a DataFrame for easy handling
df = pd.DataFrame({"review": all_texts, "label": all_labels})

# Shuffle the data (important for training)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
import re
import string

def clean_text(text):
    # Remove HTML tags
    text = re.sub(r"<.*?>", " ", text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["review"] = df["review"].apply(clean_text)

In [None]:
len(df)

50000

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
def tokenize_text(text):
    doc = nlp(text)  # Process text using spaCy
    tokens = [token.lemma_ for token in doc]  # Lemmatization & remove stopwords
    return tokens

In [None]:
from tqdm import tqdm

# Enable tqdm for Pandas apply
tqdm.pandas()

# Apply tokenization with progress tracking
df["review"] = df["review"].progress_apply(tokenize_text)


100%|██████████| 50000/50000 [31:53<00:00, 26.13it/s]


In [None]:
df.head(10)

Unnamed: 0,review,label
0,"[I, have, before, a, feeling, of, mislike, for...",1
1,"["", der, todesking""-jorg, buttgereit, 's, seco...",1
2,"[the, third, muppet, movie, be, perhaps, the, ...",1
3,"[this, be, one, of, those, little, christmas, ...",1
4,"[*, *, *, may, contain, spoiler, *, *, *, *, I...",0
5,"[this, be, pure, crap, ,, and, probably, the, ...",0
6,"[I, really, enjoy, this, drama, from, sidney, ...",1
7,"[so, we, be, suppose, to, find, it, funny, tha...",0
8,"[my, dog, recently, pass, away, ,, and, this, ...",1
9,"[of, all, the, 48, film, of, brigitte, bardot,...",1


In [None]:
df.to_csv("tokenized_imdb_reviews2.csv", index=False)

In [1]:
import pandas as pd
df = pd.read_csv("tokenized_imdb_reviews2.csv")

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Define max vocabulary size and sequence length
MAX_VOCAB_SIZE = 7500  # Keep top 7500 words
MAX_SEQUENCE_LENGTH = 120  # Max review length

# Initialize and fit tokenizer
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df["review"])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df["review"])

# Pad sequences to make them uniform
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Convert labels to numpy array
labels = np.array(df["label"])

# Save the tokenizer for later use
import pickle
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)

# Save processed data
np.save("padded_sequences.npy", padded_sequences)
np.save("labels.npy", labels)

print("✅ Text tokenized & padded. Data saved for later use.")

✅ Text tokenized & padded. Data saved for later use.


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split

# Load the saved tokenized and padded sequences
padded_sequences = np.load("padded_sequences.npy")
labels = np.load("labels.npy")

# Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42, stratify=labels)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

Training samples: 40000
Testing samples: 10000


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout

# Model hyperparameters
VOCAB_SIZE = 20000  # Same as MAX_VOCAB_SIZE
EMBEDDING_DIM = 100  # Word embedding dimension

# Define the improved model
model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=128, input_length=MAX_SEQUENCE_LENGTH),

    # First RNN layer with Dropout
    SimpleRNN(200, return_sequences=True, activation="tanh"),
    Dropout(0.35),

    # Second RNN layer
    SimpleRNN(130, activation="tanh"),
    Dropout(0.25),

    # Fully connected output layer
    Dense(1, activation="sigmoid")
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")



Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 158ms/step - accuracy: 0.4975 - loss: 0.7490 - val_accuracy: 0.5130 - val_loss: 0.7015
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 168ms/step - accuracy: 0.5088 - loss: 0.7017 - val_accuracy: 0.5061 - val_loss: 0.6937
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m258s[0m 165ms/step - accuracy: 0.5039 - loss: 0.6963 - val_accuracy: 0.5078 - val_loss: 0.6953
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 165ms/step - accuracy: 0.5069 - loss: 0.6951 - val_accuracy: 0.5150 - val_loss: 0.6928
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 164ms/step - accuracy: 0.5073 - loss: 0.6951 - val_accuracy: 0.5151 - val_loss: 0.6932
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 42ms/step - accuracy: 0.5124 - loss: 0.6935
Test Accuracy: 0.5151


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the LSTM model
model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=128, input_length=MAX_SEQUENCE_LENGTH),

    # First LSTM layer with Dropout
    LSTM(64, return_sequences=True, activation="tanh"),
    Dropout(0.2),

    # Second LSTM layer
    LSTM(32, activation="tanh"),
    Dropout(0.2),

    # Fully connected output layer
    Dense(1, activation="sigmoid")
])

# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")


Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 252ms/step - accuracy: 0.6282 - loss: 0.6292 - val_accuracy: 0.5860 - val_loss: 0.6884
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 236ms/step - accuracy: 0.6136 - loss: 0.6465 - val_accuracy: 0.6522 - val_loss: 0.6744
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 235ms/step - accuracy: 0.5799 - loss: 0.6672 - val_accuracy: 0.7559 - val_loss: 0.5331
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 237ms/step - accuracy: 0.8183 - loss: 0.4185 - val_accuracy: 0.8239 - val_loss: 0.3965
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 250ms/step - accuracy: 0.8645 - loss: 0.3253 - val_accuracy: 0.8179 - val_loss: 0.3889
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 40ms/step - accuracy: 0.8201 - loss: 0.3879
Test Accuracy: 0.8179


In [None]:
model.save("sentiment_lstm_model.h5")
model.save('sentiment_lstm_modelNATIVE.keras')
print("✅ Model saved successfully!")



✅ Model saved successfully!


In [5]:
import tensorflow as tf
model = tf.keras.models.load_model("sentiment_lstm_modelNATIVE.keras")

  saveable.load_own_variables(weights_store.get(inner_path))


In [6]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

class AttentionLayer(Layer):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def build(self, input_shape):
        """
        Define the trainable parameters for the attention layer.
        """
        self.W = self.add_weight(shape=(input_shape[-1], 1), initializer="random_normal", trainable=True)
        self.b = self.add_weight(shape=(1,), initializer="zeros", trainable=True)

    def call(self, inputs):
        """
        Apply attention mechanism.
        """
        score = tf.nn.tanh(tf.tensordot(inputs, self.W, axes=1) + self.b)  # Compute attention scores
        attention_weights = tf.nn.softmax(score, axis=1)  # Convert scores to probabilities
        context_vector = attention_weights * inputs  # Apply attention weights
        context_vector = tf.reduce_sum(context_vector, axis=1)  # Sum over time steps
        return context_vector

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout


# Define input
inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))

# Word Embeddings
x = Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=128)(inputs)

# LSTM Layers
x = LSTM(64, return_sequences=True, activation="tanh")(x)
x = Dropout(0.2)(x)

x = LSTM(32, return_sequences=True, activation="tanh")(x)
x = Dropout(0.2)(x)

# Apply Attention Mechanism
x = AttentionLayer()(x)

# Fully connected layers
x = Dense(32, activation="relu")(x)
x = Dropout(0.2)(x)
outputs = Dense(1, activation="sigmoid")(x)

# Create the model
model = Model(inputs, outputs)

# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - accuracy: 0.6930 - loss: 0.5489 - val_accuracy: 0.8224 - val_loss: 0.3907
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 15ms/step - accuracy: 0.8709 - loss: 0.3138 - val_accuracy: 0.8384 - val_loss: 0.3571
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.8971 - loss: 0.2583 - val_accuracy: 0.8374 - val_loss: 0.3693
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 16ms/step - accuracy: 0.9164 - loss: 0.2139 - val_accuracy: 0.8293 - val_loss: 0.4193
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.9352 - loss: 0.1736 - val_accuracy: 0.8238 - val_loss: 0.4570
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8271 - loss: 0.4580
Test Accuracy: 0.8238


In [None]:
from transformers import DistilBertTokenizerFast
import torch
from sklearn.model_selection import train_test_split
import numpy as np

# Load dataset
padded_sequences = np.load("padded_sequences.npy")
labels = np.load("labels.npy")

# Convert labels to PyTorch tensors
labels = torch.tensor(labels)

# Load DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize text
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=200, return_tensors="pt")

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42, stratify=labels)

# Convert to PyTorch tensors
train_encodings = torch.tensor(X_train)
test_encodings = torch.tensor(X_test)

print("✅ Data tokenized & split for Transformer model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

✅ Data tokenized & split for Transformer model


In [None]:
from transformers import DistilBertForSequenceClassification

# Load pre-trained model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("✅ Model loaded and moved to", device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded and moved to cpu
