<a href="https://colab.research.google.com/github/bkhajidmaa7-art/IMDB_Sentiment_Analysis.ipynb-/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz


--2025-12-14 18:57:01--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.2’


2025-12-14 18:57:03 (45.7 MB/s) - ‘aclImdb_v1.tar.gz.2’ saved [84125825/84125825]



In [2]:
!pip install gensim transformers nltk tqdm scikit-learn




In [3]:
import os, re, numpy as np, nltk, torch
from tqdm import tqdm


In [4]:
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def load_imdb(path):
    texts, labels = [], []
    for label_type in ["pos", "neg"]:
        folder = os.path.join(path, label_type)
        label = 1 if label_type == "pos" else 0
        for file in os.listdir(folder):
            with open(os.path.join(folder, file), encoding="utf-8") as f:
                texts.append(f.read())
                labels.append(label)
    return texts, labels


In [6]:
train_texts, train_labels = load_imdb("aclImdb/train")
test_texts, test_labels   = load_imdb("aclImdb/test")


In [7]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = [w for w in text.split() if w not in stop_words]
    return " ".join(tokens)


In [8]:
train_clean = [preprocess(t) for t in tqdm(train_texts)]
test_clean  = [preprocess(t) for t in tqdm(test_texts)]


100%|██████████| 25000/25000 [00:01<00:00, 16053.01it/s]
100%|██████████| 25000/25000 [00:02<00:00, 8924.83it/s]


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(train_clean)
X_test_tfidf  = tfidf.transform(test_clean)


In [10]:
from gensim.models import Word2Vec

tokenized = [t.split() for t in train_clean]

w2v = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=2
)


In [11]:
def sentence_vector(sentence):
    words = sentence.split()
    vecs = [w2v.wv[w] for w in words if w in w2v.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(100)


In [12]:
X_train_w2v = np.array([sentence_vector(s) for s in train_clean])
X_test_w2v  = np.array([sentence_vector(s) for s in test_clean])


In [13]:
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased").to(device)
bert.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [14]:
def bert_tokenizer(tokenizer, data, max_length=128):
    return tokenizer(
        data,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )


In [15]:
def word_embed(device, tokenizer, bert, column, batch_size=8):
    loader = DataLoader(column, batch_size=batch_size, shuffle=False)
    outputs = []

    for batch in tqdm(loader, desc="BERT"):
        tokens = bert_tokenizer(tokenizer, batch)
        tokens = {k:v.to(device) for k,v in tokens.items()}

        with torch.no_grad():
            out = bert(**tokens)

        cls = out.last_hidden_state[:,0,:]
        outputs.append(cls.cpu())

    return torch.cat(outputs, dim=0)


In [16]:
# 2500 positive + 2500 negative sample
train_pos_idx = [i for i, y in enumerate(train_labels) if y == 1][:2500]
train_neg_idx = [i for i, y in enumerate(train_labels) if y == 0][:2500]
train_idx = train_pos_idx + train_neg_idx

X_train_texts = [train_texts[i] for i in train_idx]
y_train_bert   = [train_labels[i] for i in train_idx]

# 1000 positive + 1000 negative sample
test_pos_idx = [i for i, y in enumerate(test_labels) if y == 1][:1000]
test_neg_idx = [i for i, y in enumerate(test_labels) if y == 0][:1000]
test_idx = test_pos_idx + test_neg_idx

X_test_texts = [test_texts[i] for i in test_idx]
y_test_bert   = [test_labels[i] for i in test_idx]


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [18]:
models = {
    "Logistic": LogisticRegression(max_iter=1000),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(n_estimators=100)
}


In [19]:
X_train_bert = word_embed(device, tokenizer, bert, X_train_texts, batch_size=8)
X_test_bert  = word_embed(device, tokenizer, bert, X_test_texts, batch_size=8)


BERT: 100%|██████████| 625/625 [00:54<00:00, 11.47it/s]
BERT: 100%|██████████| 250/250 [00:19<00:00, 12.77it/s]


In [20]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

def evaluate(X_train, X_test, y_train, y_test):
    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
        "AdaBoost": AdaBoostClassifier(n_estimators=200, random_state=42)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results[name] = {"Accuracy": acc, "F1": f1}
        print(f"{name}: Acc={acc:.4f}, F1={f1:.4f}")

    return results


In [21]:
evaluate(X_train_bert, X_test_bert, y_train_bert, y_test_bert)


LogisticRegression: Acc=0.7995, F1=0.7947
RandomForest: Acc=0.7525, F1=0.7524
AdaBoost: Acc=0.7570, F1=0.7515


{'LogisticRegression': {'Accuracy': 0.7995, 'F1': 0.7946748591909882},
 'RandomForest': {'Accuracy': 0.7525, 'F1': 0.752376188094047},
 'AdaBoost': {'Accuracy': 0.757, 'F1': 0.7515337423312883}}