<a href="https://colab.research.google.com/github/bkhajidmaa7-art/IMDB_Sentiment_Analysis.ipynb-/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz


--2025-12-14 06:44:53--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-12-14 06:44:57 (22.8 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
!pip install gensim transformers nltk tqdm scikit-learn


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [4]:
import os, re, numpy as np, nltk, torch
from tqdm import tqdm


In [5]:
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
def load_imdb(path):
    texts, labels = [], []
    for label_type in ["pos", "neg"]:
        folder = os.path.join(path, label_type)
        label = 1 if label_type == "pos" else 0
        for file in os.listdir(folder):
            with open(os.path.join(folder, file), encoding="utf-8") as f:
                texts.append(f.read())
                labels.append(label)
    return texts, labels


In [7]:
train_texts, train_labels = load_imdb("aclImdb/train")
test_texts, test_labels   = load_imdb("aclImdb/test")


In [8]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = [w for w in text.split() if w not in stop_words]
    return " ".join(tokens)


In [9]:
train_clean = [preprocess(t) for t in tqdm(train_texts)]
test_clean  = [preprocess(t) for t in tqdm(test_texts)]


100%|██████████| 25000/25000 [00:02<00:00, 8788.55it/s]
100%|██████████| 25000/25000 [00:03<00:00, 6510.68it/s]


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(train_clean)
X_test_tfidf  = tfidf.transform(test_clean)


In [11]:
from gensim.models import Word2Vec

tokenized = [t.split() for t in train_clean]

w2v = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=2
)


In [12]:
def sentence_vector(sentence):
    words = sentence.split()
    vecs = [w2v.wv[w] for w in words if w in w2v.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(100)


In [13]:
X_train_w2v = np.array([sentence_vector(s) for s in train_clean])
X_test_w2v  = np.array([sentence_vector(s) for s in test_clean])


In [14]:
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased").to(device)
bert.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [15]:
def bert_tokenizer(tokenizer, data, max_length=128):
    return tokenizer(
        data,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )


In [16]:
def word_embed(device, tokenizer, bert, column, batch_size=8):
    loader = DataLoader(column, batch_size=batch_size, shuffle=False)
    outputs = []

    for batch in tqdm(loader, desc="BERT"):
        tokens = bert_tokenizer(tokenizer, batch)
        tokens = {k:v.to(device) for k,v in tokens.items()}

        with torch.no_grad():
            out = bert(**tokens)

        cls = out.last_hidden_state[:,0,:]
        outputs.append(cls.cpu())

    return torch.cat(outputs, dim=0)


In [17]:
# 2500 positive + 2500 negative sample
train_pos_idx = [i for i, y in enumerate(train_labels) if y == 1][:2500]
train_neg_idx = [i for i, y in enumerate(train_labels) if y == 0][:2500]
train_idx = train_pos_idx + train_neg_idx

X_train_texts = [train_texts[i] for i in train_idx]
y_train_bert   = [train_labels[i] for i in train_idx]

# 1000 positive + 1000 negative sample
test_pos_idx = [i for i, y in enumerate(test_labels) if y == 1][:1000]
test_neg_idx = [i for i, y in enumerate(test_labels) if y == 0][:1000]
test_idx = test_pos_idx + test_neg_idx

X_test_texts = [test_texts[i] for i in test_idx]
y_test_bert   = [test_labels[i] for i in test_idx]


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [19]:
models = {
    "Logistic": LogisticRegression(max_iter=1000),
    "AdaBoost": AdaBoostClassifier(),
    "RandomForest": RandomForestClassifier(n_estimators=100)
}


In [24]:
X_train_bert = word_embed(device, tokenizer, bert, X_train_texts, batch_size=8)
X_test_bert  = word_embed(device, tokenizer, bert, X_test_texts, batch_size=8)


BERT: 100%|██████████| 625/625 [00:49<00:00, 12.57it/s]
BERT: 100%|██████████| 250/250 [00:21<00:00, 11.62it/s]


In [25]:
def evaluate(X_train, X_test, y_train, y_test):
    for name, model in models.items():
        print(f"  {name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy  = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average="binary",
            zero_division=0
        )
        print(f"    Accuracy:  {accuracy:.4f}")
        print(f"    Precision: {precision:.4f}")
        print(f"    Recall:    {recall:.4f}")
        print(f"    F1-Score:  {f1:.4f}")


print("BERT")
evaluate(X_train_bert, X_test_bert, y_train_bert, y_test_bert)

BERT
  Logistic
    Accuracy:  0.7995
    Precision: 0.8143
    Recall:    0.7760
    F1-Score:  0.7947
  AdaBoost
    Accuracy:  0.7500
    Precision: 0.7446
    Recall:    0.7610
    F1-Score:  0.7527
  RandomForest
    Accuracy:  0.7350
    Precision: 0.7331
    Recall:    0.7390
    F1-Score:  0.7361


In [26]:
print("TF-IDF")
evaluate(X_train_tfidf, X_test_tfidf, train_labels, test_labels)

print("\nWord2Vec")
evaluate(X_train_w2v, X_test_w2v, train_labels, test_labels)

print("\nBERT")
evaluate(X_train_bert, X_test_bert, y_train_bert, y_test_bert)


TF-IDF
  Logistic
    Accuracy:  0.8794
    Precision: 0.8756
    Recall:    0.8844
    F1-Score:  0.8800
  AdaBoost
    Accuracy:  0.7522
    Precision: 0.7364
    Recall:    0.7856
    F1-Score:  0.7602
  RandomForest
    Accuracy:  0.8426
    Precision: 0.8526
    Recall:    0.8283
    F1-Score:  0.8403

Word2Vec
  Logistic
    Accuracy:  0.8042
    Precision: 0.8010
    Recall:    0.8094
    F1-Score:  0.8052
  AdaBoost
    Accuracy:  0.7539
    Precision: 0.7621
    Recall:    0.7382
    F1-Score:  0.7500
  RandomForest
    Accuracy:  0.7657
    Precision: 0.7656
    Recall:    0.7658
    F1-Score:  0.7657

BERT
  Logistic
    Accuracy:  0.7995
    Precision: 0.8143
    Recall:    0.7760
    F1-Score:  0.7947
  AdaBoost
    Accuracy:  0.7500
    Precision: 0.7446
    Recall:    0.7610
    F1-Score:  0.7527
  RandomForest
    Accuracy:  0.7410
    Precision: 0.7405
    Recall:    0.7420
    F1-Score:  0.7413
