<a href="https://colab.research.google.com/github/arfinmahmudshiblu/BOW-TFIDF-Sentiment-Analysis/blob/main/BOW_TFIDF_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
lakshmi25npathi_imdb_dataset_of_50k_movie_reviews_path = kagglehub.dataset_download('lakshmi25npathi/imdb-dataset-of-50k-movie-reviews')

print('Data source import complete.')


Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
Data source import complete.


## Text Preprocessing using Bag of Words, TF-IDF, Stemming, Lemmatization, and Stopword Removal

In [2]:
!pip install nltk



In [3]:
corpus = """
Professor Muhammad Yunus was born on June 28, 1940. He is the founder and managing director of Grameen Bank, which pioneered microcredit. This is a
method of banking where small loans are given to the poor, mostly to women, without collateral, for income-generating activities, to help them get out
of poverty.

The third of nine children, Prof Yunus was born in the village of Bathua, Chittagong. His father was Haji Muhammad Dula Mia Shawdagar, a jeweller, and
his mother was Sofia Khatun.

"""

In [4]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## # Tokenizer

In [5]:
#Sentence Tokenizer
sentences = nltk.sent_tokenize(corpus)
sentences

['\nProfessor Muhammad Yunus was born on June 28, 1940.',
 'He is the founder and managing director of Grameen Bank, which pioneered microcredit.',
 'This is a\nmethod of banking where small loans are given to the poor, mostly to women, without collateral, for income-generating activities, to help them get out \nof poverty.',
 'The third of nine children, Prof Yunus was born in the village of Bathua, Chittagong.',
 'His father was Haji Muhammad Dula Mia Shawdagar, a jeweller, and\nhis mother was Sofia Khatun.']

In [6]:
#word tokenize
nltk.word_tokenize(corpus)

['Professor',
 'Muhammad',
 'Yunus',
 'was',
 'born',
 'on',
 'June',
 '28',
 ',',
 '1940',
 '.',
 'He',
 'is',
 'the',
 'founder',
 'and',
 'managing',
 'director',
 'of',
 'Grameen',
 'Bank',
 ',',
 'which',
 'pioneered',
 'microcredit',
 '.',
 'This',
 'is',
 'a',
 'method',
 'of',
 'banking',
 'where',
 'small',
 'loans',
 'are',
 'given',
 'to',
 'the',
 'poor',
 ',',
 'mostly',
 'to',
 'women',
 ',',
 'without',
 'collateral',
 ',',
 'for',
 'income-generating',
 'activities',
 ',',
 'to',
 'help',
 'them',
 'get',
 'out',
 'of',
 'poverty',
 '.',
 'The',
 'third',
 'of',
 'nine',
 'children',
 ',',
 'Prof',
 'Yunus',
 'was',
 'born',
 'in',
 'the',
 'village',
 'of',
 'Bathua',
 ',',
 'Chittagong',
 '.',
 'His',
 'father',
 'was',
 'Haji',
 'Muhammad',
 'Dula',
 'Mia',
 'Shawdagar',
 ',',
 'a',
 'jeweller',
 ',',
 'and',
 'his',
 'mother',
 'was',
 'Sofia',
 'Khatun',
 '.']

# Token Normalizing / Data Cleaning

# Stemming

In [7]:
from nltk.stem import PorterStemmer
import re

In [8]:
stemmer = PorterStemmer()

In [9]:
sentences = nltk.sent_tokenize(corpus)

In [10]:
stem_sentences = []
for i in range(len(sentences)):
    text = re.sub('[^a-zA-Z0-9]', ' ', sentences[i])
    text = text.lower()
    words = nltk.word_tokenize(text)
    words = [stemmer.stem(word) for word in words]
    stem_sentences.append(' '.join(words))
print(stem_sentences)

['professor muhammad yunu wa born on june 28 1940', 'he is the founder and manag director of grameen bank which pioneer microcredit', 'thi is a method of bank where small loan are given to the poor mostli to women without collater for incom gener activ to help them get out of poverti', 'the third of nine children prof yunu wa born in the villag of bathua chittagong', 'hi father wa haji muhammad dula mia shawdagar a jewel and hi mother wa sofia khatun']


# Lemmatization

In [11]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
sentence_lem = []
for i in range(len(sentences)):
    text = re.sub('[^a-zA-Z0-9]', ' ', sentences[i])
    text = text.lower()
    words = nltk.word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words]
    sentence_lem.append(' '.join(words))
print(sentence_lem)

['professor muhammad yunus wa born on june 28 1940', 'he is the founder and managing director of grameen bank which pioneered microcredit', 'this is a method of banking where small loan are given to the poor mostly to woman without collateral for income generating activity to help them get out of poverty', 'the third of nine child prof yunus wa born in the village of bathua chittagong', 'his father wa haji muhammad dula mia shawdagar a jeweller and his mother wa sofia khatun']


# Stopwords Removal

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
from nltk.corpus import stopwords

In [16]:
stop_words = stopwords.words('english')

In [17]:
stop_words

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

# Token to Feature

# Count Vectorizer

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
cv = CountVectorizer(binary=True, ngram_range=(1, 2))

In [20]:
corpus = []
for i in range(len(sentences)):
    text = re.sub('[^a-zA-Z0-9]', ' ', sentences[i])
    text = text.lower()
    words = text.split()
    words = [stemmer.stem(word) for word in words if not word in stop_words]
    print(words)
    corpus.append(' '.join(words))

['professor', 'muhammad', 'yunu', 'born', 'june', '28', '1940']
['founder', 'manag', 'director', 'grameen', 'bank', 'pioneer', 'microcredit']
['method', 'bank', 'small', 'loan', 'given', 'poor', 'mostli', 'women', 'without', 'collater', 'incom', 'gener', 'activ', 'help', 'get', 'poverti']
['third', 'nine', 'children', 'prof', 'yunu', 'born', 'villag', 'bathua', 'chittagong']
['father', 'haji', 'muhammad', 'dula', 'mia', 'shawdagar', 'jewel', 'mother', 'sofia', 'khatun']


# Applies Bag of Words

In [21]:
X = cv.fit_transform(corpus)

In [22]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 93 stored elements and shape (5, 88)>
  Coords	Values
  (0, 70)	1
  (0, 58)	1
  (0, 86)	1
  (0, 10)	1
  (0, 42)	1
  (0, 1)	1
  (0, 0)	1
  (0, 71)	1
  (0, 60)	1
  (0, 87)	1
  (0, 11)	1
  (0, 43)	1
  (0, 2)	1
  (1, 24)	1
  (1, 47)	1
  (1, 18)	1
  (1, 32)	1
  (1, 5)	1
  (1, 63)	1
  (1, 53)	1
  (1, 25)	1
  (1, 48)	1
  (1, 19)	1
  (1, 33)	1
  (1, 6)	1
  :	:
  (3, 62)	1
  (3, 14)	1
  (3, 69)	1
  (3, 12)	1
  (3, 81)	1
  (3, 9)	1
  (4, 58)	1
  (4, 22)	1
  (4, 34)	1
  (4, 20)	1
  (4, 51)	1
  (4, 72)	1
  (4, 40)	1
  (4, 56)	1
  (4, 76)	1
  (4, 44)	1
  (4, 23)	1
  (4, 35)	1
  (4, 59)	1
  (4, 21)	1
  (4, 52)	1
  (4, 73)	1
  (4, 41)	1
  (4, 57)	1
  (4, 77)	1


In [None]:
cv.vocabulary_

In [None]:
import pandas as pd

feature_names = cv.get_feature_names_out()
x_array = X.toarray()

print("Unique word list: \n",feature_names)
print("Bag of words matrix: \n",x_array)

df = pd.DataFrame(data=x_array, columns=feature_names, index=corpus)
print(df)

# TF-IDF Vectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
tf_idf = TfidfVectorizer()

In [25]:
x = tf_idf.fit_transform(corpus)


In [26]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 49 stored elements and shape (5, 45)>
  Coords	Values
  (0, 36)	0.40986538560224284
  (0, 30)	0.33067681238156543
  (0, 44)	0.33067681238156543
  (0, 5)	0.33067681238156543
  (0, 21)	0.40986538560224284
  (0, 1)	0.40986538560224284
  (0, 0)	0.40986538560224284
  (1, 12)	0.38775666010579296
  (1, 24)	0.38775666010579296
  (1, 9)	0.38775666010579296
  (1, 16)	0.38775666010579296
  (1, 3)	0.3128396318588854
  (1, 32)	0.38775666010579296
  (1, 27)	0.38775666010579296
  (2, 3)	0.20393539986751064
  (2, 25)	0.2527726716084208
  (2, 38)	0.2527726716084208
  (2, 23)	0.2527726716084208
  (2, 15)	0.2527726716084208
  (2, 33)	0.2527726716084208
  (2, 28)	0.2527726716084208
  (2, 43)	0.2527726716084208
  (2, 42)	0.2527726716084208
  (2, 8)	0.2527726716084208
  (2, 19)	0.2527726716084208
  (2, 13)	0.2527726716084208
  (2, 2)	0.2527726716084208
  (2, 18)	0.2527726716084208
  (2, 14)	0.2527726716084208
  (2, 34)	0.2527726716084208
  (3, 44

In [27]:
tf_idf.vocabulary_

{'professor': 36,
 'muhammad': 30,
 'yunu': 44,
 'born': 5,
 'june': 21,
 '28': 1,
 '1940': 0,
 'founder': 12,
 'manag': 24,
 'director': 9,
 'grameen': 16,
 'bank': 3,
 'pioneer': 32,
 'microcredit': 27,
 'method': 25,
 'small': 38,
 'loan': 23,
 'given': 15,
 'poor': 33,
 'mostli': 28,
 'women': 43,
 'without': 42,
 'collater': 8,
 'incom': 19,
 'gener': 13,
 'activ': 2,
 'help': 18,
 'get': 14,
 'poverti': 34,
 'third': 40,
 'nine': 31,
 'children': 6,
 'prof': 35,
 'villag': 41,
 'bathua': 4,
 'chittagong': 7,
 'father': 11,
 'haji': 17,
 'dula': 10,
 'mia': 26,
 'shawdagar': 37,
 'jewel': 20,
 'mother': 29,
 'sofia': 39,
 'khatun': 22}

In [30]:
import pandas as pd
feature_names = tf_idf.get_feature_names_out()
x_array = x.toarray()

print("Unique word list: \n", feature_names)
print("Bag of Words Matrix: \n", x_array)

df = pd.DataFrame(data=x_array, columns=feature_names, index=corpus)
print(df)

Unique word list: 
 ['1940' '28' 'activ' 'bank' 'bathua' 'born' 'children' 'chittagong'
 'collater' 'director' 'dula' 'father' 'founder' 'gener' 'get' 'given'
 'grameen' 'haji' 'help' 'incom' 'jewel' 'june' 'khatun' 'loan' 'manag'
 'method' 'mia' 'microcredit' 'mostli' 'mother' 'muhammad' 'nine'
 'pioneer' 'poor' 'poverti' 'prof' 'professor' 'shawdagar' 'small' 'sofia'
 'third' 'villag' 'without' 'women' 'yunu']
Bag of Words Matrix: 
 [[0.40986539 0.40986539 0.         0.         0.         0.33067681
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.40986539 0.         0.
  0.         0.         0.         0.         0.         0.
  0.33067681 0.         0.         0.         0.         0.
  0.40986539 0.         0.         0.         0.         0.
  0.         0.         0.33067681]
 [0.         0.         0.         0.31283963 0.         0.
  0.         0.         0.         0.

# Sentiment Analysis using TF-IDF features and Neural Network MLP (instead of Logistic Regression)

In [31]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [32]:
# 2.Load Dataset from Kaggle

path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

# Load CSV
df = pd.read_csv(f"{path}/IMDB Dataset.csv")
print(df.head())

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [33]:
# 3.Clean Data
# Remove missing or empty reviews
df = df.dropna(subset=['review', 'sentiment'])
df = df[df['review'].str.strip() != ""]

print("NaN values left:\n", df.isna().sum())

NaN values left:
 review       0
sentiment    0
dtype: int64


In [34]:
# 4.Encode Sentiments
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])  # positive=1, negative=0

In [35]:
# 5. Split Data
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'],
    test_size=0.2,
    random_state=42
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 40000
Test size: 10000


In [36]:
# 6. Convert Text to Bag of Words
vectorizer = CountVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train).toarray()
X_test_vec = vectorizer.transform(X_test).toarray()

In [37]:
# 7. Build Neural Network
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_vec.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [38]:
# 8.Train the Model
es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X_train_vec, y_train,
    epochs=6,
    batch_size=256,
    validation_split=0.1,
    callbacks=[es],
    verbose=2
)

Epoch 1/6
141/141 - 6s - 40ms/step - accuracy: 0.8442 - loss: 0.3718 - val_accuracy: 0.8840 - val_loss: 0.2862
Epoch 2/6
141/141 - 5s - 35ms/step - accuracy: 0.9115 - loss: 0.2267 - val_accuracy: 0.8840 - val_loss: 0.2941
Epoch 3/6
141/141 - 5s - 33ms/step - accuracy: 0.9411 - loss: 0.1564 - val_accuracy: 0.8835 - val_loss: 0.3194


In [39]:
# 9. Evaluate
loss, acc = model.evaluate(X_test_vec, y_test, verbose=0)
print(f"Test Loss: {loss:.4f} | Test Accuracy: {acc:.4f}")

Test Loss: 0.2862 | Test Accuracy: 0.8850


In [40]:
# 10. Predict & Report

from sklearn.metrics import classification_report

y_pred = (model.predict(X_test_vec) > 0.5).astype("int32")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step

Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      4961
           1       0.87      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.88      0.88     10000
weighted avg       0.89      0.89      0.88     10000



Accuracy around 88–90%

Classification report (Precision, Recall, F1)

A working Neural Network trained on Bag of Words features