# Bag of Words (BoW)
In this method, we look at each word in a sentence or document and count how many times each word appears.
We ignore grammar and the order of words. 
The result is a list (or "bag") of words with their counts, which can be used for machine learning or text analysis.
For example, once the text is lowercased and stop words such as "I" and "me" are removed, the sentences "I love cats" and "Cats love me" contain the same words ("cats" and "love") in a different order, so BoW represents them identically.


In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

# Download the NLTK stopword corpus if you haven't already
nltk.download('stopwords')

# Sample dataset: three short sentences with partially overlapping vocabulary
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over a lazy dog quickly",
    "A fast fox is better than a slow fox"
]

# Step 1: Define stopwords
# Keep the words in a list: recent scikit-learn releases validate `stop_words`
# and accept only 'english', a list, or None, so passing a set makes
# fit_transform() fail with InvalidParameterError.
stop_words = list(stopwords.words('english'))

# Step 2: Initialize CountVectorizer with preprocessing
vectorizer = CountVectorizer(
    lowercase=True,         # Convert all characters to lowercase
    stop_words=stop_words,  # Remove stopwords
    token_pattern=r'(?u)\b[a-zA-Z]{2,}\b'  # Only words with 2+ letters
)

# Step 3: Fit and transform the documents
X = vectorizer.fit_transform(documents)

# Step 4: Get results
print("Vocabulary (Feature Names):")
print(vectorizer.get_feature_names_out())

# Densify once and reuse the array for both printouts below
bow_matrix = X.toarray()

print("\nBag of Words Matrix:")
print(bow_matrix)

# One row per document; columns follow the vocabulary order printed above
print("\nWord Counts per Document:")
for doc, count in zip(documents, bow_matrix):
    print(f"'{doc}'\n{count}\n")

KeyboardInterrupt: 

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available before it is used
nltk.download('stopwords')

# Toy corpus for the demo: one string per document
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over a lazy dog quickly",
    "A fast fox is better than a slow fox",
]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [13]:
# Step 1: Collect NLTK's English stopwords into a plain list
# (this list is handed to CountVectorizer as its `stop_words` argument)
stop_words = [*stopwords.words('english')]

In [14]:
# Step 2: Build the CountVectorizer together with its preprocessing rules

# A token is a run of two or more ASCII letters delimited by word boundaries
word_pattern = r'(?u)\b[a-zA-Z]{2,}\b'

vectorizer = CountVectorizer(
    lowercase=True,              # fold everything to lowercase before tokenizing
    stop_words=stop_words,       # drop the stopwords defined in Step 1
    token_pattern=word_pattern,  # keep only words with 2+ letters
)

# Bare expression so the notebook displays the configured estimator
vectorizer


In [15]:
# Step 3: Learn the vocabulary from the documents and encode them as counts
X = vectorizer.fit_transform(documents)

# Step 4: Show the learned vocabulary (one entry per column of X)
feature_names = vectorizer.get_feature_names_out()
print("Vocabulary (Feature Names):")
print(feature_names)

Vocabulary (Feature Names):
['better' 'brown' 'dog' 'fast' 'fox' 'jump' 'jumps' 'lazy' 'never' 'quick'
 'quickly' 'slow']


In [16]:
# Convert the count matrix to a dense array once and reuse it below
dense_counts = X.toarray()

print("\nBag of Words Matrix:")
print(dense_counts)

# Pair every document with its own row of counts
print("\nWord Counts per Document:")
for text, row in zip(documents, dense_counts):
    print(f"'{text}'\n{row}\n")


Bag of Words Matrix:
[[0 1 1 0 1 0 1 1 0 1 0 0]
 [0 0 1 0 0 1 0 1 1 0 1 0]
 [1 0 0 1 2 0 0 0 0 0 0 1]]

Word Counts per Document:
'The quick brown fox jumps over the lazy dog'
[0 1 1 0 1 0 1 1 0 1 0 0]

'Never jump over a lazy dog quickly'
[0 0 1 0 0 1 0 1 1 0 1 0]

'A fast fox is better than a slow fox'
[1 0 0 1 2 0 0 0 0 0 0 1]

