# Feature Extraction / Text Representation

## BOW (Bag of Words)

In [58]:
from nltk import word_tokenize
from collections import Counter

def get_vocabulary(documents):
    """
    Collects the distinct lowercase tokens found across all documents.

    Parameters:
    documents (list): A list of strings (documents or sentences).

    Returns:
    list: The unique tokens, sorted in ascending order.
    """
    # Lowercase before tokenizing so "Hi" and "hi" map to the same entry.
    token_lists = (word_tokenize(text.lower()) for text in documents)
    # set().union() with no arguments yields an empty set, so an empty
    # input still produces an empty vocabulary.
    return sorted(set().union(*token_lists))

def bow(documents):
    """
    Builds Bag of Words matrix for input documents using pure Python.

    Each document is lowercased and tokenized exactly once; the resulting
    per-document token counts are reused both to derive the vocabulary and
    to fill the matrix rows. Because the input is traversed a single time,
    one-shot iterables (e.g. generators) work as well as lists.

    Parameters:
    documents (iterable): Strings (documents or sentences).

    Returns:
    tuple: (vocabulary list, BoW matrix as list of lists). The vocabulary is
    the sorted list of unique lowercase tokens; row i holds the count of
    each vocabulary word in document i, in vocabulary order.
    """
    # Tokenize once per document and keep the counts for both steps below.
    doc_counts = [Counter(word_tokenize(doc.lower())) for doc in documents]

    # Same vocabulary that get_vocabulary() produces: sorted unique tokens.
    # Iterating a Counter yields its keys, so union() collects every token.
    vocabulary = sorted(set().union(*doc_counts))

    # Counter returns 0 for missing keys, so no explicit default is needed.
    bow_matrix = [[counts[word] for word in vocabulary] for counts in doc_counts]

    return vocabulary, bow_matrix


In [60]:
# Toy corpus: two short sentences, the second with a repeated word.
documents = ["Hi how are you", "I am fine fine"]

# Build the vocabulary and the per-document count rows in one call.
vocab, matrix = bow(documents)

print("Vocabulary:", vocab)
print("BoW Matrix:")
# One printed line per document, columns in vocabulary order.
for counts in matrix:
    print(counts)


Vocabulary: ['am', 'are', 'fine', 'hi', 'how', 'i', 'you']
BoW Matrix:
[0, 1, 0, 1, 1, 0, 1]
[1, 0, 2, 0, 0, 1, 0]


In [61]:
# Define the corpus first so this cell does not depend on a `documents`
# variable left over from an earlier cell.
documents = [
    "Hi how are you",
    "I am fine fine"
]
vocabulary = get_vocabulary(documents)
bow_matrix = []

# Step-by-step version of bow(): the loop variables (tokens, word_counts,
# row) stay bound to the last document afterwards so they can be inspected.
for doc in documents:
    tokens = word_tokenize(doc.lower())
    word_counts = Counter(tokens)
    # One count per vocabulary word, 0 when the word is absent from this doc.
    row = [word_counts.get(word, 0) for word in vocabulary]
    bow_matrix.append(row)

In [68]:
# Counter left bound by the final loop iteration, i.e. the token counts of
# the last document ("I am fine fine").
word_counts

Counter({'fine': 2, 'i': 1, 'am': 1})

In [69]:
# Look up the count of a single token in the last document's Counter.
word_counts.get('fine')

2