In [1]:
import re
import math
from collections import defaultdict


In [2]:
def preprocess_text(text):
    """Normalize a string and split it into word tokens.

    The text is lowercased, every character that is not an ASCII letter,
    a digit, or whitespace is deleted (not replaced by a space), and the
    remainder is split on runs of whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase tokens in their original order; empty when the
        text contains no letters or digits.
    """
    # Lowercase -> strip punctuation/special characters -> whitespace split.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower()).split()

def create_bow(texts):
    """Build bag-of-words count vectors for a collection of texts.

    Args:
        texts: An iterable of raw strings. It is consumed exactly once, so a
            generator works as well as a list.

    Returns:
        A ``(bow, vocabulary)`` tuple. ``vocabulary`` is the sorted list of
        every distinct token produced by ``preprocess_text`` across all
        texts. ``bow`` holds one dict per text, in input order, mapping each
        vocabulary word to the number of times it occurs in that text
        (0 when absent).
    """
    # Tokenize each text a single time: the tokens feed both the vocabulary
    # and the counts, and a one-shot iterable is not exhausted before the
    # counting pass.
    tokenized = [preprocess_text(text) for text in texts]

    # Sort vocabulary to ensure consistent ordering
    vocabulary = sorted({word for words in tokenized for word in words})

    # Create the BoW representation
    bow = []
    for words in tokenized:
        # Tally the tokens in one pass instead of scanning the token list
        # once per vocabulary word.
        counts = defaultdict(int)
        for word in words:
            counts[word] += 1
        # .get() reads a missing word as 0 without inserting it into counts.
        bow.append({word: counts.get(word, 0) for word in vocabulary})

    return bow, vocabulary


In [3]:

def calculate_tf(bow, vocabulary):
    """Compute per-document term frequencies from bag-of-words counts.

    Args:
        bow: A list of dicts, one per document, mapping word -> count.
        vocabulary: The words to report a frequency for in every document.

    Returns:
        A list of dicts, one per document and in the same order, mapping each
        vocabulary word to ``count / total``, where ``total`` is the sum of
        all counts in that document. A word missing from a document counts
        as 0. A document whose counts sum to zero (for example a text that
        contained only punctuation) gets 0.0 for every word instead of
        raising ``ZeroDivisionError``.
    """
    tf = []
    for text_bow in bow:
        word_count = sum(text_bow.values())
        if word_count == 0:
            # Empty document: there is nothing to normalize by, so every
            # term frequency is defined as zero.
            tf.append({word: 0.0 for word in vocabulary})
            continue
        tf.append(
            {word: text_bow.get(word, 0) / word_count for word in vocabulary}
        )
    return tf



In [4]:
def calculate_idf(bow, vocabulary):
    """Compute a smoothed inverse document frequency for every word.

    Args:
        bow: A list of dicts, one per document, mapping word -> count.
        vocabulary: The words to compute an IDF value for.

    Returns:
        A dict mapping each vocabulary word to
        ``ln((N + 1) / (df + 1)) + 1``, where ``N`` is the number of
        documents and ``df`` is the number of documents in which the word
        has a count greater than zero.
    """
    total_docs = len(bow)
    idf = {}
    for word in vocabulary:
        # Document frequency: how many documents actually contain the word.
        containing = 0
        for doc in bow:
            if doc.get(word, 0) > 0:
                containing += 1
        # Add-one smoothing on both counts keeps the ratio finite, and the
        # trailing +1 keeps the weight of a word present everywhere at 1.
        idf[word] = math.log((total_docs + 1) / (containing + 1)) + 1
    return idf



In [5]:
def calculate_tf_idf(tf, idf):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tf: A list of dicts, one per document, mapping word -> term frequency.
        idf: A dict mapping word -> inverse document frequency. It must
            contain every word that appears in ``tf``.

    Returns:
        A list of dicts, one per document and in the same order, mapping each
        word to ``tf * idf``.

    Raises:
        KeyError: If a word in ``tf`` has no entry in ``idf``.
    """
    return [
        {word: freq * idf[word] for word, freq in doc_tf.items()}
        for doc_tf in tf
    ]


In [6]:
# Example usage: three short sample sentences serve as the corpus.
texts = [
    "The rain in Spain falls mainly on the plain.",
    "The sun shines brightly in the summer sky.",
    "A dog barked loudly in the quiet night."
]

# Generate BoW (one word -> count dict per text) and the sorted shared vocabulary
bow, vocabulary = create_bow(texts)



In [7]:
# Calculate TF, IDF, and TF-IDF from the bag-of-words counts built in the
# previous cell (requires `bow` and `vocabulary` to be defined).
tf = calculate_tf(bow, vocabulary)
idf = calculate_idf(bow, vocabulary)
tf_idf = calculate_tf_idf(tf, idf)

# Show the vocabulary, then one TF-IDF dict per input text, one per line.
print("Vocabulary:", vocabulary)
print("TF-IDF representation:")
for text_tf_idf in tf_idf:
    print(text_tf_idf)

Vocabulary: ['a', 'barked', 'brightly', 'dog', 'falls', 'in', 'loudly', 'mainly', 'night', 'on', 'plain', 'quiet', 'rain', 'shines', 'sky', 'spain', 'summer', 'sun', 'the']
TF-IDF representation:
{'a': 0.0, 'barked': 0.0, 'brightly': 0.0, 'dog': 0.0, 'falls': 0.1881274645066606, 'in': 0.1111111111111111, 'loudly': 0.0, 'mainly': 0.1881274645066606, 'night': 0.0, 'on': 0.1881274645066606, 'plain': 0.1881274645066606, 'quiet': 0.0, 'rain': 0.1881274645066606, 'shines': 0.0, 'sky': 0.0, 'spain': 0.1881274645066606, 'summer': 0.0, 'sun': 0.0, 'the': 0.2222222222222222}
{'a': 0.0, 'barked': 0.0, 'brightly': 0.21164339756999317, 'dog': 0.0, 'falls': 0.0, 'in': 0.125, 'loudly': 0.0, 'mainly': 0.0, 'night': 0.0, 'on': 0.0, 'plain': 0.0, 'quiet': 0.0, 'rain': 0.0, 'shines': 0.21164339756999317, 'sky': 0.21164339756999317, 'spain': 0.0, 'summer': 0.21164339756999317, 'sun': 0.21164339756999317, 'the': 0.25}
{'a': 0.21164339756999317, 'barked': 0.21164339756999317, 'brightly': 0.0, 'dog': 0.21164