In [2]:
from collections import Counter
from math import log

In [4]:
sentences = [
    "sunshine state enjoy sunshine",
    "brown fox jump high, brown fox run",
    "sunshine state fox run fast",
]

In [5]:
vocab = set()
for sentence in sentences:
    for word in sentence.split():
        vocab.add(word)
vocab = list(vocab)


In [6]:
doc_term_matrix = []
for sentence in sentences:
    # Create a BoW vector for the sentence
    bow_vector = [0] * len(vocab)
    for word in sentence.split():
        if word in vocab:
            bow_vector[vocab.index(word)] += 1
    doc_term_matrix.append(bow_vector)


In [7]:
tf_vectors = []
for bow_vector in doc_term_matrix:
    # Normalize the BoW vector by dividing each element by the total number of words in the sentence
    tf_vector = [word_count / len(bow_vector) for word_count in bow_vector]
    tf_vectors.append(tf_vector)


In [8]:
idf_values = []
for term in vocab:
    doc_count = 0
    for bow_vector in doc_term_matrix:
        if bow_vector[vocab.index(term)] > 0:
            doc_count += 1


In [9]:
    idf_value = log(len(sentences) / doc_count)
    idf_values.append(idf_value)


In [10]:
tfidf_vectors = []
for tf_vector in tf_vectors:
    # Multiply the TF vector by the IDF values
    tfidf_vector = [tf * idf for tf, idf in zip(tf_vector, idf_values)]
    tfidf_vectors.append(tfidf_vector)


In [11]:
print("Vocabulary:", vocab)
print("Document-term matrix:", doc_term_matrix)
print("TF vectors:", tf_vectors)
print("IDF values:", idf_values)
print("TF.IDF vectors:", tfidf_vectors)


Vocabulary: ['brown', 'sunshine', 'jump', 'high,', 'state', 'fox', 'run', 'fast', 'enjoy']
Document-term matrix: [[0, 2, 0, 0, 1, 0, 0, 0, 1], [2, 0, 1, 1, 0, 2, 1, 0, 0], [0, 1, 0, 0, 1, 1, 1, 1, 0]]
TF vectors: [[0.0, 0.2222222222222222, 0.0, 0.0, 0.1111111111111111, 0.0, 0.0, 0.0, 0.1111111111111111], [0.2222222222222222, 0.0, 0.1111111111111111, 0.1111111111111111, 0.0, 0.2222222222222222, 0.1111111111111111, 0.0, 0.0], [0.0, 0.1111111111111111, 0.0, 0.0, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.0]]
IDF values: [1.0986122886681098]
TF.IDF vectors: [[0.0], [0.24413606414846883], [0.0]]
