In [1]:
import math
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Corpus
# Three short example documents shared by every vectorizer in this notebook.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

# Manual TF-IDF implementation

In [3]:
def manual_tfidf(corpus):
    """Compute smoothed TF-IDF weights for whitespace-tokenized documents.

    Each document is split on whitespace only (no lowercasing and no
    punctuation stripping). Term frequency is a token's count divided by the
    number of tokens in its document. Inverse document frequency uses the
    smoothed form ``ln((N + 1) / (df + 1)) + 1``, where ``N`` is the number of
    documents and ``df`` is the number of documents containing the token.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A ``(vocabulary, tfidf_vectors)`` tuple. ``vocabulary`` is the sorted
        list of distinct tokens. ``tfidf_vectors`` contains one dict per
        document, in corpus order, mapping every vocabulary word (in
        vocabulary order) to its TF-IDF weight; words absent from a document
        get ``0.0``.
    """
    # Step 1: Tokenize and build the sorted vocabulary
    documents = [doc.split() for doc in corpus]
    vocabulary = sorted({word for doc in documents for word in doc})

    # Step 2: Document frequency (DF). Walk each document's *unique* tokens
    # once instead of scanning every token list for every vocabulary word.
    df = defaultdict(int)
    for doc in documents:
        for word in set(doc):
            df[word] += 1

    # Step 3: Smoothed IDF; the +1 terms keep the ratio finite and the
    # trailing +1 keeps words that occur in every document from scoring 0.
    N = len(documents)
    idf = {word: math.log((N + 1) / (df[word] + 1)) + 1 for word in vocabulary}

    # Step 4: TF-IDF per document
    tfidf_vectors = []
    for doc in documents:
        tf = defaultdict(int)
        for word in doc:
            tf[word] += 1

        # Normalize TF by document length. An empty document has no entries
        # in `tf`, so this never divides by zero.
        doc_len = len(doc)
        tf_normalized = {word: count / doc_len for word, count in tf.items()}

        # Emit a weight for every vocabulary word so all vectors share keys.
        tfidf_vectors.append(
            {word: tf_normalized.get(word, 0) * idf[word] for word in vocabulary}
        )

    return vocabulary, tfidf_vectors

## Get manual TF-IDF results

In [5]:
# Run the hand-written implementation on the example corpus.
# `vocab` is the sorted list of distinct tokens; `manual_results` holds one
# {word: tf-idf weight} dict per document, in corpus order.
vocab, manual_results = manual_tfidf(corpus)

# Scikit-learn CountVectorizer

In [11]:
# Learn the vocabulary from `corpus` and count term occurrences per document.
# NOTE: with default settings the single-character token 'a' does not appear
# in the learned feature names (see the printed output), unlike the
# whitespace-split vocabulary produced by manual_tfidf.
count_vec = CountVectorizer()
count_matrix = count_vec.fit_transform(corpus)
# Convert to a dense array so the counts print as a plain documents x terms grid.
count_array = count_matrix.toarray()

## Scikit-learn TfidfVectorizer

In [14]:
# smooth_idf=True selects idf = ln((1 + n) / (1 + df)) + 1, the same formula
# manual_tfidf uses.
# NOTE(review): the numbers will still differ from manual_tfidf, because
# manual_tfidf divides each term count by the document length whereas
# TfidfVectorizer multiplies idf by the raw count, and because the default
# tokenizer here yields a vocabulary without 'a' (see printed feature names).
tfidf_vec = TfidfVectorizer(norm=None, smooth_idf=True)  # Disable L2 normalization for better comparison
tfidf_matrix = tfidf_vec.fit_transform(corpus)
# Dense copy for printing alongside the other results.
tfidf_array = tfidf_matrix.toarray()

# Display results

In [17]:
# Hand-written implementation: one row per document, every vocabulary word
# shown with its weight to four decimals, each entry followed by " | ".
print("Vocabulary:", vocab)
print("\nManual TF-IDF results:")
for doc_number, doc_weights in enumerate(manual_results, start=1):
    print(f"Document {doc_number}:")
    print("".join(f"{term}: {doc_weights[term]:.4f} | " for term in vocab))

# scikit-learn raw counts: learned feature names, then the dense count grid.
print("\nScikit-learn CountVectorizer results:")
print(count_vec.get_feature_names_out())
print(count_array)

# scikit-learn TF-IDF: learned feature names, then the dense weight grid.
print("\nScikit-learn TfidfVectorizer results:")
print(tfidf_vec.get_feature_names_out())
print(tfidf_array)

Vocabulary: ['a', 'and', 'are', 'bodies', 'celestial', 'is', 'moon', 'satellite', 'star', 'sun', 'the']

Manual TF-IDF results:
Document 1:
a: 0.2575 | and: 0.0000 | are: 0.0000 | bodies: 0.0000 | celestial: 0.0000 | is: 0.2575 | moon: 0.0000 | satellite: 0.0000 | star: 0.3386 | sun: 0.2575 | the: 0.2000 | 
Document 2:
a: 0.2575 | and: 0.0000 | are: 0.0000 | bodies: 0.0000 | celestial: 0.0000 | is: 0.2575 | moon: 0.2575 | satellite: 0.3386 | star: 0.0000 | sun: 0.0000 | the: 0.2000 | 
Document 3:
a: 0.0000 | and: 0.2419 | are: 0.2419 | bodies: 0.2419 | celestial: 0.2419 | is: 0.0000 | moon: 0.1840 | satellite: 0.0000 | star: 0.0000 | sun: 0.1840 | the: 0.1429 | 

Scikit-learn CountVectorizer results:
['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
[[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]

Scikit-learn TfidfVectorizer results:
['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
[[0.         0.        