In [20]:
import math

In [21]:
# Toy corpus of three short documents; some terms are shared between
# documents and others occur in only one.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

Manual TF

In [22]:
def compute_tf(doc):
    """Return the term frequency of every token in a document.

    The document is split on whitespace; each distinct token maps to its
    occurrence count divided by the total number of tokens.

    Args:
        doc: the document text as a single string.

    Returns:
        dict mapping token -> relative frequency, keyed in order of first
        appearance. A document with no tokens yields an empty dict.
    """
    tokens = doc.split()
    n_tokens = len(tokens)

    occurrences = {}
    for token in tokens:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1

    # An empty document has no entries, so the division is never reached with n_tokens == 0.
    return {token: count / n_tokens for token, count in occurrences.items()}

Manual IDF

In [23]:
def compute_idf(corpus):
    """Compute the unsmoothed inverse document frequency of every corpus term.

    Documents are tokenized on whitespace. For a term that occurs in ``df``
    of the ``N`` documents, the score is ``math.log(N / df)`` (natural
    logarithm), so a term present in every document scores ``0.0``.

    Args:
        corpus: sized collection of document strings.

    Returns:
        dict mapping term -> IDF score, with keys in sorted order so the
        result is reproducible across runs. An empty corpus yields ``{}``.
    """
    n_docs = len(corpus)

    # Tokenize each document exactly once and count in how many documents
    # each term occurs.
    doc_freq = {}
    for doc in corpus:
        # set(): a term counts once per document no matter how often it repeats.
        for term in set(doc.split()):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    # Every term in doc_freq was seen in at least one document, so the
    # divisor is always >= 1 and no zero guard is needed.
    return {term: math.log(n_docs / doc_freq[term]) for term in sorted(doc_freq)}

Manual TF-IDF

In [24]:
def compute_tf_idf(tf, idf):
    """Combine one document's term frequencies with corpus-level IDF scores.

    Args:
        tf: dict mapping term -> term frequency for a single document.
        idf: dict mapping term -> inverse document frequency.

    Returns:
        dict with the same keys as ``tf``; each value is ``tf[term]`` times
        the term's IDF. A term missing from ``idf`` is weighted by 0.
    """
    return {term: freq * idf.get(term, 0) for term, freq in tf.items()}

In [25]:
# Build one TF-IDF dict (term -> score) per document, in corpus order.
manual_tf_idf_vectors = []
# IDF depends on the whole corpus, so it is computed once and reused below.
idfs = compute_idf(corpus)
for doc in corpus:
    # TF is per-document; weight it by the shared IDF table.
    tf = compute_tf(doc)
    tf_idf = compute_tf_idf(tf, idfs)
    manual_tf_idf_vectors.append(tf_idf)

In [26]:
import pandas as pd

# Column order: the corpus vocabulary, sorted alphabetically.
unique_words = sorted(set(word for doc in corpus for word in doc.split()))

# One row per document; a term absent from a document gets a score of 0.
manual_df = pd.DataFrame(
    [{word: tfidf.get(word, 0) for word in unique_words} for tfidf in manual_tf_idf_vectors]
)
# Derive the row labels from the number of rows instead of hard-coding three
# names, so the cell keeps working when documents are added to or removed
# from the corpus.
manual_df.index = [f'Doc{i}' for i in range(1, len(manual_tf_idf_vectors) + 1)]
print(manual_df.round(3))

          a    and    are  bodies  celestial     is   moon  satellite  star  \
Doc1  0.081  0.000  0.000   0.000      0.000  0.081  0.000       0.00  0.22   
Doc2  0.081  0.000  0.000   0.000      0.000  0.081  0.081       0.22  0.00   
Doc3  0.000  0.157  0.157   0.157      0.157  0.000  0.058       0.00  0.00   

        sun  the  
Doc1  0.081  0.0  
Doc2  0.000  0.0  
Doc3  0.058  0.0  


**Count Vectorizer**

In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [28]:
# Raw term counts per document, using CountVectorizer's default settings.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(corpus).toarray()
count_df = pd.DataFrame(
    count_matrix,
    columns=count_vectorizer.get_feature_names_out(),
    # Label rows from the matrix height rather than a fixed three-name list,
    # so a corpus of any size is accepted.
    index=[f'Doc{i}' for i in range(1, count_matrix.shape[0] + 1)],
)
print(count_df)

      and  are  bodies  celestial  is  moon  satellite  star  sun  the
Doc1    0    0       0          0   1     0          0     1    1    1
Doc2    0    0       0          0   1     1          1     0    0    1
Doc3    1    1       1          1   0     1          0     0    1    1


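TF-IDF Vectorizer using scikit-learn

Note: these values are not expected to match the manual TF-IDF table above. With its default settings, `TfidfVectorizer` uses raw term counts for TF, a smoothed IDF of $\ln\frac{1+N}{1+\mathrm{df}(t)} + 1$, L2-normalizes each document vector, and its default tokenizer drops single-character tokens such as "a".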

In [29]:
# TF-IDF weights per document, using TfidfVectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus).toarray()
tfidf_df = pd.DataFrame(
    tfidf_matrix,
    columns=tfidf_vectorizer.get_feature_names_out(),
    # Label rows from the matrix height rather than a fixed three-name list,
    # so a corpus of any size is accepted.
    index=[f'Doc{i}' for i in range(1, tfidf_matrix.shape[0] + 1)],
)
print(tfidf_df.round(3))

        and    are  bodies  celestial    is   moon  satellite   star    sun  \
Doc1  0.000  0.000   0.000      0.000  0.48  0.000      0.000  0.632  0.480   
Doc2  0.000  0.000   0.000      0.000  0.48  0.480      0.632  0.000  0.000   
Doc3  0.426  0.426   0.426      0.426  0.00  0.324      0.000  0.000  0.324   

        the  
Doc1  0.373  
Doc2  0.373  
Doc3  0.252  


In [30]:
# Print the three tables one after another for comparison.
# sep="\n" puts the title, the frame and a trailing blank line on separate
# lines. With print's default sep=" ", a space was inserted before the frame's
# header row, shifting it one column out of line with the data rows.
print("Manual TF-IDF:", manual_df.round(3), "", sep="\n")
print("Count Vectorizer:", count_df, "", sep="\n")
print("TfidfVectorizer (sklearn):", tfidf_df.round(3), "", sep="\n")

Manual TF-IDF:
           a    and    are  bodies  celestial     is   moon  satellite  star  \
Doc1  0.081  0.000  0.000   0.000      0.000  0.081  0.000       0.00  0.22   
Doc2  0.081  0.000  0.000   0.000      0.000  0.081  0.081       0.22  0.00   
Doc3  0.000  0.157  0.157   0.157      0.157  0.000  0.058       0.00  0.00   

        sun  the  
Doc1  0.081  0.0  
Doc2  0.000  0.0  
Doc3  0.058  0.0   

Count Vectorizer:
       and  are  bodies  celestial  is  moon  satellite  star  sun  the
Doc1    0    0       0          0   1     0          0     1    1    1
Doc2    0    0       0          0   1     1          1     0    0    1
Doc3    1    1       1          1   0     1          0     0    1    1 

TfidfVectorizer (sklearn):
         and    are  bodies  celestial    is   moon  satellite   star    sun  \
Doc1  0.000  0.000   0.000      0.000  0.48  0.000      0.000  0.632  0.480   
Doc2  0.000  0.000   0.000      0.000  0.48  0.480      0.632  0.000  0.000   
Doc3  0.426  0.426 