In [None]:
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Toy corpus: each string is treated as one document by everything below.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]


# Manual TF-IDF Implementation


def tokenize(text):
    """Lowercase *text* and split it on runs of whitespace.

    Returns the list of tokens; punctuation is left attached to words.
    """
    lowered = text.lower()
    return lowered.split()

def compute_tf(doc_tokens):
    """Return the relative frequency of each token in one document.

    Args:
        doc_tokens: Sequence of tokens making up the document.

    Returns:
        dict mapping token -> (occurrences / total number of tokens), with
        keys in order of first appearance. An empty document yields ``{}``.
    """
    n_tokens = len(doc_tokens)

    # Raw occurrence counts, keyed in first-seen order.
    counts = {}
    for token in doc_tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    # Normalise by document length; never evaluated for an empty document.
    return {token: count / n_tokens for token, count in counts.items()}

def compute_idf(tokenized_docs):
    """Compute an inverse-document-frequency weight for every corpus term.

    For a corpus of ``N`` documents and a term that occurs in ``df`` of them,
    the weight is ``ln(N / (1 + df)) + 1``.

    NOTE: this is not scikit-learn's default smoothed IDF, which is
    ``ln((1 + N) / (1 + df)) + 1``, so these weights are not expected to
    match ``TfidfVectorizer`` output.

    Args:
        tokenized_docs: Sized collection of documents, each an iterable of
            hashable tokens.

    Returns:
        dict mapping each distinct token to its IDF weight. Empty when the
        corpus contains no tokens.
    """
    N = len(tokenized_docs)

    # Document frequency in a single pass. Iterating set(doc) makes a term
    # count once per document no matter how often it repeats, and avoids
    # rescanning every document list for every vocabulary word.
    doc_freq = {}
    for doc in tokenized_docs:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # Every term here has df >= 1, hence N >= 1, so the log argument is > 0.
    return {word: math.log(N / (1 + df)) + 1 for word, df in doc_freq.items()}

def compute_tfidf(tf, idf):
    """Multiply each term frequency by the matching IDF weight.

    Args:
        tf: dict of term -> term frequency for one document.
        idf: dict of term -> IDF weight; must contain every key of ``tf``.

    Returns:
        dict of term -> TF-IDF score, limited to the terms present in ``tf``.

    Raises:
        KeyError: if a term in ``tf`` has no entry in ``idf``.
    """
    scores = {}
    for term, freq in tf.items():
        scores[term] = freq * idf[term]
    return scores

# Manual pipeline: tokenize -> per-document TF -> corpus-wide IDF -> TF-IDF.
tokenized_docs = list(map(tokenize, corpus))

tf_list = list(map(compute_tf, tokenized_docs))

idf = compute_idf(tokenized_docs)

tfidf_manual = [compute_tfidf(doc_tf, idf) for doc_tf in tf_list]

# Sorted vocabulary gives the table a stable, alphabetical column order.
all_words = sorted({term for scores in tfidf_manual for term in scores})

# One row per document; a term absent from a document is shown as 0.
tfidf_manual_df = pd.DataFrame(
    [
        [round(scores.get(term, 0), 3) for term in all_words]
        for scores in tfidf_manual
    ],
    columns=all_words,
)

print("Manual TF-IDF")
print(tfidf_manual_df)

# scikit-learn CountVectorizer: raw term counts, one row per document.
# The printed table has no 'a' column because the default token pattern
# only keeps tokens of two or more characters.

print("\nCountVectorizer")
vectorizer = CountVectorizer()
X_count = vectorizer.fit_transform(corpus)
df_count = pd.DataFrame(
    data=X_count.toarray(),  # densify the sparse matrix for display
    columns=vectorizer.get_feature_names_out(),
)
print(df_count)

# scikit-learn TfidfVectorizer: library TF-IDF weights with default settings,
# shown next to the manual table for comparison.

print("\nTfidfVectorizer")
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(corpus)
df_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vect.get_feature_names_out(),
)
print(df_tfidf.round(decimals=3))


Manual TF-IDF
     a    and    are  bodies  celestial   is   moon  satellite   star    sun  \
0  0.2  0.000  0.000   0.000      0.000  0.2  0.000      0.000  0.281  0.200   
1  0.2  0.000  0.000   0.000      0.000  0.2  0.200      0.281  0.000  0.000   
2  0.0  0.201  0.201   0.201      0.201  0.0  0.143      0.000  0.000  0.143   

     the  
0  0.142  
1  0.142  
2  0.102  

CountVectorizer
   and  are  bodies  celestial  is  moon  satellite  star  sun  the
0    0    0       0          0   1     0          0     1    1    1
1    0    0       0          0   1     1          1     0    0    1
2    1    1       1          1   0     1          0     0    1    1

TfidfVectorizer
     and    are  bodies  celestial    is   moon  satellite   star    sun  \
0  0.000  0.000   0.000      0.000  0.48  0.000      0.000  0.632  0.480   
1  0.000  0.000   0.000      0.000  0.48  0.480      0.632  0.000  0.000   
2  0.426  0.426   0.426      0.426  0.00  0.324      0.000  0.000  0.324   

     the  