## TF-IDF from scratch

In [24]:
'''Import Numpy'''
import numpy as np

''''' Example document '''''
document = "The quick brown fox jumps over the lazy dog. The quick brown fox is a good jumper."

'''Split document into individual words
'''
words = document.lower().split()


'''Getting vocabolary'''
unique_words = list(set(words))

'''Count number of occurrences of each word in document
'''
word_counts = [words.count(word) for word in unique_words]

'''Calculate term frequency (TF)
'''
tf = np.array(word_counts) / len(words)

'''Calculate document frequency (DF)
'''
df = np.array([np.sum([word in d.lower().split() for d in [document]]) for word in unique_words])

''' Calculate inverse document frequency (IDF)
'''
idf = np.log(len(words) / (1 + df))

''' Calculate TF-IDF
'''
tf_idf = tf * idf

# Print results
print("Words:", unique_words)
print("TF:", tf)
print("DF:", df)
print("IDF:", idf)
print("TF-IDF:", tf_idf)


Words: ['jumper.', 'good', 'jumps', 'the', 'quick', 'over', 'lazy', 'dog.', 'is', 'fox', 'a', 'brown']
TF: [0.05882353 0.05882353 0.05882353 0.17647059 0.11764706 0.05882353
 0.05882353 0.05882353 0.05882353 0.11764706 0.05882353 0.11764706]
DF: [1 1 1 1 1 1 1 1 1 1 1 1]
IDF: [2.14006616 2.14006616 2.14006616 2.14006616 2.14006616 2.14006616
 2.14006616 2.14006616 2.14006616 2.14006616 2.14006616 2.14006616]
TF-IDF: [0.12588624 0.12588624 0.12588624 0.37765873 0.25177249 0.12588624
 0.12588624 0.12588624 0.12588624 0.25177249 0.12588624 0.25177249]


## TF-IDF using Sklearn

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the corpus
corpus = [
    "This is the first document.",
    "This is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Create the TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the corpus
vectorizer.fit(corpus)

# Transform the corpus into a TF-IDF matrix
tfidf_matrix = vectorizer.transform(corpus)

# Print the TF-IDF matrix

print(tfidf_matrix.toarray())


[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.42796959 0.         0.34989318 0.         0.67049706
  0.34989318 0.         0.34989318]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## TF-IDF using Gensim

In [44]:
from gensim import corpora
from gensim.models import TfidfModel

# Define the corpus
corpus = [
    "This is the first document.",
    "This is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Tokenize the corpus
tokenized_corpus = [doc.split() for doc in corpus]

# Create the dictionary from the tokenized corpus
dictionary = corpora.Dictionary(tokenized_corpus)

# Create the bag-of-words corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_corpus]

# Create the TF-IDF model
tfidf_model = TfidfModel(bow_corpus)

# Calculate the TF-IDF scores for the corpus
tfidf_corpus = tfidf_model[bow_corpus]

# Print the TF-IDF scores
for doc in tfidf_corpus:
    print(doc)


[(0, 0.5614561943922499), (1, 0.5614561943922499), (2, 0.5614561943922499), (3, 0.23302537487517574)]
[(0, 0.40251125658964493), (1, 0.40251125658964493), (3, 0.16705726536655122), (5, 0.8050225131792899)]
[(3, 0.11435555192640605), (6, 0.551061299883055), (7, 0.551061299883055), (8, 0.551061299883055), (9, 0.2755306499415275)]
[(2, 0.31622776601683794), (9, 0.31622776601683794), (10, 0.6324555320336759), (11, 0.6324555320336759)]


## Bag of words(BoW) from Scratch

In [45]:
import numpy as np

# Example documents
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The brown fox is quick and the blue dog is lazy.",
    "The sky is blue and the sun is bright."
]

# Create a vocabulary of all unique words in the documents
vocabulary = list(set(word for doc in documents for word in doc.split()))

# Create a numpy array to store the bag-of-words representation of each document
bag_of_words = np.zeros((len(documents), len(vocabulary)), dtype=np.int32)

# Iterate over each document and update its bag-of-words representation
for i, doc in enumerate(documents):
    words = doc.split()
    for j, word in enumerate(vocabulary):
        count = words.count(word)
        bag_of_words[i, j] = count

# Print the vocabulary and bag-of-words representation for each document
print("Vocabulary:", vocabulary)
for i, doc in enumerate(documents):
    print(f"Document {i+1}: {doc}")
    print("Bag-of-words:", bag_of_words[i])


Vocabulary: ['blue', 'sun', 'dog', 'jumps', 'lazy.', 'sky', 'bright.', 'quick', 'The', 'the', 'over', 'lazy', 'dog.', 'is', 'fox', 'and', 'brown']
Document 1: The quick brown fox jumps over the lazy dog.
Bag-of-words: [0 0 0 1 0 0 0 1 1 1 1 1 1 0 1 0 1]
Document 2: The brown fox is quick and the blue dog is lazy.
Bag-of-words: [1 0 1 0 1 0 0 1 1 1 0 0 0 2 1 1 1]
Document 3: The sky is blue and the sun is bright.
Bag-of-words: [1 1 0 0 0 1 1 0 1 1 0 0 0 2 0 1 0]


## BOW using Sklearn

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

# Example documents
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The brown fox is quick and the blue dog is lazy.",
    "The sky is blue and the sun is bright."
]

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the documents and transform the documents into a bag-of-words representation
bag_of_words = vectorizer.fit_transform(documents)

# Print the vocabulary and bag-of-words representation for each document
print("Vocabulary:", vectorizer.get_feature_names())
for i, doc in enumerate(documents):
    print(f"Document {i+1}: {doc}")
    print("Bag-of-words:", bag_of_words[i].toarray())


Vocabulary: ['and', 'blue', 'bright', 'brown', 'dog', 'fox', 'is', 'jumps', 'lazy', 'over', 'quick', 'sky', 'sun', 'the']
Document 1: The quick brown fox jumps over the lazy dog.
Bag-of-words: [[0 0 0 1 1 1 0 1 1 1 1 0 0 2]]
Document 2: The brown fox is quick and the blue dog is lazy.
Bag-of-words: [[1 1 0 1 1 1 2 0 1 0 1 0 0 2]]
Document 3: The sky is blue and the sun is bright.
Bag-of-words: [[1 1 1 0 0 0 2 0 0 0 0 1 1 2]]


## The end 