## Bag Of Words


In [1]:
from collections import Counter

# Given list of sentences
sentences = [
    "it was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness"
]

# Step 1: List all unique words
all_words = ' '.join(sentences).split()
unique_words = list(set(all_words))

# Step 2: Create word frequency table
word_freq = Counter(all_words)
sorted_word_freq = dict(sorted(word_freq.items(), key=lambda item: item[1], reverse=True))

# Step 3: Create document vectors
document_vectors = []
for sentence in sentences:
    sentence_words = sentence.split()
    document_vector = [1 if word in sentence_words else 0 for word in sorted_word_freq.keys()]
    document_vectors.append(document_vector)

# Print the results
print("Step 1: List of Unique Words:", unique_words)
print("\nStep 2: Word Frequency Table:")
print("Word\tFrequency")
for word, freq in sorted_word_freq.items():
    print(f"{word}\t{freq}")

print("\nStep 3: Document Vectors:")
for sentence, vector in zip(sentences, document_vectors):
    print(f"Sentence: {sentence}\nVector: {vector}\n")

Step 1: List of Unique Words: ['it', 'the', 'worst', 'of', 'was', 'age', 'times', 'best', 'foolishness', 'wisdom']

Step 2: Word Frequency Table:
Word	Frequency
it	4
was	4
the	4
of	4
times	2
age	2
best	1
worst	1
wisdom	1
foolishness	1

Step 3: Document Vectors:
Sentence: it was the best of times
Vector: [1, 1, 1, 1, 1, 0, 1, 0, 0, 0]

Sentence: it was the worst of times
Vector: [1, 1, 1, 1, 1, 0, 0, 1, 0, 0]

Sentence: it was the age of wisdom
Vector: [1, 1, 1, 1, 0, 1, 0, 0, 1, 0]

Sentence: it was the age of foolishness
Vector: [1, 1, 1, 1, 0, 1, 0, 0, 0, 1]



## TF-IDF

In [2]:
import pandas as pd
import math
from collections import Counter

def compute_tf(document):
    word_count = Counter(document)
    tf = {word: count/len(document) for word, count in word_count.items()}
    return tf

def compute_idf(documents):
    N = len(documents)
    idf = {}
    all_words = set(word for doc in documents for word in doc)
    for word in all_words:
        count = sum(1 for doc in documents if word in doc)
        idf[word] = math.log(N/count)
    return idf

def compute_tfidf(document, idf):
    tfidf = {}
    tf = compute_tf(document)
    for word, tf_value in tf.items():
        tfidf[word] = tf_value * idf[word]
    return tfidf

# New data
data = [
    "A quick brown fox jumps over the lazy dog What a fox",
    "A quick brown fox jumps over the lazy fox What a fox"
]

# Split data into tokens
documents = [doc.split() for doc in data]

# Compute TF for each document
tf_data = [compute_tf(doc) for doc in documents]

# Create DataFrame for TF
tf_df = pd.DataFrame(tf_data).fillna(0)
print("TF Scores:")
print(tf_df)

# Compute IDF
idf = compute_idf(documents)
idf_df = pd.DataFrame([idf]).fillna(0)
print("\nIDF Scores:")
print(idf_df)

# Compute TF-IDF for each document
tfidf_data = [compute_tfidf(doc, idf) for doc in documents]

# Create DataFrame for TF-IDF
tfidf_df = pd.DataFrame(tfidf_data).fillna(0)
print("\nTF-IDF Scores:")
print(tfidf_df)

TF Scores:
          A     quick     brown       fox     jumps      over       the  \
0  0.083333  0.083333  0.083333  0.166667  0.083333  0.083333  0.083333   
1  0.083333  0.083333  0.083333  0.250000  0.083333  0.083333  0.083333   

       lazy       dog      What         a  
0  0.083333  0.083333  0.083333  0.083333  
1  0.083333  0.000000  0.083333  0.083333  

IDF Scores:
   quick  fox  brown  the       dog    a    A  jumps  What  over  lazy
0    0.0  0.0    0.0  0.0  0.693147  0.0  0.0    0.0   0.0   0.0   0.0

TF-IDF Scores:
     A  quick  brown  fox  jumps  over  the  lazy       dog  What    a
0  0.0    0.0    0.0  0.0    0.0   0.0  0.0   0.0  0.057762   0.0  0.0
1  0.0    0.0    0.0  0.0    0.0   0.0  0.0   0.0  0.000000   0.0  0.0
