In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Toy corpus: three short documents to index
corpus = [
    "shipment of gold damaged in a fire",
    "delivery of silver arrived in a silver truck",
    "shipment of gold arrived in a truck",
]

# Free-text query that will be scored against every document
query = "gold silver truck"

# Fit the vectorizer on the corpus and build the sparse document-term
# TF-IDF matrix in one step.
# Note: with default settings the one-letter token "a" does not end up in
# the vocabulary (see the feature names printed below).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Map the query into the vocabulary that was learned from the corpus
query_vector = vectorizer.transform([query])

# One dot product per document (documents as rows, query as a column).
# TfidfVectorizer L2-normalises rows by default, so each value is also the
# cosine similarity between the query and that document.
scores = X @ query_vector.T

# Dense copies, only for readable printing of this tiny example
tfidf_matrix = X.toarray()
query_tfidf_vector = query_vector.toarray()

# Each report is a heading followed by its value on the next line
print("TF-IDF Matrix (Corpus):", tfidf_matrix, sep="\n")
print("\nQuery TF-IDF Vector:", query_tfidf_vector, sep="\n")

# Column order of the matrices above (terms in the vocabulary)
print("\nFeature Names (Vocabulary):", vectorizer.get_feature_names_out(), sep="\n")

# Final relevance of each document to the query
print("\nSimilarity Scores (Query with each document):", scores.toarray(), sep="\n")



TF-IDF Matrix (Corpus):
[[0.         0.50935267 0.         0.50935267 0.38737583 0.30083189
  0.30083189 0.38737583 0.         0.        ]
 [0.29048754 0.         0.38195621 0.         0.         0.22558949
  0.22558949 0.         0.76391242 0.29048754]
 [0.43826859 0.         0.         0.         0.43826859 0.34035465
  0.34035465 0.43826859 0.         0.43826859]]

Query TF-IDF Vector:
[[0.         0.         0.         0.         0.51785612 0.
  0.         0.         0.68091856 0.51785612]]

Feature Names (Vocabulary):
['arrived' 'damaged' 'delivery' 'fire' 'gold' 'in' 'of' 'shipment'
 'silver' 'truck']

Similarity Scores (Query with each document):
[[0.20060494]
 [0.6705929 ]
 [0.45392014]]


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three short documents
corpus = [
    'the cat in the hat',
    'the quick brown fox',
    'the lazy dog',
]

# Fit the vectorizer on the corpus and build the sparse TF-IDF matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Densify for display; rows are documents, columns follow the vocabulary order
tfidf_matrix = X.toarray()
print("TF-IDF Matrix:", tfidf_matrix, sep="\n")

# Vocabulary terms, in the same order as the matrix columns
print("\nFeature Names (Vocabulary):", vectorizer.get_feature_names_out(), sep="\n")


TF-IDF Matrix:
[[0.         0.4769856  0.         0.         0.4769856  0.4769856
  0.         0.         0.56343076]
 [0.54645401 0.         0.         0.54645401 0.         0.
  0.         0.54645401 0.32274454]
 [0.         0.         0.65249088 0.         0.         0.
  0.65249088 0.         0.38537163]]

Feature Names (Vocabulary):
['brown' 'cat' 'dog' 'fox' 'hat' 'in' 'lazy' 'quick' 'the']


In [3]:
import math

# Step 1: Calculate Term Frequency (TF)
def compute_tf(word_dict, document):
    """Convert raw word counts into term frequencies for one document.

    Args:
        word_dict: Mapping of word -> number of occurrences.
        document: Sized collection whose length is the denominator
            (callers in this file pass the document's token list).

    Returns:
        dict mapping every word in ``word_dict`` to ``count / len(document)``.
    """
    total_terms = float(len(document))
    return {
        term: occurrences / total_terms
        for term, occurrences in word_dict.items()
    }

# Step 2: Calculate Inverse Document Frequency (IDF)
def compute_idf(documents):
    """Compute the inverse document frequency of every term in the corpus.

    Args:
        documents: List of per-document word-count dicts (term -> count),
            such as those produced by ``create_word_dict``.

    Returns:
        dict mapping each term to ``log2(N / df)``, where ``N`` is the number
        of documents and ``df`` is the number of documents in which the term
        has a count greater than zero. A term whose count is zero in every
        document gets an IDF of 0.0 instead of raising ``ZeroDivisionError``.
        Keys appear in the order the terms are first seen.
    """
    N = len(documents)  # Total number of documents

    # Document frequency: in how many documents does each term occur?
    doc_freq = {}
    for document in documents:
        for word, count in document.items():
            # Register the term even when its count is zero so it still
            # appears in the result.
            doc_freq.setdefault(word, 0)
            if count > 0:
                doc_freq[word] += 1

    idf_dict = {}
    for word, df in doc_freq.items():
        # df == 0 means the term never occurs with a positive count; dividing
        # by it would crash, so give the term zero weight (the same value
        # compute_tf_idf assigns to terms missing from the IDF table).
        idf_dict[word] = math.log2(N / float(df)) if df else 0.0
    return idf_dict

# Step 3: Calculate TF-IDF
def compute_tf_idf(tf, idf):
    """Weight term frequencies by their inverse document frequencies.

    Args:
        tf: Mapping of term -> term frequency.
        idf: Mapping of term -> inverse document frequency.

    Returns:
        dict with the same keys as ``tf``; each value is ``tf[term] * idf[term]``
        when the term has an IDF entry, and 0 otherwise.
    """
    return {
        term: (frequency * idf[term] if term in idf else 0)
        for term, frequency in tf.items()
    }

# Helper function to create word dictionary from document
def create_word_dict(doc):
    """Count how often each whitespace-separated token occurs in ``doc``.

    Args:
        doc: Document text as a single string.

    Returns:
        dict mapping token -> occurrence count, in first-seen order.
    """
    counts = {}
    for token in doc.split():
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

# Function to calculate query TF-IDF score for a document
def calculate_query_score(query_tf_idf, doc_tf_idf):
    """Score a document against a query as a sparse dot product.

    Args:
        query_tf_idf: Mapping of term -> TF-IDF weight for the query.
        doc_tf_idf: Mapping of term -> TF-IDF weight for the document.

    Returns:
        float: sum of ``query_weight * document_weight`` over the terms
        present in both mappings (0.0 when they share no terms).
    """
    total = 0.0
    for term, query_weight in query_tf_idf.items():
        # Terms absent from the document contribute nothing
        if term not in doc_tf_idf:
            continue
        total += query_weight * doc_tf_idf[term]
    return total

# Corpus: three short documents
doc1 = "shipment of gold damaged in a fire"
doc2 = "delivery of silver arrived in a silver truck"
doc3 = "shipment of gold arrived in a truck"
documents = [doc1, doc2, doc3]

# Raw word counts, one dict per document
word_dicts = list(map(create_word_dict, documents))

# Term frequencies: counts normalised by each document's token count
tf_docs = [
    compute_tf(counts, text.split())
    for counts, text in zip(word_dicts, documents)
]

# Inverse document frequencies over the whole corpus
idf = compute_idf(word_dicts)

# TF-IDF weights for every document
tf_idf_docs = [compute_tf_idf(doc_tf, idf) for doc_tf in tf_docs]

# Push the query through the same pipeline, reusing the corpus IDF so the
# query and document weights live on the same scale
query = "gold silver truck"
query_word_dict = create_word_dict(query)
tf_query = compute_tf(query_word_dict, query.split())
tf_idf_query = compute_tf_idf(tf_query, idf)

# Report the per-term weights of each document
print("TF-IDF scores for each word in each document:")
for i, weights in enumerate(tf_idf_docs):
    print(f"\nDocument {i + 1}:")
    for word, score in weights.items():
        print(f"{word}: {score:.4f}")

# Score every document against the query, keeping (document number, score)
print("\nTF-IDF of the query with respect to each document:")
document_scores = [
    (position + 1, calculate_query_score(tf_idf_query, weights))
    for position, weights in enumerate(tf_idf_docs)
]
for i, (_, score) in enumerate(document_scores):
    print(f"Document {i + 1} Query TF-IDF Score: {score:.4f}")

# Highest score first; document_scores itself stays in corpus order
ranked_documents = sorted(document_scores, key=lambda entry: entry[1], reverse=True)

# Final ranking
print("\nDocuments ranked based on relevance to the query:")
for doc_num, score in ranked_documents:
    print(f"Document {doc_num}: Score = {score:.4f}")


TF-IDF scores for each word in each document:

Document 1:
shipment: 0.0836
of: 0.0000
gold: 0.0836
damaged: 0.2264
in: 0.0000
a: 0.0000
fire: 0.2264

Document 2:
delivery: 0.1981
of: 0.0000
silver: 0.3962
arrived: 0.0731
in: 0.0000
a: 0.0000
truck: 0.0731

Document 3:
shipment: 0.0836
of: 0.0000
gold: 0.0836
arrived: 0.0836
in: 0.0000
a: 0.0000
truck: 0.0836

TF-IDF of the query with respect to each document:
Document 1 Query TF-IDF Score: 0.0163
Document 2 Query TF-IDF Score: 0.2236
Document 3 Query TF-IDF Score: 0.0326

Documents ranked based on relevance to the query:
Document 2: Score = 0.2236
Document 3: Score = 0.0326
Document 1: Score = 0.0163
