In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]  

In [4]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

#Get feature names (terms)
feature_names = vectorizer.get_feature_names_out()

#Convert TF-IDF matrix to a dense array for better readability
dense_matrix = tfidf_matrix.todense()
tfidf_array = np.asarray(dense_matrix)

#Create a DataFrame for better visualization (optional)
import pandas as pd
df_tfidf = pd.DataFrame(data=tfidf_array, columns=feature_names)

In [5]:
print("TF-IDF Matri:")
print(df_tfidf)

TF-IDF Matri:
        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


In [12]:
query = "This is the second document."
query_vector = vectorizer.transform([query])

# Calculate cosine similarity between the query and documents
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)

#Get document indices sorted by similarity
doc_indices = np.argsort(cosine_similarities[0])[::-1]

# Print the most relevant documents
print("\nMost Relevant Documents:")
for i, idx in enumerate(doc_indices):
    print(f"{i + 1}. Document {idx + 1} - Similarity: {cosine_similarities[0][idx]}")


Most Relevant Documents:
1. Document 2 - Similarity: 0.9504966697449269
2. Document 4 - Similarity: 0.6042228489340653
3. Document 1 - Similarity: 0.6042228489340653
4. Document 3 - Similarity: 0.2803733831611419


In [24]:
import math
from collections import Counter

def calculate_tf(term_frequency):
    # Calculate term frequency (TF)
    total_terms = sum(term_frequency.values())
    tf = {term: freq / total_terms for term, freq in term_frequency.items()}
    return tf

def calculate_idf(documents):
    # Calculate inverse document frequency (IDF)
    total_documents = len(documents)
    document_frequency = Counter()

    for document in documents:
        unique_terms = set(document)
        document_frequency.update(unique_terms)

    idf = {term: math.log(total_documents / (document_frequency[term] + 1)) for term in document_frequency}
    return idf

def calculate_tfidf(documents):
    tfidf_matrix = []

    for document in documents:
        term_frequency = Counter(document)
        tf = calculate_tf(term_frequency)
        idf = calculate_idf(documents)

        # Calculate TF-IDF
        tfidf = {term: tf[term] * idf[term] for term in tf}
        tfidf_matrix.append(tfidf)

    return tfidf_matrix

# Example documents
documents = [
    ["this", "is", "the", "first", "document"],
    ["this", "document", "is", "the", "second", "document"],
    ["and", "this", "is", "the", "third", "one"],
    ["is", "this", "the", "first", "document"],
]

# Calculate TF-IDF
tfidf_result = calculate_tfidf(documents)

# Display TF-IDF matrix
for i, tfidf in enumerate(tfidf_result, start=1):
    print(f"Document {i} TF-IDF: {tfidf}")

Document 1 TF-IDF: {'this': -0.044628710262841945, 'is': -0.044628710262841945, 'the': -0.044628710262841945, 'first': 0.05753641449035617, 'document': 0.0}
Document 2 TF-IDF: {'this': -0.03719059188570162, 'document': 0.0, 'is': -0.03719059188570162, 'the': -0.03719059188570162, 'second': 0.11552453009332421}
Document 3 TF-IDF: {'and': 0.11552453009332421, 'this': -0.03719059188570162, 'is': -0.03719059188570162, 'the': -0.03719059188570162, 'third': 0.11552453009332421, 'one': 0.11552453009332421}
Document 4 TF-IDF: {'is': -0.044628710262841945, 'this': -0.044628710262841945, 'the': -0.044628710262841945, 'first': 0.05753641449035617, 'document': 0.0}
