In [1]:
import nltk
from nltk.corpus import state_union
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import numpy as np

nltk.download('state_union')

# Load the State Union sample dataset
documents = [state_union.raw(fileid) for fileid in state_union.fileids()]

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()

# Create an inverted index
inverted_index = defaultdict(list)

# Iterate through each term and document to populate the inverted index
for i, term in enumerate(terms):
    for j in range(len(documents)):
        tfidf_score = tfidf_matrix[j, i]
        if tfidf_score > 0:
            inverted_index[term].append((j, tfidf_score))

# Convert lists to tuples in the inverted index
inverted_index = {term: tuple(postings) for term, postings in inverted_index.items()}

# Display the inverted index
for term, postings in inverted_index.items():
    print(f"{term}: {postings}")


[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\Aidan\AppData\Roaming\nltk_data...
[nltk_data]   Package state_union is already up-to-date!


000: ((1, 0.006073399224590919), (2, 0.007688638733319705), (6, 0.008975539294478611), (8, 0.037805134469650226), (9, 0.0034308654421594245), (10, 0.0028735082183169256), (14, 0.002371626163056365), (15, 0.004601881353726093), (16, 0.003799822073365352), (18, 0.004783395383629212), (19, 0.004089023278206572), (22, 0.0064445515963617245), (23, 0.007421002396659708), (24, 0.010484593158817694), (25, 0.012025837585685718), (28, 0.003066084661413502), (30, 0.004705366573188144), (31, 0.016192699291257603), (32, 0.014179518685514775), (33, 0.0026348437633122645), (35, 0.0038759657885434934), (37, 0.009273680917777336), (38, 0.007719500867805502), (40, 0.005840526654337413), (42, 0.003769593144763933), (43, 0.010746875890360103), (44, 0.0028107801498950423), (45, 0.0027887919739092554), (46, 0.0032680993820529843), (49, 0.00833676855202037), (50, 0.01523473522068027), (51, 0.005957637863187721), (52, 0.014748134455201867), (53, 0.017588711133391124), (54, 0.022551285749822608), (55, 0.024963

In [2]:
from collections import Counter

def convert_query_to_vector(query_terms, inverted_index, total_documents):
    query_vector = {}

    # Calculate document frequency (DF) for each query term
    df_query_terms = Counter(doc_idx for term in query_terms for doc_idx, _ in inverted_index.get(term, []))

    # Calculate IDF for each query term
    for term in query_terms:
        df = df_query_terms[term] if term in df_query_terms else 0
        idf = np.log((total_documents + 1) / (df + 1)) + 1  # Add 1 to avoid division by zero
        query_vector[term] = idf

    return query_vector

# Example usage:
query_terms = ["freedom", "justice", "equality"]
total_documents = len(documents)

query_vector = convert_query_to_vector(query_terms, inverted_index, total_documents)
print("Query Vector:", query_vector)


Query Vector: {'freedom': 5.189654742026425, 'justice': 5.189654742026425, 'equality': 5.189654742026425}


In [3]:
import numpy as np

def cosine_similarity(query_vector, inverted_index, total_documents):
    # Initialize a dictionary to store the dot product scores for each document
    doc_scores = {}

    # Iterate through each term in the query vector
    for term, idf_query in query_vector.items():
        # Retrieve the postings (document, TF-IDF score) for the current term from the inverted index
        postings = inverted_index.get(term, [])

        # Iterate through each posting and update the document scores
        for doc_idx, tfidf_doc in postings:
            if doc_idx not in doc_scores:
                doc_scores[doc_idx] = 0

            # Compute the dot product for the current term and update the document score
            doc_scores[doc_idx] += idf_query * tfidf_doc

    # Normalize the scores by the document vector length
    for doc_idx, score in doc_scores.items():
        doc_vector_length = np.linalg.norm([tfidf for _, tfidf in inverted_index['document']])
        doc_scores[doc_idx] /= doc_vector_length

    # Convert the result into a sorted list of document identifier and score pairs
    sorted_results = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)

    return sorted_results

# Example usage:
query_terms = ["freedom", "justice", "equality"]
total_documents = len(documents)

# Convert the query to a vector
query_vector = convert_query_to_vector(query_terms, inverted_index, total_documents)

# Perform the search and get the sorted results
search_results = cosine_similarity(query_vector, inverted_index, total_documents)

# Display the search results
for doc_idx, score in search_results:
    print(f"Document {doc_idx}: Score {score}")


Document 0: Score 16.041734462011796
Document 41: Score 14.178311757321207
Document 59: Score 14.157989322694128
Document 63: Score 11.948892624969714
Document 13: Score 11.72522390732414
Document 14: Score 11.270400895146748
Document 60: Score 11.202105491323756
Document 44: Score 10.867119895939052
Document 64: Score 10.726708624484406
Document 11: Score 10.50535444440982
Document 15: Score 10.297977058781834
Document 21: Score 10.100872357880279
Document 6: Score 10.026006380022325
Document 47: Score 9.44831316264573
Document 43: Score 8.57378556011498
Document 7: Score 7.873461650234768
Document 42: Score 7.842615821595019
Document 3: Score 7.812383013557693
Document 58: Score 7.247427903531135
Document 5: Score 7.115078241036869
Document 40: Score 6.989299532714738
Document 9: Score 6.664973095265504
Document 46: Score 6.618448990929625
Document 20: Score 5.953630412463163
Document 8: Score 5.916332490258222
Document 35: Score 5.859396363150262
Document 18: Score 5.724241168347565