In [19]:
"""Here we used the same testcases as given in the file in the blackboard"""


import math
from collections import defaultdict

class VectorSpaceModel:
    def __init__(self):
        # Dictionary where term maps to a list of (document ID, term frequency)
        self.index = defaultdict(list)
        # Length of each document for normalization
        self.document_lengths = defaultdict(float)
        # Total number of documents in the collection
        self.num_docs = 0
        # Mapping from document ID to document name (file name)
        self.doc_names = {}

    def index_document(self, doc_id, doc_name, word_list):
        """Index a document with its term frequencies and store the document's name."""
        term_freqs = defaultdict(int)
        self.num_docs += 1
        self.doc_names[doc_id] = doc_name

        # Count term frequencies
        for word in word_list:
            term_freqs[word] += 1

        # Add term frequencies to index
        for word, freq in term_freqs.items():
            self.index[word].append((doc_id, freq))

        # Compute document length for normalization
        length = sum((1 + math.log10(freq)) ** 2 for freq in term_freqs.values())
        self.document_lengths[doc_id] = math.sqrt(length)

    def compute_idf(self, word):
        """Calculate inverse document frequency (IDF) for a word."""
        doc_freq = len(self.index[word])
        if doc_freq == 0:
            return 0
        return math.log10(self.num_docs / doc_freq)

    def compute_tf_idf(self, term_freq, doc_freq):
        """Compute TF-IDF score based on term frequency and document frequency."""
        return (1 + math.log10(term_freq)) * self.compute_idf(doc_freq)

    def normalize_query(self, query_terms):
        """Create a normalized vector for the query using TF-IDF scores."""
        term_freqs = defaultdict(int)
        for term in query_terms:
            term_freqs[term] += 1

        query_vector = {}
        query_length = 0
        for term, freq in term_freqs.items():
            if term in self.index:
                tf_idf_value = (1 + math.log10(freq)) * self.compute_idf(term)
                query_vector[term] = tf_idf_value
                query_length += tf_idf_value ** 2

        query_length = math.sqrt(query_length)
        for term in query_vector:
            query_vector[term] /= query_length

        return query_vector

    def calculate_similarity(self, query_vector, doc_id):
        """Calculate cosine similarity between the query and a document."""
        similarity = 0
        for term, query_weight in query_vector.items():
            postings = self.index[term]
            for doc, term_freq in postings:
                if doc == doc_id:
                    doc_weight = 1 + math.log10(term_freq)
                    similarity += query_weight * doc_weight

        return similarity / self.document_lengths[doc_id]

    def rank_documents(self, query_terms):
        """Rank documents by similarity to the query and return the top 10 results."""
        query_vector = self.normalize_query(query_terms)
        doc_scores = defaultdict(float)

        for term in query_vector:
            postings = self.index[term]
            for doc_id, term_freq in postings:
                doc_scores[doc_id] += self.calculate_similarity(query_vector, doc_id)

        ranked_docs = sorted(doc_scores.items(), key=lambda x: (-x[1], x[0]))
        return [(self.doc_names[doc_id], score) for doc_id, score in ranked_docs[:10]]


# Function to load document content
def load_documents():
    doc_content = {
        "zomato.txt": "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation.",
        "instagram.txt": "Instagram is widely used for building business profiles and engaging with customers.",
        "swiggy.txt": "Swiggy can help you increase your online presence and attract more customers.",
        "messenger.txt": "Messenger platforms can be a powerful tool for customer engagement.",
        "whatsapp.txt": "WhatsApp allows for direct interaction with customers, increasing brand presence.",
        "shakespeare.txt": "Warwickshire, came from an ancient family and was the heiress to some land.",
        "nike.txt": "Nike has a long history of innovation and design in sportswear.",
        "volkswagen.txt": "Volkswagen is a global leader in automotive design.",
        "motorola.txt": "Motorola revolutionized the mobile phone industry with innovative products.",
        "paypal.txt": "PayPal provides a secure and easy way to handle online transactions.",
        "operating.txt": "Operating systems are fundamental to the functioning of computers."
    }

    return doc_content


# Main function to execute the process
def main():
    vsm = VectorSpaceModel()
    documents = load_documents()

    # Add each document to the vector space model
    for doc_id, (doc_name, content) in enumerate(documents.items(), start=1):
        words = content.lower().split()  # Tokenize content
        vsm.index_document(doc_id, doc_name, words)

    # Example queries
    queries = [
        "Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation",
        "Warwickshire, came from an ancient family and was the heiress to some land"
    ]

    for i, query in enumerate(queries, start=1):
        query_terms = query.lower().split()  # Tokenize the query
        ranked_docs = vsm.rank_documents(query_terms)
        print(f"Query {i}: {query}")
        for rank, (doc_name, score) in enumerate(ranked_docs, start=1):
            print(f"{rank}. ('{doc_name}', {score})")
        print("\n")


if __name__ == "__main__":
    main()


Query 1: Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
1. ('zomato.txt', 13.733532128933936)
2. ('paypal.txt', 1.0863426046257378)
3. ('swiggy.txt', 0.47561807136786993)
4. ('instagram.txt', 0.41642397045854473)
5. ('volkswagen.txt', 0.19662525431305533)
6. ('shakespeare.txt', 0.11952093253972164)
7. ('nike.txt', 0.11201056721619088)
8. ('operating.txt', 0.04491234885963554)
9. ('messenger.txt', 0.03320901719844228)


Query 2: Warwickshire, came from an ancient family and was the heiress to some land
1. ('shakespeare.txt', 11.011346010185404)
2. ('operating.txt', 0.2080586176747389)
3. ('paypal.txt', 0.13174687932980597)
4. ('zomato.txt', 0.1069482439155939)
5. ('motorola.txt', 0.058489961814339735)
6. ('nike.txt', 0.02468155040858179)
7. ('instagram.txt', 0.02363078542263073)
8. ('swiggy.txt', 0.02363078542263073)


