# Vector Space Model Implementation

**🔗 GitHub Repository:** [https://github.com/aarushroyy/IR-Vector-Space-Model](https://github.com/aarushroyy/IR-Vector-Space-Model)

In [74]:
%pip install nltk matplotlib

import os
import math
from collections import defaultdict
import re
import nltk
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from datetime import datetime
nltk.download('all')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize




[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\aarus\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\aarus\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\aarus\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\aarus\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\aarus\AppData\Roaming\nltk_data...
[

In [75]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aarus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aarus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aarus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\aarus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [76]:
def preprocess(text):
    # doing lowercase
    text = text.lower()
    
    # removing special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # removing numbers
    text = re.sub(r'\d+', '', text)
    
    # tokenizing the words
    text = word_tokenize(text)
    
    # removing stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    
    # stemming here
    stemmer = nltk.stem.PorterStemmer()
    text = [stemmer.stem(word) for word in text]
    
    text_with_soundex = []
    for word in text:
        text_with_soundex.append(word)  # the original word
        soundex_code = soundex(word)    # the Soundex version
        if soundex_code != word:        # avoiding duplicates
            text_with_soundex.append(soundex_code)
    
    return text_with_soundex

In [77]:
def soundex(word):
    if not word:
        return ""
    
    word = word.upper()
    
    # first letter
    soundex_code = word[0]
    
    # replacing consonants with digits
    mapping = {
        'B': '1', 'F': '1', 'P': '1', 'V': '1',
        'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
        'D': '3', 'T': '3',
        'L': '4',
        'M': '5', 'N': '5',
        'R': '6'
    }
    
    # applying mapping
    for char in word[1:]:
        if char in mapping:
            code = mapping[char]
            # no duplicate consecutive codes together
            if soundex_code[-1] != code:
                soundex_code += code
    
    # removing vowels
    vowels = 'AEIOUYHW'
    filtered_code = soundex_code[0]  # again keeping first letter
    for char in soundex_code[1:]:
        if char not in vowels:
            filtered_code += char
    
    # adding with zeros or truncate to 4 characters
    filtered_code = (filtered_code + '000')[:4]
    
    return filtered_code

In [78]:
# reading documents from the dir 
def read_documents(directory):
    documents = {}
    try:
        for filename in os.listdir(directory):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                tokenized = preprocess(text)
                documents[filename] = tokenized
    except Exception as e:
        print('An error occurred:', e)
    return documents

In [79]:
# creating the dictionary of all words in the documents
def create_dictionary(documents):
    dictionary = set()
    for document in documents.values():
        dictionary.update(document)
    return dictionary

In [80]:
# computing term frequency

def term_frequency(documents):
    tf = defaultdict(lambda: defaultdict(int))
    for filename, tokens in documents.items():
        for token in tokens:
            tf[filename][token] += 1
    return tf

# calculating weighted term frequency

def weighted_term_frequency(tf):
    return 1 + math.log10(tf) if tf > 0 else 0

In [81]:
def calculate_document_frequencies(posting_list):
    # implementing document frequency calculation
    document_frequencies = {}
    for term, postings in posting_list.items():
        document_frequencies[term] = len(postings)
    return document_frequencies


In [82]:
# computing the inverse document frequency
def postings_list(documents, unique_words):
    # initialize postings list as a defaultdict of lists
    postings = defaultdict(list)
    
    # compute term frequency for the documents
    tf = term_frequency(documents)
    
    # iterate over each unique word
    for word in unique_words:
        # iterate over each document and its tokens
        for filename, tokens in documents.items():
            # check if the word is in the document tokens
            if word in tokens:
                try:
                    # calculate weighted term frequency
                    wt_tf = weighted_term_frequency(tf[filename][word])
                except KeyError:
                    # handle case where term is not found in the document
                    print(f"Warning: Term '{word}' not found in document {filename}")
                    wt_tf = 0
                # append the filename and weighted term frequency to the postings list
                postings[word].append((filename, wt_tf))
    
    return postings

In [83]:
# computing the document length

def doc_length(tf):
    doc_lengths = defaultdict(float)
    for filename, terms in tf.items():
        length = 0
        for term, freq in terms.items():
            length += (1 + math.log10(freq)) ** 2
        doc_lengths[filename] = math.sqrt(length)
    return doc_lengths

In [84]:
# calculating cosine similarity for docuement
def cosine_similarity(query_wt, doc_wt, doc_len, doc_id):
    similarity = {}

    # calculating dot product now of query and the document weights
    dot_product = 0
    for term in query_wt:
        if term in doc_wt:
            dot_product += query_wt[term] * doc_wt[term]
    
    # calculating magnitude of the query vector
    query_magnitude = 0
    for weight in query_wt.values():
        query_magnitude += weight ** 2
    query_magnitude = math.sqrt(query_magnitude)
    
    # checking for zero division and calculating cosine similarity
    doc_length = doc_len.get(doc_id, 0)
    denominator = query_magnitude * doc_length
    
    if denominator == 0:
        similarity[doc_id] = 0.0  # no similarity if either vector is zero
    else:
        similarity[doc_id] = dot_product / denominator
    
    return similarity


In [85]:
def calculate_tf(freq):
    # calculating term frequency using log normalization
    return 1 + math.log10(freq) if freq > 0 else 0

def calculate_idf(df, N):
    # calculating inverse document frequency
    return math.log10(N / df)

def rank_documents(documents, query, posting_list, document_frequencies, unique_words):
    N = len(documents)  # for total number of documents
    query_tokens = preprocess(query)  # preprocessing the query
    
    # handling the empty query case since zero division error bug
    if not query_tokens:
        return []
    
    unique_words_query = set(query_tokens)  # unique words in the query
    unique_words = unique_words.union(unique_words_query)  # combine unique words from documents and query
    query_vector = {}

    # calculating query tf-idf weights (ltc scheme)
    for word in unique_words:
        tf = query_tokens.count(word)  # calculating term frequency in the query
        df = document_frequencies.get(word, 0)  # here for the document frequency of the term
        if df > 0 and tf > 0:
            idf = calculate_idf(df, N)  # inverse document frequency
            query_vector[word] = calculate_tf(tf) * idf  # tf-idf weight for the query term

    # Calculate document lengths (for cosine similarity)
    doc_lengths = defaultdict(float)
    for word, postings in posting_list.items():
        for doc, log_tf in postings:
            doc_lengths[doc] += log_tf ** 2
    for doc in doc_lengths:
        doc_lengths[doc] = math.sqrt(doc_lengths[doc])  # Finalize document lengths

    # for calculating cosine similarities
    similarities = {}
    for doc_name in documents.keys():
        doc_vector = {}
        for word in unique_words:
            posting = posting_list.get(word, [])
            for doc, log_tf in posting:
                if doc == doc_name:
                    doc_vector[word] = log_tf  # documenting vector for the term

        # compute cosine similarity
        similarity = cosine_similarity(query_vector, doc_vector, doc_lengths, doc_name)
        similarities.update(similarity)

    # Sort documents by similarity and return all ranked documents
    # ranked_docs = sorted(similarities.items(), key=lambda item: (-item[1], item[0]))
    
    def extract_doc_id(filename):
        numbers = re.findall(r'\d+', filename)
        return int(numbers[0]) if numbers else 0

    # Sort by similarity (descending) then by docID (ascending)
    ranked_docs = sorted(similarities.items(), 
                        key=lambda item: (-item[1], extract_doc_id(item[0])))
    return ranked_docs



In [86]:
def save_results_as_png(query, ranked_docs, output_dir='query_results'):
    # for the output dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # check data for visualisation
    top_10_docs = ranked_docs[:10]
    if not top_10_docs:
        # no results image
        fig, ax = plt.subplots(figsize=(12, 8))
        ax.text(0.5, 0.5, f'No results found for query: "{query}"', 
                horizontalalignment='center', verticalalignment='center',
                fontsize=16, transform=ax.transAxes)
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis('off')
    else:
        # extracting doc names and scores
        doc_names = [doc[0].replace('.txt', '') for doc in top_10_docs]
        scores = [doc[1] for doc in top_10_docs]
        
        fig, ax = plt.subplots(figsize=(14, 10))
        
        # the bar chart
        y_pos = range(len(doc_names))
        bars = ax.barh(y_pos, scores, color='skyblue', edgecolor='navy', linewidth=1.2)
        
        # ordering the chart
        ax.set_yticks(y_pos)
        ax.set_yticklabels(doc_names, fontsize=10)
        ax.invert_yaxis() 
        ax.set_xlabel('Cosine Similarity Score', fontsize=12, fontweight='bold')
        ax.set_ylabel('Documents', fontsize=12, fontweight='bold')
        ax.set_title(f'Top 10 Document Rankings for Query: "{query}"', 
                     fontsize=14, fontweight='bold', pad=20)
        
        # adding score labels
        for i, (bar, score) in enumerate(zip(bars, scores)):
            width = bar.get_width()
            if scores:  # scores shuldn't be empty
                ax.text(width + max(scores) * 0.01, bar.get_y() + bar.get_height()/2, 
                       f'{score:.4f}', ha='left', va='center', fontsize=9)
        
        ax.grid(axis='x', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)
        
        # padding
        if scores:
            ax.set_xlim(0, max(scores) * 1.15)
        else:
            ax.set_xlim(0, 1)
    
    # query info
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    fig.text(0.02, 0.02, f'Generated on: {timestamp}', fontsize=8, style='italic')
    
    # saving the figure
    safe_query = re.sub(r'[^\w\s-]', '', query).strip()
    safe_query = re.sub(r'[-\s]+', '_', safe_query)
    timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f'query_{safe_query}_{timestamp_str}.png'
    filepath = os.path.join(output_dir, filename)
    
    plt.tight_layout()
    plt.savefig(filepath, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close() 
    
    return filepath

In [87]:
def index_corpus(corpus_path):
    documents = read_documents(corpus_path)
    unique_words = create_dictionary(documents)
    posting_list = postings_list(documents, unique_words)
    document_frequencies = calculate_document_frequencies(posting_list)
    
    return documents, posting_list, document_frequencies, unique_words

def search(query, documents, posting_list, document_frequencies, unique_words, save_png=True, output_dir='query_results'):
    ranked_docs = rank_documents(documents, query, posting_list, document_frequencies, unique_words)
    
    # saving the results as png
    saved_filepath = None
    if save_png:
        saved_filepath = save_results_as_png(query, ranked_docs, output_dir)
    
    # formatting the output
    formatted_results = []
    for filename, score in ranked_docs[:10]:  # getting top 10 results
        formatted_results.append(f"{filename}: {score:.4f}")
    
    return formatted_results, saved_filepath

# usage example
corpus_path = 'corpus'
documents, posting_list, document_frequencies, unique_words = index_corpus(corpus_path)

while True:
    query = input("Enter your query (exit to end): ")

    if query == 'exit':
        break
    
    ranked_docs, saved_file = search(query, documents, posting_list, document_frequencies, unique_words)

    # printing the results
    print(f"Ranked document by relevance to query: \n{query}\n")
    for result in ranked_docs:
        print(result)
    
    # information about saved image
    if saved_file:
        print(f"\nResults visualization saved as: {saved_file}")
    else:
        print("\nNo visualization was saved")
        
    print("\n")


Ranked document by relevance to query: 
Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

zomato.txt: 0.2171
swiggy.txt: 0.1444
youtube.txt: 0.0694
messenger.txt: 0.0681
instagram.txt: 0.0643
paypal.txt: 0.0584
Discord.txt: 0.0549
reddit.txt: 0.0534
bing.txt: 0.0508
Amazon.txt: 0.0486

Results visualization saved as: query_results\query_Developing_your_Zomato_business_account_and_profile_is_a_great_way_to_boost_your_restaurants_online_reputation_20250922_201849.png


Ranked document by relevance to query: 
Warwickshire, came from an ancient family and was the heiress to  some land 

shakespeare.txt: 0.1212
levis.txt: 0.0312
Amazon.txt: 0.0284
skype.txt: 0.0239
yahoo.txt: 0.0223
google.txt: 0.0210
flipkart.txt: 0.0204
blackberry.txt: 0.0191
whatsapp.txt: 0.0180
reliance.txt: 0.0171

Results visualization saved as: query_results\query_Warwickshire_came_from_an_ancient_family_and_was_the_heiress_to_some_land_20250922_201900.pn