In [1]:
import re

def process_text(text, stop_words=None):
    """Strip HTML tags from ``text`` and return its lowercase, non-stop-word tokens.

    Args:
        text: Raw text, possibly containing HTML markup.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, the module-level ``STOP_WORDS`` set is used.

    Returns:
        A list of lowercase tokens in document order. Tokens are maximal runs
        of ``\\w`` characters, so the list never contains empty strings.
    """
    if stop_words is None:
        stop_words = STOP_WORDS

    # Replace each tag with a space rather than nothing: otherwise words that
    # are separated only by markup ("<p>foo</p><p>bar</p>") fuse into "foobar".
    text = re.sub(r'<[^>]+>', ' ', text)

    # findall(r'\w+') is used instead of split(r'\W+') because split emits ''
    # whenever the text starts or ends with a non-word character; an empty
    # token would be indexed and would match everywhere when counted.
    # NOTE: apostrophes are non-word characters, so "don't" becomes "don", "t".
    tokens = (match.lower() for match in re.findall(r'\w+', text))
    return [token for token in tokens if token not in stop_words]


In [2]:
from collections import defaultdict

# Inverted index data structure: maps each word returned by process_text to
# the set of file paths in which index_document found that word.
# defaultdict(set) creates the empty set on first access to a new word.
# Re-running this cell rebinds the name to a fresh, empty index.
inverted_index = defaultdict(set)

def index_document(file_path):
    """Read one file and register its words in the global ``inverted_index``.

    The file is decoded as UTF-8, tokenised with ``process_text``, and
    ``file_path`` is added to the posting set of every distinct token.

    Indexing is best-effort: a file that is missing, unreadable, or not valid
    UTF-8 is reported on stdout and skipped, so one bad file does not abort
    the indexing of the others.

    Args:
        file_path: Path of the document to index.

    Returns:
        None.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return
    except (OSError, UnicodeDecodeError) as exc:
        # PermissionError / IsADirectoryError are OSError subclasses;
        # UnicodeDecodeError covers files saved in a non-UTF-8 encoding.
        print(f"Could not read {file_path}: {exc}")
        return

    # Deduplicate first: adding the same path once per occurrence is wasted work.
    for word in set(process_text(text)):
        inverted_index[word].add(file_path)


In [3]:
def search(query):
    """Return the set of indexed files that contain the query's words.

    The query is stripped and tokenised with ``process_text``. Tokens that do
    not appear in ``inverted_index`` are ignored; the result is the
    intersection of the posting sets of the tokens that do appear.

    Returns:
        A new set of file paths. It is empty when the query yields no tokens
        or when none of its tokens are present in the index.
    """
    terms = process_text(query.strip())
    if not terms:
        # Nothing searchable left (blank query or stop words only).
        return set()

    common = None
    for term in terms:
        if term not in inverted_index:
            # Unknown terms do not narrow the result.
            continue
        postings = inverted_index[term]
        if common is None:
            # Copy so the caller never receives the index's own set.
            common = set(postings)
        else:
            common = common.intersection(postings)

    if common is None:
        # No query term was found in the index at all.
        return set()
    return common




In [4]:
def rank_results(results, query):
    """Order matching files by how often the query's words occur in them.

    Each file is read once and tokenised with ``process_text``, the same
    normalisation used for the query and for indexing. Scoring is therefore
    case-insensitive, ignores HTML markup, and counts whole tokens rather
    than substrings.

    Args:
        results: Iterable of file paths, typically the set from ``search``.
        query: The raw query string.

    Returns:
        A list of the paths in ``results``, highest total occurrence count
        first. Ties are broken by path so the order is reproducible.
    """
    query_words = process_text(query)

    def relevance(file):
        # Read and tokenise once per file, not once per query word.
        tokens = process_text(text_from_file(file))
        return sum(tokens.count(word) for word in query_words)

    # Negated score gives descending relevance; the path keeps the order of
    # equally scored files deterministic, since ``results`` is usually a set.
    return sorted(results, key=lambda file: (-relevance(file), str(file)))

def text_from_file(file_path):
    """Return the full UTF-8 decoded contents of ``file_path``.

    A path that does not exist yields an empty string instead of raising.
    """
    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        return ""
    # The context manager closes the handle even if reading fails.
    with handle:
        contents = handle.read()
    return contents


In [5]:
# Words that process_text drops from both documents and queries.
# Kept as a module-level set (deduplicated, alphabetical) for O(1) membership tests.
# NOTE: process_text splits on non-word characters, so entries containing an
# apostrophe or a space ('no one') cannot equal any token it produces.
STOP_WORDS = {
    # a
    'a', 'about', 'above', 'accordingly', 'actually', 'after', 'again',
    'against', 'all', 'almost', 'also', 'am', 'an', 'and', 'any', 'anybody',
    'anyone', 'anything', 'are', "aren't", 'as', 'at',
    # b
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both',
    'but', 'by',
    # c
    'can', "can't", 'cannot', 'consequently', 'could', "couldn't",
    # d
    'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down',
    'during',
    # e
    'each', 'else', 'elsewhere', 'even', 'everybody', 'everyone',
    'everything',
    # f
    'few', 'for', 'from', 'further',
    # g
    'go',
    # h
    'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he',
    "he'd", "he'll", "he's", 'hence', 'her', 'here', "here's", 'hers',
    'herself', 'him', 'himself', 'his', 'how', "how's",
    # i
    'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't",
    'it', "it's", 'its', 'itself',
    # j
    'just',
    # l
    'let', "let's", 'like',
    # m
    'may', 'maybe', 'me', 'might', 'more', 'most', 'must', "mustn't", 'my',
    'myself',
    # n
    'no', 'no one', 'nobody', 'nor', 'not', 'nothing',
    # o
    'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours',
    'ourselves', 'out', 'over', 'own',
    # p
    'perhaps', 'probably',
    # q
    'quite',
    # r
    'rather', 'really',
    # s
    'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's", 'should',
    "shouldn't", 'so', 'some', 'someone', 'something', 'such',
    # t
    'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves',
    'then', 'there', "there's", 'therefore', 'these', 'they', "they'd",
    "they'll", "they're", "they've", 'this', 'those', 'though', 'through',
    'thus', 'to', 'too',
    # u
    'under', 'until', 'up',
    # v
    'very',
    # w
    'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'well', 'were',
    "weren't", 'what', "what's", 'when', "when's", 'whenever', 'where',
    "where's", 'whereas', 'which', 'while', 'whilst', 'who', "who's", 'whom',
    'whose', 'why', "why's", 'will', 'with', 'within', 'without', "won't",
    'would', "wouldn't",
    # y
    'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
    'yourselves',
}


In [None]:
def main(html_files=None, output_path='output.txt'):
    """Index a set of HTML files, then answer search queries interactively.

    Every query that produces results is printed and appended, together with
    its ranked results, to ``output_path``. The loop ends when the user types
    ``exit``.

    Args:
        html_files: Iterable of document paths to index. Defaults to the three
            sample pages this notebook was originally written against.
        output_path: File that successful queries are appended to.
    """
    if html_files is None:
        # Original hard-coded sample documents, kept as the default so that
        # calling main() with no arguments behaves as before.
        html_files = [
            r"C:\Users\prathamesh\Documents\page1.html",
            r"C:\Users\prathamesh\Documents\page2.html",
            r"C:\Users\prathamesh\Documents\page3.html"
        ]

    # Index the documents
    for file_path in html_files:
        index_document(file_path)
    print("Indexing completed. Ready to search.")

    # Append mode keeps results from earlier sessions. The encoding is explicit
    # so non-ASCII queries or paths do not depend on the platform's locale.
    with open(output_path, 'a', encoding='utf-8') as output_file:
        while True:
            query = input("Enter search query (or 'exit' to stop): ").strip()
            if query == 'exit':
                break

            if not query:
                print("No search query provided. Please enter a valid search query.")
                continue

            print(f"Searching for: '{query}'")
            results = search(query)
            if not results:
                print("No results found.")
                continue

            ranked_results = rank_results(results, query)
            print("Search results:", ranked_results)
            # Only queries that produced results are recorded.
            output_file.write(f"Query: {query}\nResults: {ranked_results}\n\n")

if __name__ == "__main__":
    main()



Indexing completed. Ready to search.
Enter search query (or 'exit' to stop): welcome
Searching for: 'welcome'
Search results: ['C:\\Users\\prathamesh\\Documents\\page1.html', 'C:\\Users\\prathamesh\\Documents\\page2.html', 'C:\\Users\\prathamesh\\Documents\\page3.html']
Enter search query (or 'exit' to stop): simple project
Searching for: 'simple project'
Search results: ['C:\\Users\\prathamesh\\Documents\\page1.html']
Enter search query (or 'exit' to stop): page three
Searching for: 'page three'
Search results: ['C:\\Users\\prathamesh\\Documents\\page2.html', 'C:\\Users\\prathamesh\\Documents\\page1.html', 'C:\\Users\\prathamesh\\Documents\\page3.html']
Enter search query (or 'exit' to stop): PROJECT
Searching for: 'PROJECT'
Search results: ['C:\\Users\\prathamesh\\Documents\\page1.html', 'C:\\Users\\prathamesh\\Documents\\page2.html', 'C:\\Users\\prathamesh\\Documents\\page3.html']
Enter search query (or 'exit' to stop): AI
Searching for: 'AI'
Search results: ['C:\\Users\\prathamesh\