In [None]:
import os
from collections import defaultdict
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
# Fetch the NLTK resources the rest of the notebook relies on: tokenizer models
# for word_tokenize, the English stop-word list, and the WordNet data used by
# WordNetLemmatizer.
# NLTK 3.8.2+ loads the tokenizer models from 'punkt_tab' instead of the legacy
# 'punkt' pickle, so both are requested; on older releases the extra resource
# is simply unused.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Folder that holds the numbered book text files.
books_dir = 'BOOKS'

# List the folder and order the names by the integer that precedes the first
# '.', so "2. ..." comes before "10. ..." (a plain string sort would not).
files = sorted(os.listdir(books_dir), key=lambda name: int(name.split('.')[0]))

# Sanity check: show every path that will be indexed, in processing order.
for file in tqdm(files, desc="Loading files", unit="file"):
    file_path = os.path.join(books_dir, file)
    print('\n' + file_path)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Loading files: 100%|██████████| 44/44 [00:00<00:00, 36058.89file/s]


BOOKS/1. THE SONNETS.txt

BOOKS/2. ALLS WELL THAT ENDS WELL.txt

BOOKS/3. THE TRAGEDY OF ANTONY AND CLEOPATRA.txt

BOOKS/4. AS YOU LIKE IT.txt

BOOKS/5. THE COMEDY OF ERRORS.txt

BOOKS/6. THE TRAGEDY OF CORIOLANUS.txt

BOOKS/7. CYMBELINE.txt

BOOKS/8. THE TRAGEDY OF HAMLET PRINCE OF DENMARK.txt

BOOKS/9. THE FIRST PART OF KING HENRY THE FOURTH.txt

BOOKS/10. THE SECOND PART OF KING HENRY THE FOURTH.txt

BOOKS/11. THE LIFE OF KING HENRY THE FIFTH.txt

BOOKS/12. THE FIRST PART OF HENRY THE SIXTH.txt

BOOKS/13. THE SECOND PART OF KING HENRY THE SIXTH.txt

BOOKS/14. THE THIRD PART OF KING HENRY THE SIXTH.txt

BOOKS/15. KING HENRY THE EIGHTH.txt

BOOKS/16. THE LIFE AND DEATH OF KING JOHN.txt

BOOKS/17. THE TRAGEDY OF JULIUS CAESAR.txt

BOOKS/18. THE TRAGEDY OF KING LEAR.txt

BOOKS/19. LOVES LABOURS LOST.txt

BOOKS/20. THE TRAGEDY OF MACBETH.txt

BOOKS/21. MEASURE FOR MEASURE.txt

BOOKS/22. THE MERCHANT OF VENICE.txt

BOOKS/23. THE MERRY WIVES OF WINDSOR.txt

BOOKS/24. A MIDSUMMER NIGHTS DR




In [None]:
def load_files(directory):
    """Return the names of the book files in ``directory`` in reading order.

    Names are ordered by the integer that precedes the first ``.`` (so
    ``"2. ..."`` sorts before ``"10. ..."``). Hidden entries (names starting
    with ``.``, e.g. ``.ipynb_checkpoints`` or ``.DS_Store``) and anything
    that is not a regular file are skipped, because they are not books and
    would otherwise crash the numeric sort or the later ``open()`` call.
    Files without a numeric prefix are kept and placed after the numbered
    ones, ordered by name.

    Args:
        directory: Path of the folder to list.

    Returns:
        list[str]: File names (not full paths), sorted as described above.

    Raises:
        OSError: If ``directory`` cannot be listed.
    """
    def _sort_key(name):
        # Numbered files first (by number, then name for a deterministic
        # tie-break); everything else afterwards, alphabetically.
        prefix = name.split('.')[0]
        try:
            return (0, int(prefix), name)
        except ValueError:
            return (1, 0, name)

    files = [
        name for name in os.listdir(directory)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(directory, name))
    ]
    files.sort(key=_sort_key)
    return files

def preprocess_text(text):
    """Turn raw text into a list of normalised index terms.

    The text is tokenised with ``word_tokenize``; tokens that are not purely
    alphanumeric, or whose lowercase form is an English stop word, are
    dropped. Each surviving token is lowercased, lemmatised with
    ``WordNetLemmatizer`` and then stemmed with ``PorterStemmer``.

    Args:
        text: The string to process.

    Returns:
        list[str]: Processed terms, in their original order (duplicates kept).
    """
    tokens = word_tokenize(text)
    ignored = set(stopwords.words('english'))

    # Keep alphanumeric, non-stop-word tokens in lowercase form.
    kept = []
    for token in tokens:
        lowered = token.lower()
        if token.isalnum() and lowered not in ignored:
            kept.append(lowered)

    # Lemmatise first, then stem the lemma, one term at a time.
    lemmatize = WordNetLemmatizer().lemmatize
    stem = PorterStemmer().stem
    return [stem(lemmatize(term)) for term in kept]

def build_word_to_files(files, directory):
    """Build an inverted index mapping each processed term to its files.

    Every file is read in full, passed through ``preprocess_text``, and its
    path is recorded under each distinct term it contains.

    Args:
        files: File names (relative to ``directory``) to index.
        directory: Folder that contains ``files``.

    Returns:
        defaultdict[str, set[str]]: term -> set of file paths
        (``os.path.join(directory, name)``) whose text contains the term.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    word_to_files = defaultdict(set)
    for file in tqdm(files, desc="Loading files", unit="file"):
        file_path = os.path.join(directory, file)
        # Decode explicitly instead of relying on the platform locale.
        # 'utf-8-sig' also strips a leading byte-order mark, which would
        # otherwise stick to the first token and make it non-alphanumeric.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            text = f.read()
        # De-duplicate first: one add per distinct term, not per occurrence.
        for word in set(preprocess_text(text)):
            word_to_files[word].add(file_path)
    return word_to_files

def _levenshtein_distance(a, b):
    """Return the edit distance between ``a`` and ``b`` (unit-cost insert/delete/substitute)."""
    if a == b:
        return 0
    # Iterate over the longer string so the row buffer is as short as possible.
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)

    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        current = [i]
        for j, char_b in enumerate(b, start=1):
            substitution = previous[j - 1] + (char_a != char_b)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


def find_closest_word(word, word_to_files):
    """Return the indexed term with the smallest edit distance to ``word``.

    The distance is computed in pure Python, so the function does not depend
    on the third-party ``Levenshtein`` module (which this code never imports).
    When several terms are equally close, the first one in the index's
    iteration order wins.

    Args:
        word: The term that was not found in the index.
        word_to_files: Mapping whose keys are the indexed terms.

    Returns:
        str: The closest key of ``word_to_files``.

    Raises:
        ValueError: If ``word_to_files`` is empty.
    """
    if not word_to_files:
        raise ValueError("Cannot find a closest word in an empty index.")
    # An exact hit is trivially the closest match; skip the full scan.
    if word in word_to_files:
        return word
    closest_word = min(word_to_files.keys(), key=lambda x: _levenshtein_distance(word, x))
    return closest_word

def process_query(query, word_to_files):
    """Evaluate a simple boolean query against the inverted index.

    The query is split on whitespace and evaluated strictly left to right
    (no operator precedence, no parentheses). ``AND`` / ``OR`` (any case)
    placed between two terms combine the running result with the next term
    by intersection / union. A term that is not preceded by an operator
    starts a fresh result, replacing whatever was accumulated before it.

    Each term is normalised with ``preprocess_text``; only its first
    resulting token is used. Terms that normalise to nothing (stop words,
    punctuation) are reported and skipped, and the pending operator carries
    over to the next usable term. A term missing from the index is replaced
    by its closest indexed term (via ``find_closest_word``) and combined
    with the same operator an exact match would have used.

    The index is never modified: the running result is always a private copy.

    Args:
        query: The user's query string, e.g. ``"hamlet AND prince"``.
        word_to_files: Mapping of processed term -> set of file paths.

    Returns:
        list[str]: Matching file paths, sorted for deterministic output.
    """
    query_parts = query.split()
    last_index = len(query_parts) - 1
    matching_files = set()
    have_result = False       # becomes True once a term has been applied
    pending_operator = None   # 'and' / 'or' waiting for its right-hand term

    for i, part in enumerate(query_parts):
        lowered = part.lower()
        # Only a keyword strictly between two other tokens is an operator.
        if lowered in ('and', 'or') and 0 < i < last_index:
            pending_operator = lowered
            continue

        tokens = preprocess_text(part)
        if not tokens:
            # Stop words and punctuation leave nothing to look up.
            print(f"Ignoring query term '{part}': nothing searchable left after preprocessing.")
            continue
        word = tokens[0]

        if word in word_to_files:
            term_files = word_to_files[word]
        elif word_to_files:
            closest_word = find_closest_word(word, word_to_files)
            print(f"Word '{word}' not found. Using closest match: '{closest_word}'")
            term_files = word_to_files[closest_word]
        else:
            # Empty index: nothing can match and there is no closest word.
            term_files = set()

        if not have_result or pending_operator is None:
            # Copy so later in-place set operations never touch the index.
            matching_files = set(term_files)
        elif pending_operator == 'and':
            matching_files.intersection_update(term_files)
        else:
            matching_files.update(term_files)

        have_result = True
        pending_operator = None

    return sorted(matching_files)

def main(books_dir='BOOKS', query=None):
    """Index the books in ``books_dir`` and print the files matching a query.

    Args:
        books_dir: Folder containing the book text files. Defaults to
            ``'BOOKS'``, the directory this notebook was written against.
        query: Boolean query such as ``"hamlet AND prince"``. When ``None``
            (the default) the user is prompted for it, which keeps the
            original interactive behaviour; passing a string lets the cell
            run unattended (e.g. under Restart & Run All).

    Returns:
        None. The matching file paths are printed, one per line.
    """
    files = load_files(books_dir)
    word_to_files = build_word_to_files(files, books_dir)

    if query is None:
        query = input("Enter your query: ")
    matching_files = process_query(query, word_to_files)

    print("\nMatching files:")
    for file in matching_files:
        print(file)

# Run the search only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading files: 100%|██████████| 44/44 [00:24<00:00,  1.82file/s]


Enter your query: hamlet AND prince

Matching files:
BOOKS/8. THE TRAGEDY OF HAMLET PRINCE OF DENMARK.txt
