In [1]:
# !pip install PyPDF2
# !pip install ttkthemes

### Importing Necessary Libraries

In [1]:
import os
import unicodedata
import re
from collections import defaultdict, Counter
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from PyPDF2 import PdfReader
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext
from tkinter import ttk
from ttkthemes import ThemedStyle
import numpy as np
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
#from mylibrary.stopword1 import StopwordManager

### Extract Text from PDF

In [2]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

### Preprocess text data

In [3]:
def preprocess_text(text, stop_words):
    normalized_text = unicodedata.normalize('NFKD', text)
    clean_text = "".join([c for c in normalized_text if not unicodedata.combining(c)])
    cleaned_text = re.sub('[.,!?:;\-="...@#_]', ' ', clean_text)
    words = word_tokenize(cleaned_text.lower())
    return [word for word in words if word not in stop_words and len(word) >= 2]

### Perform Boolean Search

In [4]:
def boolean_search(query, tf_idf_index, file_names, num_results):
    query_tokens = query.split()
    logical_operator = None
    queries = []
    for token in query_tokens:
        if token.lower() in ['and', 'or', 'not']:
            logical_operator = token.lower()
        else:
            queries.append(token)

    if logical_operator == 'and':
        search_results = perform_and_search(queries, tf_idf_index, file_names, num_results)
    elif logical_operator == 'or':
        search_results = perform_or_search(queries, tf_idf_index, file_names, num_results)
    elif logical_operator == 'not':
        search_results = perform_not_search(queries, tf_idf_index, file_names, num_results)
    else:
        search_results = perform_and_search(queries, tf_idf_index, file_names, num_results)

    return search_results

### Perform AND Search

In [5]:
def perform_and_search(queries, tf_idf_index, file_names, num_results):
    results = []
    for doc_id, fname in enumerate(file_names):
        contains_all_terms = all(tf_idf_index.get((doc_id, token), 0) > 0 for token in queries)
        if contains_all_terms:
            tf_idf_sum = sum(tf_idf_index.get((doc_id, token), 0) for token in queries)
            results.append((fname, tf_idf_sum))
    
    results.sort(key=lambda x: x[1], reverse=True)
    return results[:num_results]

### Perform OR Search

In [6]:
def perform_or_search(queries, tf_idf_index, file_names, num_results):
    results = []
    for doc_id, fname in enumerate(file_names):
        contains_any_term = any(tf_idf_index.get((doc_id, token), 0) > 0 for token in queries)
        if contains_any_term:
            tf_idf_sum = sum(tf_idf_index.get((doc_id, token), 0) for token in queries)
            results.append((fname, tf_idf_sum))
    
    results.sort(key=lambda x: x[1], reverse=True)
    return results[:num_results]

### Perform NOT Search

In [7]:
def perform_not_search(queries, tf_idf_index, file_names, num_results):
    results = []
    for doc_id, fname in enumerate(file_names):
        contains_first_term = tf_idf_index.get((doc_id, queries[0]), 0) > 0
        contains_second_term = tf_idf_index.get((doc_id, queries[1]), 0) > 0
        if contains_first_term and not contains_second_term:
            tf_idf_sum = sum(tf_idf_index.get((doc_id, token), 0) for token in queries)
            results.append((fname, tf_idf_sum))
    
    results.sort(key=lambda x: x[1], reverse=True)
    return results[:num_results]


### Train SVC model using the provided dataset

In [8]:
def train_svc_model(dataset_path):
    # Load the dataset
    dataset = pd.read_csv(dataset_path)

    # Drop rows containing NaN values
    dataset.dropna(inplace=True)

    # Split the dataset into features (word1) and target labels (word2)
    X = dataset['word1']
    y = dataset['word2']

    # Vectorize the text data
    vectorizer = TfidfVectorizer()
    X_vectorized = vectorizer.fit_transform(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.3, random_state=42)

    # Train the SVC model
    svc_model = SVC()
    svc_model.fit(X_train, y_train)

    return svc_model, vectorizer, dataset


### Correct spelling errors using the trained SVC model

In [9]:
def correct_spelling_errors(query, svc_model, vectorizer, data):
    # Vectorize the input query
    query_vectorized = vectorizer.transform([query])

    # Predict the corrected word using the trained SVC model
    corrected_word = svc_model.predict(query_vectorized)[0]

    # Find the word with the highest score from the corrected predictions
    possible_corrections = data[data['word1'] == query]
    if not possible_corrections.empty:
        best_correction = possible_corrections.loc[possible_corrections['score'].idxmax()]
        return best_correction['word2']
    return corrected_word


### Perform the search based on user input

In [16]:
def search():
    query = query_entry.get().lower()
    txt_search = txt_var.get()
    pdf_search = pdf_var.get()
    num_results = int(num_results_entry.get())  # Get the number of results from the entry field

    if txt_search and pdf_search:
        messagebox.showerror("Error", "Please select either Text or PDF search, not both.")
        return

    if not query:
        messagebox.showerror("Error", "Please enter a search query.")
        return

    if not files:
        messagebox.showerror("Error", "Please load files before performing the search.")
        return

    inverted_doc_indexes = defaultdict(lambda: defaultdict(list))
    for doc_id, fname in enumerate(tqdm(files)):
        if fname.endswith('.txt'):
            with open(os.path.join(directory_entry.get(), fname), "r") as file:
                text = file.read()
        elif fname.endswith('.pdf'):
            text = extract_text_from_pdf(os.path.join(directory_entry.get(), fname))
        words = preprocess_text(text, stop_words)
        for index, word in enumerate(words):
            inverted_doc_indexes[word][doc_id].append(index)

    DF = {}
    for word in inverted_doc_indexes.keys():
        DF[word] = len([doc for doc in inverted_doc_indexes[word]])

    tf_idf = {}
    N = len(files)

    for doc_id, fname in tqdm(enumerate(files)):
        if fname.endswith('.txt'):
            with open(os.path.join(directory_entry.get(), fname), "r") as file:
                text = file.read()
        elif fname.endswith('.pdf'):
            text = extract_text_from_pdf(os.path.join(directory_entry.get(), fname))
        tokens = preprocess_text(text, stop_words)
        counter = Counter(tokens)
        words_count = len(tokens)
        
        for token in np.unique(tokens):
            tf = counter[token]
            tf = 1 + np.log(tf)
            
            if token in DF:
                df = DF[token]
            else:
                df = 0
            idf = np.log((N + 1) / (df + 1))
            
            tf_idf[doc_id, token] = tf * idf

    search_results = boolean_search(query, tf_idf, files, num_results)

    result_text.delete('1.0', tk.END)
    
    if search_results:
        for result in search_results:
            result_text.insert(tk.END, f"{result[0]}     TF-IDF: {result[1]:.2f}\n")
    else:
        result_text.insert(tk.END, "No matching documents found.")
        # If no matching documents found, suggest words using the trained SVC model
        suggested_word = correct_spelling_errors(query, svc_model, vectorizer, data)
        suggested_words_text.delete('1.0', tk.END)
        suggested_words_text.insert(tk.END, "Suggested correction:\n")
        suggested_words_text.insert(tk.END, f"{suggested_word}")


### Browse Directory

In [17]:
def browse_directory():
    directory_path = filedialog.askdirectory()
    if directory_path:
        directory_entry.delete(0, tk.END)
        directory_entry.insert(0, directory_path)


### Delete selected documents

In [18]:
def delete_documents():
    selected_items = file_list.curselection()
    if not selected_items:
        messagebox.showinfo("Information", "Please select one or more documents to delete.")
        return

    for index in selected_items:
        del files[index]
        file_list.delete(index)
    messagebox.showinfo("Information", "Selected documents have been deleted.")


### Load files from a directory

In [19]:
def load_files():
    directory = directory_entry.get()
    txt_search = txt_var.get()
    pdf_search = pdf_var.get()

    if txt_search and pdf_search:
        messagebox.showerror("Error", "Please select either Text or PDF search, not both.")
        return

    if not os.path.isdir(directory):
        messagebox.showerror("Error", "Invalid directory path.")
        return

    files.clear()
    file_list.delete(0, tk.END)

    if txt_search:
        files.extend([os.path.basename(filename) for filename in os.listdir(directory) if filename.endswith('.txt')])
    elif pdf_search:
        files.extend([os.path.basename(filename) for filename in os.listdir(directory) if filename.endswith('.pdf')])
    else:
        txt_files = [os.path.basename(filename) for filename in os.listdir(directory) if filename.endswith('.txt')]
        pdf_files = [os.path.basename(filename) for filename in os.listdir(directory) if filename.endswith('.pdf')]
        files.extend(txt_files + pdf_files)

    for file_path in files:
        file_list.insert(tk.END, os.path.basename(file_path))


In [15]:
'''
def select_language():
    language_var = tk.StringVar()
    selected_language = language_var.get()
    if selected_language == "English":
        file_path = stopword.english()
    elif selected_language == "Arabic":
        file_path = stopword.arabic()
    elif selected_language == "Bulgarian":
        file_path = stopword.bulgarian()
    elif selected_language == "Catalan":
        file_path = stopword.catalan()
    elif selected_language == "Czech":
        file_path = stopword.czech()
    elif selected_language == "Danish":
        file_path = stopword.danish()
    elif selected_language == "Dutch":
        file_path = stopword.dutch()
    elif selected_language == "Finnish":
        file_path = stopword.finnish()
    elif selected_language == "French":
        file_path = stopword.french()
    elif selected_language == "German":
        file_path = stopword.german()
    elif selected_language == "Gujarati":
        file_path =stopword.gujarati()
    elif selected_language == "Hebrew":
        file_path =stopword.hebrew()    
    elif selected_language == "Hindi":
        file_path = stopword.hindi()
    elif selected_language == "Hungarian":
        file_path = stopword.hungarian()
    elif selected_language == "Indonesian":
        file_path = stopword.indonesian()
    elif selected_language == "Italian":
        file_path = stopword.italian()
    elif selected_language == "Malaysian":
        file_path = stopword.malaysian()
    elif selected_language == "Norwegian":
        file_path = stopword.norwegian()
    elif selected_language == "Polish":
        file_path = stopword.polish()
    elif selected_language == "Portuguese":
        file_path = stopword.portuguese()
    elif selected_language == "Romanian":
        file_path = stopword.romanian()
    elif selected_language == "Russian":
        file_path = stopword.russian()
    elif selected_language == "Slovak":
        file_path = stopword.slovak()
    elif selected_language == "Spanish":
        file_path = stopword.spanish()
    elif selected_language == "Swedish":
        file_path = stopword.english()
    elif selected_language == "Turkish":
        file_path = stopword.turkish()
    elif selected_language == "Ukrainian":
        file_path = stopword.ukrainian()
    elif selected_language == "Vietnamese":
        file_path = stopword.vietnamese()
    else:
        file_path = stopword.english()  # Default if no language is selected
    return file_path
'''

'\ndef select_language():\n    language_var = tk.StringVar()\n    selected_language = language_var.get()\n    if selected_language == "English":\n        file_path = stopword.english()\n    elif selected_language == "Arabic":\n        file_path = stopword.arabic()\n    elif selected_language == "Bulgarian":\n        file_path = stopword.bulgarian()\n    elif selected_language == "Catalan":\n        file_path = stopword.catalan()\n    elif selected_language == "Czech":\n        file_path = stopword.czech()\n    elif selected_language == "Danish":\n        file_path = stopword.danish()\n    elif selected_language == "Dutch":\n        file_path = stopword.dutch()\n    elif selected_language == "Finnish":\n        file_path = stopword.finnish()\n    elif selected_language == "French":\n        file_path = stopword.french()\n    elif selected_language == "German":\n        file_path = stopword.german()\n    elif selected_language == "Gujarati":\n        file_path =stopword.gujarati()\n    e

### Set up the GUI

In [20]:
stop_words = set(stopwords.words('english'))

files = []

root = tk.Tk()
root.title("Boolean Search")
root.geometry("1200x600")

style = ThemedStyle(root)
style.set_theme("breeze")

# Load the SVC model and vectorizer
svc_model, vectorizer, data = train_svc_model("D:\Projects Git Hub\Boolean AutoCorrect\data.csv")

frame = tk.Frame(root)
frame.pack(pady=10)

language_var = tk.StringVar()
language_choices = [
    "English", "Spanish", "Arabic", "Bulgarian", "Catalan", "Czech", "Danish",
    "Dutch", "Finnish", "French", "German","hebrew", "Gujarati", "Hindi", "Hungarian",
    "Indonesian", "Italian", "Malaysian", "Norwegian", "Polish", "Portuguese",
    "Romanian", "Russian", "Slovak", "Swedish", "Turkish", "Ukrainian", "Vietnamese"
]

language_label = ttk.Label(frame, text="Select Language:", foreground="blue")
language_label.grid(row=0, column=3, padx=10, pady=5, sticky="w")

language_dropdown = ttk.Combobox(frame, width=15, textvariable=language_var, values=language_choices)
language_dropdown.grid(row=0, column=4, padx=10, pady=5)
language_dropdown.current(0)  # Set default language

directory_label = ttk.Label(frame, text="Select directory:", foreground="blue")
directory_label.grid(row=0, column=0, padx=10, pady=5, sticky="w")

directory_entry = ttk.Entry(frame, width=50)
directory_entry.grid(row=0, column=1, padx=10, pady=5)

browse_button = ttk.Button(frame, text="Browse", command=browse_directory)
browse_button.grid(row=0, column=2, padx=10, pady=5)

txt_var = tk.BooleanVar()
pdf_var = tk.BooleanVar()

txt_check = ttk.Checkbutton(frame, text="Search in Text files", variable=txt_var)
pdf_check = ttk.Checkbutton(frame, text="Search in PDF files", variable=pdf_var)
txt_check.grid(row=1, column=0, padx=10, pady=5, sticky="w")
pdf_check.grid(row=1, column=1, padx=10, pady=5, sticky="w")

load_button = ttk.Button(frame, text="Load Files", command=load_files)
load_button.grid(row=1, column=2, padx=10, pady=5)

file_list_label = ttk.Label(frame, text="Files in Directory:", foreground="blue")
file_list_label.grid(row=2, column=0, padx=10, pady=5, sticky="w")

file_list = tk.Listbox(frame, selectmode=tk.MULTIPLE, height=10, width=50)
file_list.grid(row=3, column=0, columnspan=3, padx=10, pady=5)

delete_button = ttk.Button(frame, text="Delete Selected Documents", command=delete_documents)
delete_button.grid(row=4, column=0, columnspan=3, padx=10, pady=5)

query_label = ttk.Label(frame, text="Enter search query:", foreground="green")
query_label.grid(row=5, column=0, padx=10, pady=5, sticky="w")

query_entry = ttk.Entry(frame, width=50)
query_entry.grid(row=5, column=1, padx=10, pady=5)

num_results_label = ttk.Label(frame, text="Number of results to print:", foreground="green")
num_results_label.grid(row=5, column=2, padx=10, pady=5, sticky="w")

num_results_entry = ttk.Entry(frame, width=10)
num_results_entry.insert(tk.END, "10")  # Default value
num_results_entry.grid(row=5, column=3, padx=10, pady=5)

search_button = ttk.Button(frame, text="Search", command=search)
search_button.grid(row=5, column=4, padx=10, pady=5)

result_label = ttk.Label(frame, text="Search Results:", foreground="red")
result_label.grid(row=6, column=0, padx=10, pady=5, sticky="w")

result_text = scrolledtext.ScrolledText(frame, height=10, width=60)
result_text.grid(row=7, column=0, columnspan=3, padx=10, pady=5)

# Add a new field for displaying suggested words
suggested_words_label = ttk.Label(frame, text="Suggested words:", foreground="green")
suggested_words_label.grid(row=8, column=0, padx=10, pady=5, sticky="w")

suggested_words_text = scrolledtext.ScrolledText(frame, height=3, width=60)
suggested_words_text.grid(row=9, column=0, columnspan=3, padx=10, pady=5)

root.mainloop()

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 335.14it/s]
5it [00:00, 277.52it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 330.83it/s]
5it [00:00, 319.94it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 406.64it/s]
5it [00:00, 323.11it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 320.66it/s]
5it [00:00, 320.04it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 301.44it/s]
5it [00:00, 165.21it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 313.86it/s]
5it [00:00, 878.50it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 395.67it/s]
5it [00:00, 255.