In [1]:
import nltk
import os
import numpy as np
import faiss
import pdfplumber
import time
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter import scrolledtext  # UPDATE: Fixed missing import
from sentence_transformers import SentenceTransformer
from langdetect import detect

In [2]:
# Load the sentence-embedding model once; embed_documents() and
# generate_query_embedding() both reuse this single module-level instance.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Global variables
# Folder containing the PDFs to search. Empty until select_folder() stores the
# directory picked in the GUI; process_query() refuses to run while it is empty.
DATASET_PATH = ""
# On-disk caches read and written by process_query(). Both are relative paths,
# so they are resolved against the current working directory.
EMBEDDINGS_FILE = "document_embeddings.npy"
FAISS_FILE = "faiss_index.bin"

In [3]:
def embed_documents(documents):
    """Encode a batch of document texts with the shared embedding model.

    Args:
        documents: The document strings to embed.

    Returns:
        Whatever ``embedding_model.encode`` yields for the batch when asked
        for NumPy output.
    """
    document_vectors = embedding_model.encode(documents, convert_to_numpy=True)
    return document_vectors

def generate_query_embedding(query):
    """Encode a single query string with the shared embedding model.

    Args:
        query: The user's search text.

    Returns:
        Whatever ``embedding_model.encode`` yields for the query when asked
        for NumPy output.
    """
    query_vector = embedding_model.encode(query, convert_to_numpy=True)
    return query_vector

def create_faiss_index(embeddings, embedding_dim):
    """Build an exact L2 FAISS index holding every row of ``embeddings``.

    Args:
        embeddings: 2-D array-like with one vector per row.
        embedding_dim: Vector width the index is created with; must equal
            the number of columns in ``embeddings``.

    Returns:
        A ``faiss.IndexFlatL2`` populated with the given vectors.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or its width differs from
            ``embedding_dim``.
    """
    # FAISS works on C-contiguous float32 data. Coercing here means arrays that
    # arrive as float64 or as a non-contiguous view are accepted instead of
    # failing inside the binding. For float32 contiguous input this is a no-op.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] != embedding_dim:
        raise ValueError(
            f"Expected embeddings of shape (n, {embedding_dim}), got {vectors.shape}."
        )

    index = faiss.IndexFlatL2(embedding_dim)
    index.add(vectors)
    return index

def save_embeddings(embeddings, file_path):
    """Write ``embeddings`` to exactly ``file_path`` in NumPy ``.npy`` format.

    Args:
        embeddings: Array to persist.
        file_path: Destination path; used verbatim, whatever its suffix.
    """
    # np.save() appends ".npy" when it is given a *path* without that suffix,
    # so load_embeddings(file_path) would then look for a file that does not
    # exist. Writing through an open handle keeps the name exactly as given.
    with open(file_path, "wb") as handle:
        np.save(handle, embeddings)

def load_embeddings(file_path):
    """Read back an array previously stored in NumPy format.

    Args:
        file_path: Location of the saved array.

    Returns:
        The array returned by ``np.load`` for that file.
    """
    cached_vectors = np.load(file_path)
    return cached_vectors

def save_faiss_index(index, file_path):
    """Serialise a FAISS index to ``file_path``.

    Args:
        index: The FAISS index to write.
        file_path: Destination as ``str`` or any ``os.PathLike`` (for example
            ``pathlib.Path``).

    Raises:
        TypeError: If ``file_path`` is not a path-like object.
    """
    # The FAISS binding expects a plain filename string; normalising first
    # lets callers hand over pathlib.Path objects as well.
    target = os.fspath(file_path)
    faiss.write_index(index, target)

def load_faiss_index(file_path):
    """Load a FAISS index previously written with ``save_faiss_index``.

    Args:
        file_path: Source as ``str`` or any ``os.PathLike`` (for example
            ``pathlib.Path``).

    Returns:
        The index object returned by ``faiss.read_index``.

    Raises:
        TypeError: If ``file_path`` is not a path-like object.
    """
    # The FAISS binding expects a plain filename string; normalising first
    # lets callers hand over pathlib.Path objects as well.
    source = os.fspath(file_path)
    return faiss.read_index(source)

def read_pdfs_from_folder(folder_path):
    """Extract the text of every PDF located directly inside ``folder_path``.

    Files are matched on a case-insensitive ``.pdf`` suffix and processed in
    sorted name order, so repeated runs over an unchanged folder always yield
    the same document ordering (position-based caches rely on that).

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        ``(pdf_contents, pdf_filenames)``: two parallel lists holding the
        extracted text and the file name of each PDF that produced non-blank
        text. PDFs that fail to open or contain no extractable text are
        skipped, so both lists may be empty.

    Raises:
        FileNotFoundError: If the folder does not exist or contains no PDFs.
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Dataset folder '{folder_path}' does not exist.")

    # os.listdir() order is arbitrary; sort for a reproducible ordering and
    # compare the suffix case-insensitively so "REPORT.PDF" is not ignored.
    pdf_files = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith('.pdf')
    )
    if not pdf_files:
        raise FileNotFoundError(f"No PDF files found in '{folder_path}'.")

    pdf_contents = []
    pdf_filenames = []
    for pdf_file in pdf_files:
        file_path = os.path.join(folder_path, pdf_file)
        try:
            with pdfplumber.open(file_path) as pdf:
                # Separate pages with a newline; joining with "" glues the last
                # word of one page onto the first word of the next.
                text = "\n".join(page.extract_text() or "" for page in pdf.pages)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"Error reading PDF file {pdf_file}: {e}")
            continue

        if text.strip():
            pdf_contents.append(text)
            pdf_filenames.append(pdf_file)

    return pdf_contents, pdf_filenames

In [4]:
def summarize_with_nltk_focused(document, query, max_lines=20):
    """Build an extractive summary of ``document`` focused on ``query``.

    The document is split into sentences, sentences that are too short or not
    detected as English are dropped, and each remaining sentence is scored by
    how many of its tokens are query terms. The highest-scoring sentences are
    joined into the summary.

    Args:
        document: Full text of the document.
        query: The user's query; stop words and surrounding punctuation are
            ignored when deriving query terms.
        max_lines: Upper bound on the number of sentences in the summary.

    Returns:
        The summary text, or one of two placeholder messages: one when no
        English sentences are found, one when no sentence mentions a query term.

    Note:
        Requires the NLTK tokenizer and stop-word data to be installed.
    """
    from heapq import nlargest
    from string import punctuation

    from langdetect.lang_detect_exception import LangDetectException
    from nltk.corpus import stopwords
    from nltk.tokenize import sent_tokenize, word_tokenize

    def _is_english(sentence):
        # detect() raises on text with no usable features (digits, symbols,
        # table fragments); such a sentence is simply not English content.
        try:
            return detect(sentence) == 'en'
        except LangDetectException:
            return False

    english_sentences = [
        s for s in sent_tokenize(document) if len(s.strip()) > 10 and _is_english(s)
    ]
    if not english_sentences:
        return "No relevant English content found related to the query."

    stop_words = set(stopwords.words('english')) | set(punctuation)

    # Strip punctuation stuck to the query words ("faiss?" -> "faiss") so they
    # can match the tokenised sentence words. A set gives O(1) membership tests.
    query_terms = set()
    for raw_term in query.split():
        term = raw_term.strip(punctuation).lower()
        if term and term not in stop_words:
            query_terms.add(term)

    sentence_scores = {}
    for sentence in english_sentences:
        score = sum(1 for word in word_tokenize(sentence.lower()) if word in query_terms)
        if score > 0:
            sentence_scores[sentence] = score

    # nlargest() already copes with max_lines exceeding the candidate count.
    summary_sentences = nlargest(max_lines, sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences) if summary_sentences else f"No content specifically about '{query}' found."

In [5]:
# UPDATE: Adjusted the output display to use the full window (the layout itself is set in the GUI setup cell).
def process_documents(documents, query, num_docs=2):
    """Summarise the leading documents with respect to ``query``.

    Args:
        documents: Document texts, most relevant first.
        query: The user's query, forwarded to the summariser.
        num_docs: Maximum number of leading documents to summarise.

    Returns:
        The per-document summaries separated by blank lines, or a single
        "no relevant information" message when no document yields a real
        summary.
    """
    # summarize_with_nltk_focused() has two placeholder results: one for "no
    # English text" and one for "query not mentioned". Both must be filtered,
    # otherwise the first is displayed as if it were a document response.
    placeholder_prefixes = ("No content", "No relevant English content")

    final_output = []
    for doc_idx, document in enumerate(documents[:num_docs]):
        summary = summarize_with_nltk_focused(document, query)
        if summary and not summary.startswith(placeholder_prefixes):
            final_output.append(f"Document {doc_idx + 1} Response:\n{summary}")

    if not final_output:
        return f"No relevant information found about '{query}' in the documents."

    return "\n\n".join(final_output)


In [6]:
def _dataset_signature(folder_path, filenames):
    """Fingerprint the PDFs behind the caches: folder, file names, sizes and mtimes."""
    import hashlib  # local import: only this helper needs it

    digest = hashlib.sha256(os.fsencode(os.path.abspath(folder_path)))
    for name in filenames:
        info = os.stat(os.path.join(folder_path, name))
        digest.update(b"\0" + os.fsencode(name))
        digest.update(f"|{info.st_size}|{info.st_mtime_ns}".encode("ascii"))
    return digest.hexdigest()


def _load_or_build_index(folder_path, documents, filenames):
    """Return a FAISS index with one vector per document, reusing caches only when still valid."""
    signature_file = EMBEDDINGS_FILE + ".sig"
    signature = _dataset_signature(folder_path, filenames)
    try:
        with open(signature_file, "r", encoding="utf-8") as handle:
            caches_match_dataset = handle.read().strip() == signature
    except OSError:
        # No signature recorded yet, so any existing cache is of unknown origin.
        caches_match_dataset = False

    document_embeddings = None
    if caches_match_dataset and os.path.exists(EMBEDDINGS_FILE):
        cached_embeddings = load_embeddings(EMBEDDINGS_FILE)
        if cached_embeddings.ndim == 2 and cached_embeddings.shape[0] == len(documents):
            document_embeddings = cached_embeddings
            if os.path.exists(FAISS_FILE):
                cached_index = load_faiss_index(FAISS_FILE)
                if cached_index.ntotal == len(documents):
                    return cached_index

    if document_embeddings is None:
        document_embeddings = embed_documents(documents)
        save_embeddings(document_embeddings, EMBEDDINGS_FILE)

    faiss_index = create_faiss_index(
        document_embeddings, embedding_dim=document_embeddings.shape[1]
    )
    save_faiss_index(faiss_index, FAISS_FILE)

    # Written last: if anything above fails, the old signature no longer
    # matches and the next run rebuilds instead of trusting a half-written cache.
    with open(signature_file, "w", encoding="utf-8") as handle:
        handle.write(signature)
    return faiss_index


def process_query():
    """Run the query typed in the GUI against the selected folder and show the result.

    Reads the query from ``query_entry`` and the folder from ``DATASET_PATH``,
    retrieves the closest documents through FAISS, summarises them, and writes
    the outcome to ``response_text`` and the elapsed time to ``timer_label``.
    Any failure is shown in the response area as ``Error: ...``.

    Cached embeddings and the cached index are reused only when a signature
    file (``EMBEDDINGS_FILE + ".sig"``) shows they were built from the same
    folder contents; otherwise they are rebuilt.
    """
    # Same literal the GUI pre-fills into the entry; submitting it untouched
    # is treated as "no query entered".
    placeholder_text = "Enter your query here"

    query = query_entry.get().strip()
    if not query or query == placeholder_text:
        messagebox.showerror("Error", "Please enter a query")
        return
    if not DATASET_PATH:
        messagebox.showerror("Error", "Please select a dataset folder")
        return

    start_time = time.time()
    response_text.delete("1.0", tk.END)
    response_text.insert(tk.END, "Processing...\n")
    root.update()  # repaint now; the work below blocks the Tk event loop

    try:
        documents, filenames = read_pdfs_from_folder(DATASET_PATH)
        if not documents:
            raise ValueError("No extractable text found in the PDF files of the selected folder.")

        faiss_index = _load_or_build_index(DATASET_PATH, documents, filenames)

        query_embedding = generate_query_embedding(query).reshape(1, -1)
        # Never ask for more neighbours than there are documents.
        top_k = min(2, len(documents))
        _distances, indices = faiss_index.search(query_embedding, k=top_k)
        # FAISS pads missing results with -1, which would otherwise silently
        # select documents[-1]; keep only real, in-range positions.
        top_documents = [documents[idx] for idx in indices[0] if 0 <= idx < len(documents)]

        final_output = process_documents(top_documents, query) if top_documents else "No relevant documents found."
    except Exception as e:
        final_output = f"Error: {str(e)}"

    elapsed_time = time.time() - start_time
    response_text.delete("1.0", tk.END)
    response_text.insert(tk.END, final_output)
    timer_label.config(text=f"Time taken: {elapsed_time:.2f} sec")

In [7]:
def select_folder():
    """Ask the user for the dataset folder and remember the choice.

    Stores the chosen directory in the module-level ``DATASET_PATH`` and shows
    it in ``folder_label``. If the dialog is cancelled, the previous selection
    and label are left untouched.
    """
    global DATASET_PATH
    chosen_folder = filedialog.askdirectory()
    # A cancelled dialog yields an empty value; overwriting DATASET_PATH with
    # it would silently discard the folder the user had already picked.
    if not chosen_folder:
        return
    DATASET_PATH = chosen_folder
    folder_label.config(text=f"Selected Folder: {DATASET_PATH}")

In [8]:
# GUI setup: build the single application window. Widgets are packed top to
# bottom in the order they are created, so statement order defines the layout.
root = tk.Tk()
root.title("NLP Query Interface")
root.geometry("800x600")

# Folder picker: select_folder() stores the chosen directory in DATASET_PATH.
folder_button = tk.Button(root, text="Select Dataset Folder", command=select_folder)
folder_button.pack(pady=5)

# Shows the current selection; select_folder() updates its text.
folder_label = tk.Label(root, text="No folder selected", wraplength=700)
folder_label.pack()

# Query input. The hint below is inserted as ordinary text, so it is what
# query_entry.get() returns until the user replaces it.
query_entry = tk.Entry(root, width=70)
query_entry.pack(pady=5)
query_entry.insert(0, "Enter your query here")

# Runs the retrieval and summarisation pipeline in process_query().
submit_button = tk.Button(root, text="Submit Query", command=process_query)
submit_button.pack(pady=5)

# Result area; expand/fill lets it take up the remaining window space.
response_text = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=100, height=20)
response_text.pack(expand=True, fill=tk.BOTH, padx=10, pady=10)

# process_query() writes the elapsed time here after each run.
timer_label = tk.Label(root, text="")
timer_label.pack()

# Hand control to Tk's event loop; this call does not return until the window is closed.
root.mainloop()