In [1]:
# ==============================
# Enterprise Document Q&A AI Agent
# ==============================

# ==============================
# 1. Import Libraries
# ==============================
import os
import fitz  # PyMuPDF for PDF reading
from groq import Groq, Client
import requests
import xml.etree.ElementTree as ET

# ==============================
# 2. Initialize Groq LLM Client
# ==============================
# Shared Groq client used by ask_llm(). The key is read from the GROQ_API_KEY
# environment variable; the lookup yields None when the variable is not set.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# ==============================
# 3. Extract Text from PDFs
# ==============================
def extract_texts_from_folder(folder_path="enterprise_ai_agent/data"):
    """Read the text of every ``.pdf`` file found directly in *folder_path*.

    Only names ending in lowercase ``.pdf`` are opened; the scan does not
    descend into sub-directories.

    Args:
        folder_path: Directory to scan for PDF files.

    Returns:
        A dict mapping each PDF's filename to the text of all of its pages
        joined together in page order.
    """
    texts_by_name = {}
    pdf_names = [name for name in os.listdir(folder_path) if name.endswith(".pdf")]
    for pdf_name in pdf_names:
        with fitz.open(os.path.join(folder_path, pdf_name)) as document:
            texts_by_name[pdf_name] = "".join(page.get_text() for page in document)
    return texts_by_name

# ==============================
# 4. Ask LLM a Question
# ==============================
def ask_llm(question, context):
    """Ask the Groq chat model to answer *question* from *context*.

    Args:
        question: The question to put to the model.
        context: Text the model is instructed to base its answer on.

    Returns:
        The message content of the first returned choice. If anything raises
        while calling the API or reading the response, a string of the form
        ``"LLM Error: <exception>"`` is returned instead of raising.
    """
    prompt = (
        "Answer the question based on the following context:\n\n"
        f"{context}\n\n"
        f"Q: {question}\n"
        "A:"
    )
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=chat_messages,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        return f"LLM Error: {err}"

# ==============================
# 5. Fetch Papers from ArXiv (Bonus)
# ==============================
def query_arxiv(search_term, max_results=1, timeout=30):
    """Search the arXiv export API and return metadata for matching papers.

    Args:
        search_term: Free-text query; it is sent as ``all:<search_term>``.
        max_results: Maximum number of entries to request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        On success, a list of dicts with the keys ``"title"``, ``"summary"``,
        ``"authors"`` (list of author names) and ``"pdf_link"`` (``None`` when
        an entry carries no link titled ``pdf``).
        On any failure, a string of the form ``"ArXiv API Error: <exception>"``
        is returned instead of raising, so callers must check the result type
        before iterating.
    """
    ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
    try:
        # Let requests build the query string so characters such as spaces,
        # '&', '#' or '+' in the search term are encoded rather than
        # corrupting the URL. The timeout keeps a stalled connection from
        # blocking forever.
        r = requests.get(
            "http://export.arxiv.org/api/query",
            params={
                "search_query": f"all:{search_term}",
                "start": 0,
                "max_results": max_results,
            },
            timeout=timeout,
        )
        r.raise_for_status()
        root = ET.fromstring(r.content)
        papers = []
        for entry in root.findall("atom:entry", ns):
            # Tolerate entries with missing fields: one incomplete entry
            # should not turn the whole result set into an error string.
            pdf_element = entry.find("atom:link[@title='pdf']", ns)
            papers.append({
                "title": entry.findtext("atom:title", "", ns).strip(),
                "summary": entry.findtext("atom:summary", "", ns).strip(),
                "authors": [
                    author.findtext("atom:name", "", ns)
                    for author in entry.findall("atom:author", ns)
                ],
                "pdf_link": pdf_element.get("href") if pdf_element is not None else None,
            })
        return papers
    except Exception as e:
        # Broad on purpose: the established contract is "never raise, report
        # the problem as a string".
        return f"ArXiv API Error: {e}"

# ==============================
# 6. Load PDFs
# ==============================
# Read every PDF in the data folder once; later steps look documents up by
# filename in this dict.
pdf_texts = extract_texts_from_folder(folder_path="enterprise_ai_agent/data")
loaded_names = list(pdf_texts)
print(f"Loaded {len(pdf_texts)} PDFs: {loaded_names}")

# ==============================
# 7. Assign One Question Per PDF
# ==============================
# (filename, question) pairs: exactly one question is asked per PDF, and the
# filename must match a key produced when the PDFs were loaded.
_QUESTION_ASSIGNMENTS = (
    ("sample1.pdf", "What is the conclusion of this paper?"),
    ("sample2.pdf", "Summarize the methodology of this paper."),
    ("sample3.pdf", "What are the accuracy and F1-score reported in this paper?"),
    ("sample4.pdf", "What is the main hypothesis or research objective of this paper?"),
    ("sample5.pdf", "List the limitations or challenges mentioned in this paper."),
)
questions_per_pdf = dict(_QUESTION_ASSIGNMENTS)

# ==============================
# 8. Ask Questions and Save Results
# ==============================
# Results are echoed to stdout and also written to this file; opening it in
# "w" mode starts the file afresh on every run.
output_file = "qa_results.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for doc_name, question in questions_per_pdf.items():
        if doc_name not in pdf_texts:
            # The configured PDF was not among the loaded documents.
            report = f"\n📄 {doc_name} → File not found!\n{'-'*60}\n"
        else:
            # Only the first 5000 characters are sent, to limit the context
            # passed to the LLM.
            excerpt = pdf_texts[doc_name][:5000]
            answer = ask_llm(question, excerpt)
            report = f"\n📄 {doc_name} → {question}\n{answer}\n{'-'*60}\n"
        print(report)
        f.write(report)

# ==============================
# 9. Optional: ArXiv Query Example
# ==============================
search_results = query_arxiv("quantum computing", max_results=1)
# Append to the Q&A results file; open it once rather than once per paper.
with open(output_file, "a", encoding="utf-8") as f:
    if isinstance(search_results, str):
        # query_arxiv reports failures as an "ArXiv API Error: ..." string.
        # Iterating over that string would yield single characters and
        # crash on paper['title'], so record the error message instead.
        error_text = f"\n{search_results}\n{'-'*60}\n"
        print(error_text)
        f.write(error_text)
    else:
        for paper in search_results:
            arxiv_text = f"\nTitle: {paper['title']}\nAuthors: {', '.join(paper['authors'])}\nSummary: {paper['summary']}\nPDF: {paper['pdf_link']}\n{'-'*60}\n"
            print(arxiv_text)
            f.write(arxiv_text)


Loaded 5 PDFs: ['sample1.pdf', 'sample2.pdf', 'sample3.pdf', 'sample4.pdf', 'sample5.pdf']

📄 sample1.pdf → What is the conclusion of this paper?
There is no specific paper to draw a conclusion from, as the text provides a list of research papers, publications, and seminar proceedings by various authors, including Chattopadhyay S and Singh S. The text does not provide the content or findings of a specific paper, but rather a bibliography of the authors' works. 

If you could provide the title of a specific paper or more context, I would be happy to try and help you understand the conclusion of that particular paper.
------------------------------------------------------------


📄 sample2.pdf → Summarize the methodology of this paper.
The methodology of this paper involves a two-step approach: 

1. **Bibliometric analysis**: The authors conducted a bibliometric analysis of 2,223 research articles related to Artificial Intelligence in Education (AIED). This analysis likely involved quant