In [5]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pandl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [7]:
import pdfplumber
import chromadb
import ollama
import re
import os
import time
import hashlib
import nltk
from nltk.util import ngrams
from collections import Counter

# Download necessary NLTK resources
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading required NLTK resources...")
    nltk.download('punkt', quiet=True)

# Function to generate a unique collection name based on the PDF path
def get_collection_name(pdf_path):
    """Generate a unique collection name based on the PDF filename"""
    pdf_filename = os.path.basename(pdf_path)
    # Create a hash of the filename to ensure unique collection names
    filename_hash = hashlib.md5(pdf_filename.encode()).hexdigest()[:10]
    return f"doc_collection_{filename_hash}"

# Initialize ChromaDB client
DB_PATH = "./chromadb_improved_store"
os.makedirs(DB_PATH, exist_ok=True)
chroma_client = chromadb.PersistentClient(path=DB_PATH)

# Improved system prompt with narrative flow instructions
SYSTEM_PROMPT = """
You are an expert academic content analyzer tasked with extracting highly relevant information from scientific documents.
For the given query about a specific section, focus ONLY on extracting the most relevant information from the provided document chunks.

IMPORTANT GUIDELINES:
1. Create a natural narrative flow - avoid question-answer format
2. Do NOT use subsection headers with asterisks or other special formatting
3. Provide factual content from the document only, no interpretations
4. Maintain academic tone and technical accuracy
5. Format response as cohesive paragraphs with smooth transitions
6. Ignore publication metadata, acknowledgments, and references unless explicitly requested
7. If the provided text doesn't contain relevant information for the query, state this clearly

Remember that your output will be used to create a presentation slide, so prioritize clarity, narrative flow, and relevance.
"""

# More detailed and specific queries for better extraction
PRESENTATION_QUERIES = {
    "Title & Introduction": "Extract the title of this scientific paper, main research question/objective, and background context provided in the introduction section.",
    
    "Key Topics & Sections": "What are the main topics covered in this document? Identify and describe each major section. Focus on the core subject areas and main themes.",
    
    "Definitions & Key Terms": "What specialized terminology, concepts, or models are defined in this document? Extract the important technical terms and their definitions.",
    
    "Methods & Approaches": "What specific methodologies, techniques, algorithms or research approaches are described? Include details about experimental design, data collection, and analytical methods.",
    
    "Findings & Results": "What are the primary results or findings presented in this document? Include specific outcomes, measurements, statistical results, and key discoveries.",
    
    "Important Statistics & Data": "What quantitative data, metrics, percentages, or statistical analyses are presented? Extract specific numbers, measurements, and their context.",
    
    "Applications & Use Cases": "How are the findings or methods applied in real-world contexts? What practical applications or implementations are discussed?",
    
    "Challenges & Limitations": "What limitations, constraints, or challenges are acknowledged in the research? What are the identified weaknesses or areas for improvement?",
    
    "Future Scope & Recommendations": "What suggestions for future research are mentioned? What recommendations or next steps are proposed?",
    
    "Conclusion & Summary": "What are the main conclusions or key takeaways? How does the document summarize its contributions and significance?"
}

# Function to clean and preprocess text
def clean_text(text):
    """Clean and normalize text."""
    if not text or not isinstance(text, str):
        return ""
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove page numbers and headers/footers (patterns like "Page X of Y")
    text = re.sub(r'Page \d+ of \d+', '', text)
    
    # Remove reference notations like [1], [2,3], etc.
    text = re.sub(r'\[\d+(,\s*\d+)*\]', '', text)
    
    # Remove common PDF artifacts
    text = re.sub(r'https?://\S+', '', text)  # URLs
    
    return text

# Function to chunk text into semantic units
def chunk_text(text, max_chunk_size=500, overlap=100):
    """Split text into overlapping chunks of roughly equal size while respecting sentence boundaries."""
    if not text:
        return []
    
    # Split into sentences (simple approach)
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        # If adding this sentence would exceed chunk size and we already have content
        if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
            chunks.append(current_chunk.strip())
            # Keep some overlap for context
            overlap_point = max(0, len(current_chunk) - overlap)
            current_chunk = current_chunk[overlap_point:] + sentence
        else:
            current_chunk += " " + sentence
    
    # Don't forget the last chunk
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    
    return chunks

# Function to extract and store text from PDF with improved chunking
# Function to extract and store text from PDF with improved chunking
def store_pdf_in_chromadb(pdf_path):
    """Extract text from PDF and store in a document-specific ChromaDB collection"""
    print(f"Processing PDF: {pdf_path}")
    
    # Generate a unique collection name based on the PDF filename
    collection_name = get_collection_name(pdf_path)
    
    # Try to get existing collection or create new one
    try:
        # Check if collection exists
        existing_collections = chroma_client.list_collections()
        collection_exists = any(c.name == collection_name for c in existing_collections)
        
        if collection_exists:
            print(f"Collection {collection_name} already exists. Will use existing collection.")
            collection = chroma_client.get_collection(name=collection_name)
            
            # Extract full text for ROUGE evaluation (we still need this)
            full_text = ""
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        full_text += " " + text
            full_text = clean_text(full_text)
            
            return collection_name, full_text
        else:
            # Create new collection
            collection = chroma_client.create_collection(name=collection_name)
            print(f"Created new collection: {collection_name}")
            
            # Extract full text from PDF
            full_text = ""
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        full_text += " " + text
            
            # Clean the full text
            full_text = clean_text(full_text)
            
            # Chunk the text into semantic units
            chunks = chunk_text(full_text)
            print(f"Created {len(chunks)} semantic chunks from document")
            
            # Store chunks in ChromaDB with meaningful ids
            for i, chunk in enumerate(chunks):
                collection.add(
                    documents=[chunk],
                    metadatas=[{"chunk_id": i, "source": os.path.basename(pdf_path)}],
                    ids=[f"chunk_{i}"]
                )
            
            print(f"Successfully stored {len(chunks)} chunks in ChromaDB collection '{collection_name}'")
            return collection_name, full_text
            
    except Exception as e:
        print(f"Error handling collection: {e}")
        
        # As a fallback, try a different collection name with a timestamp
        fallback_name = f"{collection_name}_{int(time.time())}"
        print(f"Attempting fallback with collection name: {fallback_name}")
        
        try:
            collection = chroma_client.create_collection(name=fallback_name)
            
            # Extract full text from PDF
            full_text = ""
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:
                        full_text += " " + text
            
            # Clean the full text
            full_text = clean_text(full_text)
            
            # Chunk the text into semantic units
            chunks = chunk_text(full_text)
            
            # Store chunks in ChromaDB with meaningful ids
            for i, chunk in enumerate(chunks):
                collection.add(
                    documents=[chunk],
                    metadatas=[{"chunk_id": i, "source": os.path.basename(pdf_path)}],
                    ids=[f"chunk_{i}"]
                )
            
            return fallback_name, full_text
        except Exception as e2:
            print(f"Fatal error with ChromaDB: {e2}")
            raise

# ROUGE Score Implementation
def calculate_rouge_scores(reference_text, generated_text):
    """
    Calculate ROUGE-N scores for n=1, n=2, and ROUGE-L score.
    
    Args:
        reference_text (str): The original text
        generated_text (str): The generated text to evaluate
        
    Returns:
        dict: Dictionary containing ROUGE-1, ROUGE-2, and ROUGE-L scores
    """
    # Ensure NLTK punkt tokenizer is available
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)
    
    # Tokenize texts
    reference_tokens = nltk.word_tokenize(reference_text.lower())
    generated_tokens = nltk.word_tokenize(generated_text.lower())
    
    # ROUGE-1 (unigrams)
    reference_unigrams = Counter(reference_tokens)
    generated_unigrams = Counter(generated_tokens)
    common_unigrams = reference_unigrams & generated_unigrams
    
    rouge_1_precision = sum(common_unigrams.values()) / max(1, sum(generated_unigrams.values()))
    rouge_1_recall = sum(common_unigrams.values()) / max(1, sum(reference_unigrams.values()))
    rouge_1_f1 = 2 * rouge_1_precision * rouge_1_recall / max(1e-10, rouge_1_precision + rouge_1_recall)
    
    # ROUGE-2 (bigrams)
    reference_bigrams = Counter(list(ngrams(reference_tokens, 2)))
    generated_bigrams = Counter(list(ngrams(generated_tokens, 2)))
    common_bigrams = reference_bigrams & generated_bigrams
    
    rouge_2_precision = sum(common_bigrams.values()) / max(1, sum(generated_bigrams.values()))
    rouge_2_recall = sum(common_bigrams.values()) / max(1, sum(reference_bigrams.values()))
    rouge_2_f1 = 2 * rouge_2_precision * rouge_2_recall / max(1e-10, rouge_2_precision + rouge_2_recall)
    
    # ROUGE-L (Longest Common Subsequence)
    lcs_length = compute_lcs_length(reference_tokens, generated_tokens)
    
    rouge_l_precision = lcs_length / max(1, len(generated_tokens))
    rouge_l_recall = lcs_length / max(1, len(reference_tokens))
    rouge_l_f1 = 2 * rouge_l_precision * rouge_l_recall / max(1e-10, rouge_l_precision + rouge_l_recall)
    
    return {
        "ROUGE-1": {
            "precision": rouge_1_precision,
            "recall": rouge_1_recall,
            "f1": rouge_1_f1
        },
        "ROUGE-2": {
            "precision": rouge_2_precision,
            "recall": rouge_2_recall,
            "f1": rouge_2_f1
        },
        "ROUGE-L": {
            "precision": rouge_l_precision,
            "recall": rouge_l_recall,
            "f1": rouge_l_f1
        }
    }

def compute_lcs_length(X, Y):
    """Compute the length of the Longest Common Subsequence (LCS) between two token lists"""
    m, n = len(X), len(Y)
    
    # Create LCS matrix
    L = [[0] * (n + 1) for _ in range(m + 1)]
    
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                L[i][j] = 0
            elif X[i-1] == Y[j-1]:
                L[i][j] = L[i-1][j-1] + 1
            else:
                L[i][j] = max(L[i-1][j], L[i][j-1])
    
    return L[m][n]

# Function to find relevant chunks for evaluation
def get_relevant_chunks_for_query(query, full_text, window_size=3000):
    """
    Find the most relevant chunk of text from the PDF for the given query.
    This provides context for ROUGE evaluation.
    """
    # Ensure NLTK punkt tokenizer is available
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', quiet=True)
    
    # Tokenize the full text
    sentences = nltk.sent_tokenize(full_text)
    
    # Create sliding windows of sentences to find the most relevant section
    chunks = []
    for i in range(0, len(sentences), window_size // 50):  # Approximate words per sentence
        end_idx = min(i + window_size // 50, len(sentences))
        chunk = " ".join(sentences[i:end_idx])
        chunks.append(chunk)
    
    # Basic keyword matching to find most relevant chunk (simplified approach)
    query_keywords = set(nltk.word_tokenize(query.lower()))
    
    best_score = 0
    best_chunk = ""
    
    for chunk in chunks:
        chunk_tokens = set(nltk.word_tokenize(chunk.lower()))
        overlap = len(query_keywords.intersection(chunk_tokens))
        score = overlap / max(1, len(query_keywords))
        
        if score > best_score:
            best_score = score
            best_chunk = chunk
    
    # If no good match found, return the first chunk
    if best_score < 0.1 and chunks:
        return chunks[0]
    
    return best_chunk

# Function to retrieve relevant content using keywords
def retrieve_relevant_chunks(query, collection_name, n_results=10):
    """Retrieve the most relevant chunks for a given query from a specific collection."""
    try:
        # Get the specific collection for this PDF
        collection = chroma_client.get_collection(name=collection_name)
        
        results = collection.query(
            query_texts=[query], 
            n_results=n_results
        )
        
        if results["documents"] and len(results["documents"][0]) > 0:
            # Return all retrieved chunks
            return results["documents"][0]
        else:
            return ["No relevant content found."]
            
    except Exception as e:
        print(f"Error retrieving chunks: {e}")
        return ["Error retrieving content."]

# Enhance query with specific focus areas
def enhance_query(section, query):
    """Add context to the query based on the section."""
    enhancements = {
        "Title & Introduction": "Provide a natural flowing paragraph about the document title, research questions, and introductory context. Do not use any subsection headers or asterisks.",
        "Findings & Results": "Extract specific outcomes, measurements, and discoveries in a narrative format. Include key statistics if available. Avoid using subsection headers or question-answer format.",
        "Methods & Approaches": "Describe the specific techniques, algorithms, experiments, or methodological approaches in flowing paragraphs. Do not use subsection markers or bullet points.",
    }
    
    if section in enhancements:
        return f"{query}\n\nAdditional guidance: {enhancements[section]}"
    return query

# Function to generate naturally flowing sections with improved prompting
def generate_presentation_sections(pdf_path):
    structured_presentation = {}
    rouge_scores = {}  # To store ROUGE scores for each section
    
    # Process PDF and get collection name specific to this PDF
    collection_name, full_text = store_pdf_in_chromadb(pdf_path)
    
    # Generate each section
    for section, base_query in PRESENTATION_QUERIES.items():
        print(f"Processing section: {section}")
        
        # Enhance the query with section-specific guidance
        enhanced_query = enhance_query(section, base_query)
        
        # Retrieve relevant content from the PDF-specific collection
        retrieved_chunks = retrieve_relevant_chunks(enhanced_query, collection_name)
        
        # Skip if no relevant content
        if not retrieved_chunks or all(chunk == "No relevant content found." for chunk in retrieved_chunks):
            structured_presentation[section] = "No relevant information found in the document."
            rouge_scores[section] = {
                "ROUGE-1": {"f1": 0.0},
                "ROUGE-2": {"f1": 0.0},
                "ROUGE-L": {"f1": 0.0}
            }
            continue
        
        # Combine chunks for context but limit total size
        combined_text = " ".join(retrieved_chunks)
        if len(combined_text) > 4000:  # Limit context size for LLM
            combined_text = combined_text[:4000] + "..."
        
        # Craft detailed prompt for each section with narrative flow instructions
        prompt = f"""
        I need to extract content for the "{section}" section of a presentation based on a scientific document.

        Query: {base_query}
        
        Document content:
        {combined_text}
        
        Create 2-3 naturally flowing paragraphs that address the query. IMPORTANT:
        1. DO NOT use subsection headers with asterisks (like **Title** or **Main Research Question**)
        2. DO NOT format as question-answer pairs
        3. Create smooth transitions between ideas
        4. Maintain an academic but narrative tone
        5. Only include information from the document
        6. MAXIMIZE the use of KEY TERMINOLOGY from the original text
        7. Preserve as many of the original phrases and technical terms as possible
        8. Ensure comprehensive coverage of all main points relevant to the query
        
        The output should read like a coherent mini-essay suitable for a presentation slide.
        """
        
        # Throttle API calls to prevent rate limiting
        time.sleep(0.5)
        
        # Get response from LLM
        try:
            response = ollama.chat(
                model="llama3.1",
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt}
                ]
            )["message"]["content"]
            
            # Clean and format the response
            cleaned_response = clean_text(response)
            
            # Remove any "based on the document" phrases and LLM disclaimers
            cleaned_response = re.sub(r'Based on the (provided|document|given|available) (content|text|information|document)', '', cleaned_response)
            cleaned_response = re.sub(r'From the (provided|document|given|available) (content|text|information|document)', '', cleaned_response)
            cleaned_response = re.sub(r'According to the (provided|document|given|available) (content|text|information|document)', '', cleaned_response)
            cleaned_response = re.sub(r'The document (states|mentions|indicates|suggests|notes|describes|discusses|presents|shows|reports|provides|explains)', '', cleaned_response)
            
            # Remove any model disclaimers
            cleaned_response = re.sub(r'I don\'t have enough information to.*', '', cleaned_response)
            cleaned_response = re.sub(r'The text doesn\'t (specify|mention|provide|include).*', '', cleaned_response)
            
            # Remove any remaining double asterisks subsection markers
            cleaned_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', cleaned_response)
            
            # Final cleanup and store
            structured_presentation[section] = cleaned_response.strip()
            
            # Find the most relevant section in the original text for ROUGE evaluation
            relevant_reference_text = get_relevant_chunks_for_query(base_query, full_text)
            
            # Calculate ROUGE scores
            section_rouge_scores = calculate_rouge_scores(relevant_reference_text, cleaned_response)
            rouge_scores[section] = section_rouge_scores
            
            # Print ROUGE scores for this section
            print(f"  ROUGE scores for '{section}':")
            print(f"    ROUGE-1 F1: {section_rouge_scores['ROUGE-1']['f1']:.4f}")
            print(f"    ROUGE-2 F1: {section_rouge_scores['ROUGE-2']['f1']:.4f}")
            print(f"    ROUGE-L F1: {section_rouge_scores['ROUGE-L']['f1']:.4f}")
            
        except Exception as e:
            print(f"Error generating content for '{section}': {e}")
            structured_presentation[section] = f"Error extracting content for this section: {str(e)}"
            rouge_scores[section] = {
                "ROUGE-1": {"f1": 0.0},
                "ROUGE-2": {"f1": 0.0},
                "ROUGE-L": {"f1": 0.0}
            }
    
    return structured_presentation, rouge_scores

# Main execution
def main():
    # Define PDF path
    pdf_path = r'C:\Users\pandl\OneDrive\Desktop\FYP\Sarcoma.pdf'
    
    # Generate structured presentation content with ROUGE evaluation
    presentation_content, rouge_scores = generate_presentation_sections(pdf_path)
    
    # Calculate average ROUGE scores across all sections
    avg_rouge_1 = sum(score["ROUGE-1"]["f1"] for score in rouge_scores.values()) / len(rouge_scores)
    avg_rouge_2 = sum(score["ROUGE-2"]["f1"] for score in rouge_scores.values()) / len(rouge_scores)
    avg_rouge_l = sum(score["ROUGE-L"]["f1"] for score in rouge_scores.values()) / len(rouge_scores)
    
    print("\n===== ROUGE Score Summary =====")
    print(f"Average ROUGE-1 F1: {avg_rouge_1:.4f}")
    print(f"Average ROUGE-2 F1: {avg_rouge_2:.4f}")
    print(f"Average ROUGE-L F1: {avg_rouge_l:.4f}")
    print("===============================")
    
    # Save structured content and ROUGE scores
    output_file = r"C:\Users\pandl\OneDrive\Desktop\FYP\presentation_content.txt"
    rouge_output_file = r"C:\Users\pandl\OneDrive\Desktop\FYP\rouge_scores.txt"
    
    with open(output_file, "w", encoding="utf-8") as f:
        for section, content in presentation_content.items():
            print(f"\n🔹 {section}:\n{content}\n")
            f.write(f"{section}:\n{content}\n\n")
    
    with open(rouge_output_file, "w", encoding="utf-8") as f:
        f.write("===== ROUGE Score Summary =====\n")
        f.write(f"Average ROUGE-1 F1: {avg_rouge_1:.4f}\n")
        f.write(f"Average ROUGE-2 F1: {avg_rouge_2:.4f}\n")
        f.write(f"Average ROUGE-L F1: {avg_rouge_l:.4f}\n")
        f.write("===============================\n\n")
        
        for section, scores in rouge_scores.items():
            f.write(f"{section}:\n")
            f.write(f"  ROUGE-1: P={scores['ROUGE-1']['precision']:.4f}, R={scores['ROUGE-1']['recall']:.4f}, F1={scores['ROUGE-1']['f1']:.4f}\n")
            f.write(f"  ROUGE-2: P={scores['ROUGE-2']['precision']:.4f}, R={scores['ROUGE-2']['recall']:.4f}, F1={scores['ROUGE-2']['f1']:.4f}\n")
            f.write(f"  ROUGE-L: P={scores['ROUGE-L']['precision']:.4f}, R={scores['ROUGE-L']['recall']:.4f}, F1={scores['ROUGE-L']['f1']:.4f}\n\n")
    
    print(f"\n✅ Extracted structured content saved to {output_file}")
    print(f"✅ ROUGE scores saved to {rouge_output_file}")

# Support processing multiple PDFs
def process_multiple_pdfs(pdf_paths):
    """Process multiple PDFs and save individual results"""
    all_rouge_scores = {}
    
    for pdf_path in pdf_paths:
        # Create output filename based on PDF name
        pdf_basename = os.path.basename(pdf_path).split('.')[0]
        output_file = f"{os.path.dirname(pdf_path)}/{pdf_basename}_presentation.txt"
        rouge_output_file = f"{os.path.dirname(pdf_path)}/{pdf_basename}_rouge_scores.txt"
        
        print(f"\n📄 Processing PDF: {pdf_path}")
        
        # Generate content for this specific PDF with ROUGE evaluation
        presentation_content, rouge_scores = generate_presentation_sections(pdf_path)
        all_rouge_scores[pdf_basename] = rouge_scores
        
        # Calculate average ROUGE scores for this PDF
        avg_rouge_1 = sum(score["ROUGE-1"]["f1"] for score in rouge_scores.values()) / len(rouge_scores)
        avg_rouge_2 = sum(score["ROUGE-2"]["f1"] for score in rouge_scores.values()) / len(rouge_scores)
        avg_rouge_l = sum(score["ROUGE-L"]["f1"] for score in rouge_scores.values()) / len(rouge_scores)
        
        print(f"\n===== ROUGE Score Summary for {pdf_basename} =====")
        print(f"Average ROUGE-1 F1: {avg_rouge_1:.4f}")
        print(f"Average ROUGE-2 F1: {avg_rouge_2:.4f}")
        print(f"Average ROUGE-L F1: {avg_rouge_l:.4f}")
        
        # Save structured content
        with open(output_file, "w", encoding="utf-8") as f:
            for section, content in presentation_content.items():
                print(f"\n🔹 {section}:\n{content}\n")
                f.write(f"{section}:\n{content}\n\n")
        
        # Save ROUGE scores
        with open(rouge_output_file, "w", encoding="utf-8") as f:
            f.write(f"===== ROUGE Score Summary for {pdf_basename} =====\n")
            f.write(f"Average ROUGE-1 F1: {avg_rouge_1:.4f}\n")
            f.write(f"Average ROUGE-2 F1: {avg_rouge_2:.4f}\n")
            f.write(f"Average ROUGE-L F1: {avg_rouge_l:.4f}\n")
            f.write("===============================\n\n")
            
            for section, scores in rouge_scores.items():
                f.write(f"{section}:\n")
                f.write(f"  ROUGE-1: P={scores['ROUGE-1']['precision']:.4f}, R={scores['ROUGE-1']['recall']:.4f}, F1={scores['ROUGE-1']['f1']:.4f}\n")
                f.write(f"  ROUGE-2: P={scores['ROUGE-2']['precision']:.4f}, R={scores['ROUGE-2']['recall']:.4f}, F1={scores['ROUGE-2']['f1']:.4f}\n")
                f.write(f"  ROUGE-L: P={scores['ROUGE-L']['precision']:.4f}, R={scores['ROUGE-L']['recall']:.4f}, F1={scores['ROUGE-L']['f1']:.4f}\n\n")
        
        print(f"\n✅ Extracted content saved to {output_file}")
        print(f"✅ ROUGE scores saved to {rouge_output_file}")
    
    # Generate comparative report for multiple PDFs
    if len(pdf_paths) > 1:
        comparative_report_file = f"{os.path.dirname(pdf_paths[0])}/comparative_rouge_report.txt"
        
        with open(comparative_report_file, "w", encoding="utf-8") as f:
            f.write("===== Comparative ROUGE Score Report =====\n\n")
            
            # Calculate and write overall averages
            for pdf_basename, rouge_scores in all_rouge_scores.items():
                avg_rouge_1 = sum(score["ROUGE-1"]["f1"] for score in rouge_scores.values()) / len(rouge_scores)
                avg_rouge_2 = sum(score["ROUGE-2"]["f1"] for score in rouge_scores.values()) / len(rouge_scores)
                avg_rouge_l = sum(score["ROUGE-L"]["f1"] for score in rouge_scores.values()) / len(rouge_scores)
                
                f.write(f"Document: {pdf_basename}\n")
                f.write(f"  Average ROUGE-1 F1: {avg_rouge_1:.4f}\n")
                f.write(f"  Average ROUGE-2 F1: {avg_rouge_2:.4f}\n")
                f.write(f"  Average ROUGE-L F1: {avg_rouge_l:.4f}\n\n")
        
        print(f"\n✅ Comparative ROUGE report saved to {comparative_report_file}")

if __name__ == "__main__":
    main()
    
    # Example of processing multiple PDFs:
    # pdf_paths = [
    #     r'C:\Users\pandl\OneDrive\Desktop\FYP\Stock_Market_Prediction.pdf',
    #     r'C:\Users\pandl\OneDrive\Desktop\FYP\Climate_Change_Research.pdf'
    # ]
    # process_multiple_pdfs(pdf_paths)

Processing PDF: C:\Users\pandl\OneDrive\Desktop\FYP\Sarcoma.pdf
Error handling collection: In Chroma v0.6.0, list_collections only returns collection names. Use Client.get_collection(doc_collection_864eccd4ee_1745473552) to access name. See https://docs.trychroma.com/deployment/migration for more information.
Attempting fallback with collection name: doc_collection_864eccd4ee_1745474897
Processing section: Title & Introduction
  ROUGE scores for 'Title & Introduction':
    ROUGE-1 F1: 0.2302
    ROUGE-2 F1: 0.0928
    ROUGE-L F1: 0.1178
Processing section: Key Topics & Sections
  ROUGE scores for 'Key Topics & Sections':
    ROUGE-1 F1: 0.2619
    ROUGE-2 F1: 0.0930
    ROUGE-L F1: 0.1187
Processing section: Definitions & Key Terms
  ROUGE scores for 'Definitions & Key Terms':
    ROUGE-1 F1: 0.2032
    ROUGE-2 F1: 0.0404
    ROUGE-L F1: 0.0991
Processing section: Methods & Approaches
  ROUGE scores for 'Methods & Approaches':
    ROUGE-1 F1: 0.1960
    ROUGE-2 F1: 0.0676
    ROUGE-L F