# In [3]:  (Jupyter notebook cell marker left over from export)
from pptx import Presentation
from pptx.util import Pt, Inches
from pptx.enum.text import MSO_AUTO_SIZE, PP_ALIGN
import re
import os
import ollama
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json


# Make sure the NLTK data packages are available locally.
# All lookups are attempted in order; as soon as one raises LookupError,
# all three packages are downloaded (same all-or-nothing behaviour as before).
try:
    for _resource_path in ('tokenizers/punkt', 'corpora/stopwords', 'corpora/wordnet'):
        nltk.data.find(_resource_path)
except LookupError:
    for _package_name in ('punkt', 'stopwords', 'wordnet'):
        nltk.download(_package_name)

# File locations. These are absolute, machine-specific Windows paths.
# NOTE(review): content_file_path and pptx_output_path are not referenced in the
# part of the file visible here; presumably they are the input text file and the
# output .pptx used by the driver code -- confirm against the call sites.
content_file_path = r"C:\Users\pandl\OneDrive\Desktop\FYP\presentation_content.txt"
pptx_output_path = r"C:\Users\pandl\OneDrive\Desktop\FYP\Generated_Presentation_new.pptx"
# Folder holding the image files; add_slide joins image names onto this path.
image_folder = r"C:\Users\pandl\OneDrive\Desktop\FYP\Extracted"

# Layout constants
MAX_CHARS_PER_SLIDE = 500  # Character budget passed to split_text() by add_slide()
BODY_FONT_SIZE = Pt(18)    # Font size applied to every body paragraph in add_slide()
IMAGE_WIDTH = Inches(4)    # Standard image width (not referenced in the visible code)
MAX_IMAGES_PER_SLIDE = 1   # Maximum number of images per slide (not referenced in the visible code)

# Sections to exclude entirely from the presentation.
# Entries are compared as case-insensitive substrings of the section title, so the
# shorter entries ("Definition", "Key Part") already cover the longer variants.
SECTIONS_TO_EXCLUDE = [
    "Definition", "Key Part", "Key Parts", "Definitions", 
    "Definitions and key", "Definitions and Key", 
    "Definition and key", "Definition and Key"
]

# Title renaming mappings: rename_title() replaces the key substring with the
# value for the first key that occurs in a title.
TITLE_RENAMES = {
    "Key topics and": "Key Topics",
    "Important Statistics and": "Important Statistics",
    "Future Scope and": "Future Scope"
}

def describe_images_in_folder(folder_path):
    """Describe every image file in ``folder_path`` using the llava model via ollama.

    Only entries whose name ends with a known image extension
    (case-insensitive) are processed.  Returns a dict mapping the file name
    to the model's stripped reply; if anything goes wrong for a file, the
    value is the literal string "Error processing image".
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp")

    # Prompt tailored to technical presentations.
    analysis_prompt = """Please analyze this image in detail for a technical presentation:
1. CONTENT TYPE: Is this a diagram/flowchart, graph/chart, architectural design, conceptual illustration, or something else?
2. SECTION CATEGORIZATION: Would this image best fit in (Introduction/Background, Methodology, Results/Findings, or Key Topics)?
3. TECHNICAL ELEMENTS: What specific technical elements, components, or data are shown?
4. VISIBLE TEXT: List any important text visible in the image.
5. TECHNICAL DOMAIN: What scientific or technical domain does this relate to (ML/AI, medical, engineering, etc.)?"""

    descriptions = {}
    for entry in os.listdir(folder_path):
        # Skip anything that is not an image file.
        if not entry.lower().endswith(image_suffixes):
            continue

        full_path = os.path.join(folder_path, entry)
        print(f"Processing: {entry}")

        try:
            reply = ollama.chat(
                model="llava",
                messages=[{
                    "role": "user",
                    "content": analysis_prompt,
                    "images": [full_path],
                }],
            )
            descriptions[entry] = reply.get("message", {}).get("content", "").strip()
        except Exception as e:
            # Best effort: record a placeholder so the caller still sees the file.
            print(f"❌ Error processing {entry}: {e}")
            descriptions[entry] = "Error processing image"

    return descriptions

def categorize_image_by_type(description):
    """Pick the image category whose keywords occur most often in ``description``.

    Matching is a case-insensitive substring test.  Each matching keyword is
    worth one point; four methodology keywords (diagram, architecture,
    workflow, framework) are worth two extra points each.  Ties go to the
    category listed first.  Returns "general" when nothing matches.
    """
    keyword_table = {
        "methodology": ["architecture", "diagram", "workflow", "process", "flowchart",
                        "pipeline", "framework", "system design", "steps", "algorithm",
                        "method", "approach", "procedure", "implementation", "structure",
                        "design", "model architecture", "model structure", "component"],
        "results": ["graph", "plot", "chart", "result", "performance", "accuracy",
                    "metric", "evaluation", "comparison", "outcome", "data visualization"],
        "introduction": ["concept", "overview", "introduction", "background"],
        "key_topics": ["key", "topics", "important", "main", "highlight"],
    }
    boosted_terms = {"diagram", "architecture", "workflow", "framework"}

    haystack = description.lower()
    tally = {}
    for label, terms in keyword_table.items():
        hits = [term for term in terms if term.lower() in haystack]
        points = len(hits)
        if label == "methodology":
            # Extra weight for the typical methodology-diagram vocabulary.
            points += 2 * sum(1 for term in hits if term in boosted_terms)
        tally[label] = points

    # max() keeps the first of equally scored categories (dict order).
    winner, top_points = max(tally.items(), key=lambda pair: pair[1])
    return winner if top_points > 0 else "general"

def extract_keywords(text, importance_multiplier=1):
    """Return a ``{word: weight}`` dict for the meaningful words in ``text``.

    The text is lower-cased and punctuation is replaced by spaces.  Stopwords
    and words of one or two characters are ignored.  Every occurrence adds
    the word's base weight (2 or 3 for known domain terms, otherwise 1)
    multiplied by ``importance_multiplier``.
    """
    normalized = re.sub(r'[^\w\s]', ' ', text.lower())

    # Words that carry no topical meaning.
    ignored = {
        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what', 'when',
        'where', 'how', 'why', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
        'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
        'having', 'do', 'does', 'did', 'doing', 'to', 'from', 'by', 'for', 'with', 'about',
        'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
        'below', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
        'there', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
        'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
        'can', 'will', 'just', 'should', 'now', 'would', 'could', 'may', 'might',
    }

    # Base weights for domain vocabulary; anything else weighs 1.
    domain_weights = {
        # Medical/health terms
        'cancer': 3, 'tumor': 3, 'sarcoma': 3, 'tissue': 2, 'patient': 2, 'clinical': 2,
        'medical': 2, 'health': 2, 'diagnosis': 3, 'treatment': 2, 'therapy': 2,
        'prognosis': 3, 'survival': 3, 'pathology': 3, 'oncology': 3,

        # Machine learning/data science terms
        'algorithm': 3, 'model': 2, 'classifier': 3, 'classification': 3, 'prediction': 2,
        'accuracy': 2, 'precision': 2, 'recall': 2, 'feature': 2, 'training': 2,
        'validation': 2, 'testing': 2, 'dataset': 2, 'data': 2, 'machine': 2,
        'learning': 2, 'neural': 3, 'network': 2, 'deep': 2, 'ensemble': 3,

        # Research/academic terms
        'research': 2, 'study': 2, 'analysis': 2, 'results': 2, 'method': 2,
        'methodology': 2, 'conclusion': 2, 'findings': 2, 'literature': 2,
        'review': 2, 'paper': 2, 'publication': 2, 'journal': 2, 'statistical': 2,
    }

    weighted = {}
    for token in normalized.split():
        if len(token) <= 2 or token in ignored:
            continue
        gain = domain_weights.get(token, 1) * importance_multiplier
        weighted[token] = weighted.get(token, 0) + gain

    return weighted

def calculate_image_relevance(slide_title, slide_content, image_description):
    """Score how well an image description matches a slide.

    Keywords are extracted from the title (weight x3), the slide body and the
    image description.  For every word shared by slide and image, the slide
    weight is multiplied by the image weight; the sum is divided by the
    number of distinct slide keywords (at least 1).

    Returns:
        ``(normalized_score, top_matches)`` where ``top_matches`` holds up to
        five ``(word, score)`` tuples, highest score first.
    """
    title_kw = extract_keywords(slide_title, importance_multiplier=3)
    body_kw = extract_keywords(slide_content, importance_multiplier=1)
    picture_kw = extract_keywords(image_description, importance_multiplier=1)

    # Merge title and body weights into a single slide vocabulary.
    combined = dict(title_kw)
    for term, weight in body_kw.items():
        combined[term] = combined.get(term, 0) + weight

    # Product of the weights for every word both sides share.
    overlaps = [
        (term, weight * picture_kw[term])
        for term, weight in combined.items()
        if term in picture_kw
    ]
    total = sum(score for _, score in overlaps)

    # Normalise so that longer slides are not favoured.
    normalized = total / max(len(combined), 1)

    overlaps.sort(key=lambda pair: pair[1], reverse=True)
    return normalized, overlaps[:5]

def find_best_image_for_slide(slide_title, slide_content, image_descriptions, used_images):
    """Return the name of the unused image that best fits a slide, or None.

    The section type is inferred from keywords in the title (falling back to
    methodology keywords in the body).  Each unused image gets the score from
    ``calculate_image_relevance``, doubled when its category equals the
    section type and doubled again for diagram-like descriptions in a
    methodology section.  Methodology sections prefer the best diagram-like
    image when its score exceeds 0.2; otherwise the best overall image is
    returned if it reaches the threshold (0.3 for methodology, 0.5 else).
    Returns None immediately when the title or the content is blank.
    """
    if not (slide_title.strip() and slide_content.strip()):
        return None

    # (section type, title keywords) -- checked in this order, first hit wins.
    title_rules = (
        ("methodology", ("methodology", "method", "approach", "process", "procedure", "implementation")),
        ("results", ("result", "finding", "outcome", "performance")),
        ("introduction", ("introduction", "background", "overview")),
        ("key_topics", ("key topic", "main point", "highlight")),
    )
    lowered_title = slide_title.lower()
    section_type = next(
        (label for label, terms in title_rules
         if any(term in lowered_title for term in terms)),
        "general",
    )

    # Undetermined titles may still be methodology slides judging by the body.
    if section_type == "general":
        lowered_body = slide_content.lower()
        body_terms = ("methodology", "method", "approach", "process", "procedure")
        if any(term in lowered_body for term in body_terms):
            section_type = "methodology"

    candidates = {
        name: desc for name, desc in image_descriptions.items()
        if name not in used_images
    }
    if not candidates:
        return None

    is_methodology = section_type == "methodology"
    diagram_terms = ("diagram", "architecture", "workflow", "framework", "process")

    top_name, top_score = None, 0
    top_diagram, top_diagram_score = None, 0

    for name, desc in candidates.items():
        category = categorize_image_by_type(desc)
        score, _ = calculate_image_relevance(slide_title, slide_content, desc)

        if category == section_type:
            score *= 2.0  # category agrees with the section

        if is_methodology and any(term in desc.lower() for term in diagram_terms):
            score *= 2.0  # diagrams are favoured in methodology sections
            if score > top_diagram_score:
                top_diagram_score = score
                top_diagram = name

        if score > top_score:
            top_score = score
            top_name = name

    # A reasonably scored diagram wins outright in methodology sections.
    if is_methodology and top_diagram and top_diagram_score > 0.2:
        print(f"Selected methodology diagram '{top_diagram}' for slide '{slide_title}'")
        return top_diagram

    threshold = 0.3 if is_methodology else 0.5
    if top_score >= threshold:
        print(f"Selected image '{top_name}' for slide '{slide_title}'")
        print(f"  Section type: {section_type}, Score: {top_score:.2f}")
        return top_name

    print(f"No sufficiently relevant image found for slide '{slide_title}'")
    return None
    
def validate_image_assignment(section_title, image_name, image_description):
    """Log advisory notes about an image/section pairing; always returns True.

    A diagram-like image outside a methodology-like section produces one
    note and returns immediately.  Otherwise a graph/chart/plot image in a
    section whose title mentions neither "result" nor "finding" produces a
    different note.  No assignment is ever rejected.
    """
    title_lc = section_title.lower()
    desc_lc = image_description.lower()

    diagram_like = any(
        term in desc_lc
        for term in ("architecture", "diagram", "workflow", "framework", "process")
    )
    in_methodology = any(
        term in title_lc
        for term in ("methodology", "method", "approach", "process", "procedure", "implementation")
    )
    if diagram_like and not in_methodology:
        # Advisory only -- the assignment is still accepted.
        print(f"Note: Architecture/diagram image '{image_name}' would be better in a methodology section, but allowing it.")
        return True

    chart_like = any(term in desc_lc for term in ("graph", "chart", "plot"))
    if chart_like and "result" not in title_lc and "finding" not in title_lc:
        print(f"Note: Graph/chart image '{image_name}' might not be ideal for '{section_title}', but allowing it.")

    return True

def preprocess_text(text):
    """Lower-case ``text``, keep purely alphabetic tokens and drop basic stopwords.

    Uses a regex tokenizer instead of NLTK.  Returns the surviving tokens
    joined by single spaces.
    """
    # Small built-in stopword list (extend as needed).
    stop_tokens = {'the', 'a', 'an', 'and', 'is', 'are', 'in', 'of', 'to', 'for', 'on', 'with', 'at', 'by', 'from', 'this', 'that', 'it', 'as'}

    kept = (
        token
        for token in re.findall(r'\b[a-z]+\b', text.lower())
        if token not in stop_tokens
    )
    return " ".join(kept)

def rename_title(title):
    """Apply the first matching ``TITLE_RENAMES`` substitution to ``title``.

    The mapping is scanned in order; for the first key found as a substring,
    every occurrence of that key is replaced by its value.  Titles without a
    match are returned unchanged.
    """
    match = next(
        ((old, new) for old, new in TITLE_RENAMES.items() if old in title),
        None,
    )
    if match is None:
        return title
    return title.replace(*match)

def read_presentation_content(file_path):
    """Parse a structured text file into ``{section title: section body}``.

    A non-empty line ending in ":" starts a new section (the colon is
    dropped); all other non-empty lines are collected as that section's body
    and joined with newlines.  Sections whose title contains any
    ``SECTIONS_TO_EXCLUDE`` entry (case-insensitive) are dropped, and titles
    are passed through ``rename_title`` before being used as keys.  Lines
    appearing before the first header are discarded.
    """
    sections = {}
    heading = None
    body = []

    def store(name, lines):
        # Record a finished section unless it is unnamed or excluded.
        if not name:
            return
        lowered = name.lower()
        if any(term.lower() in lowered for term in SECTIONS_TO_EXCLUDE):
            return
        sections[rename_title(name)] = "\n".join(lines)

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                continue  # blank lines carry no information

            if stripped.endswith(":"):
                # New header: flush the section collected so far.
                store(heading, body)
                heading = stripped[:-1]
                body = []
            else:
                body.append(stripped)

    # The final section has no following header to trigger the flush.
    store(heading, body)
    return sections

def clean_title(title):
    """Normalise a title: rename it, spell out ampersands, drop odd characters.

    After ``rename_title``, every "&" becomes "and"; then any character that
    is not a word character, whitespace, or one of ``- . , :`` is removed and
    the result is stripped.
    """
    renamed = rename_title(title)
    spelled_out = renamed.replace("&", "and")
    return re.sub(r'[^\w\s\-.,:]', '', spelled_out).strip()

def generate_short_title(long_title):
    """Condense a title to a short slide heading.

    The title is cleaned with ``clean_title``.  If one of its words equals
    (case-insensitively) a well-known section name, that word is returned
    capitalised; otherwise the first three words are returned.
    """
    section_words = ["introduction", "background", "conclusion", "references", "summary", "methodology", "results", "discussion", "findings"]

    tokens = clean_title(long_title).split()

    # First word that is a standard section name, if any.
    hit = next((token for token in tokens if token.lower() in section_words), None)
    if hit is not None:
        return hit.capitalize()

    return " ".join(tokens[:3])

def split_text(text, max_chars):
    """Split ``text`` into chunks that fit within ``max_chars`` characters.

    The text is split on newlines and paragraphs are packed greedily: a
    paragraph joins the current chunk (separated by a newline) as long as
    the chunk stays within ``max_chars``; otherwise a new chunk is started.
    A single paragraph that is longer than ``max_chars`` is wrapped on word
    boundaries so that it no longer produces an oversized chunk.

    Args:
        text: The text to split; paragraphs are separated by "\\n".
        max_chars: Maximum number of characters per chunk.  Wrapping of
            overlong paragraphs is only applied when this is positive.

    Returns:
        A list of stripped chunk strings (empty list for blank input).
    """
    import textwrap  # local import: only needed for overlong paragraphs

    slides_content = []
    current_slide_text = ""

    for para in text.split("\n"):
        pieces = [para]
        if max_chars > 0 and len(para) > max_chars:
            # Previously such a paragraph was emitted as one chunk exceeding
            # the limit; wrap it so every piece fits.
            pieces = textwrap.wrap(para, width=max_chars) or [para]

        for piece in pieces:
            if current_slide_text and len(current_slide_text) + len(piece) + 1 > max_chars:
                # Adding this piece would overflow: close the current chunk.
                slides_content.append(current_slide_text.strip())
                current_slide_text = piece
            elif current_slide_text:
                current_slide_text += "\n" + piece
            else:
                current_slide_text = piece

    # Don't forget the last chunk.
    if current_slide_text.strip():
        slides_content.append(current_slide_text.strip())

    return slides_content

def split_into_sentences(text):
    """Split ``text`` into sentences at ".", "!" or "?" followed by a space or newline.

    The terminating punctuation stays with its sentence, the single
    whitespace character after it is consumed, and every sentence is
    stripped.  Trailing text without a terminator is returned as a final
    sentence when it is not blank.
    """
    pieces = []
    start = 0

    # Non-overlapping left-to-right scan for "<punctuation><space|newline>".
    for boundary in re.finditer(r'[.!?][ \n]', text):
        pieces.append(text[start:boundary.start() + 1].strip())
        start = boundary.end()

    # Whatever follows the last boundary.
    tail = text[start:].strip()
    if tail:
        pieces.append(tail)

    return pieces

def filter_content_by_section(section_title, content):
    """Drop sentences that do not belong in the given section.

    The section kind is the first of "introduction", "background",
    "methodology", "conclusion" that occurs in the lower-cased title.  Each
    kind has its own list of regex patterns; definition-style patterns apply
    to every section, including unrecognised ones.  A sentence matching any
    active pattern (case-insensitive search) is removed.  The remaining
    sentences are joined with single spaces.
    """
    # Definition / key-part phrasing is unwanted everywhere.
    definition_rules = [
        r"is defined as",
        r"refers to",
        r"can be defined as",
        r"is a term used to describe",
        r"means",
        r"the definition of",
        r"key (concept|part|element|component)",
    ]
    early_section_rules = [
        r"key find(ing|ings)",
        r"result(s)?( show)?",
        r"conclusion",
        r"we conclude",
    ]
    rules_by_section = {
        "introduction": early_section_rules + definition_rules,
        "background": early_section_rules + definition_rules,
        "methodology": [
            r"key find(ing|ings)",
            r"conclusion",
            r"we conclude",
        ] + definition_rules,
        "conclusion": [
            r"introduce",
            r"background",
        ] + definition_rules,
    }

    # Unrecognised sections are still filtered for definitions.
    title_lc = section_title.lower()
    active_rules = next(
        (rules for key, rules in rules_by_section.items() if key in title_lc),
        definition_rules,
    )

    kept = [
        sentence
        for sentence in split_into_sentences(content)
        if not any(re.search(rule, sentence, re.IGNORECASE) for rule in active_rules)
    ]
    return " ".join(kept)

def clean_content(text):
    """Strip meta-commentary, definition sentences and asterisks from ``text``.

    Steps, in order:
      1. Remove a fixed list of boilerplate phrases.
      2. Split into sentences and drop every sentence matching a
         meta-commentary or definition pattern (case-insensitive).
      3. Collapse whitespace, remove all asterisks, and remove the phrases
         "the text" / "the document".
      4. Collapse whitespace again, because step 3 can leave double or
         leading/trailing spaces behind (e.g. "In the text we" -> "In  we",
         "* item" -> " item").

    Returns:
        The cleaned text as a single whitespace-normalised string.
    """
    # Boilerplate phrases that are removed verbatim.
    phrases_to_remove = [
        "The provided text appears to be",
        "This text appears to be",
        "The text you provided appears to be",
        "It appears that you've provided",
        "Here are some key points that can be identified from the text",
        "Here's a brief summary",
        "Here's a breakdown of the content",
        "Here are the key findings",
        "I'll provide a summary and highlight the main points",
        "Overall, the text provides",
        "Overall, this paper presents",
        "Here's a summary of the main points",
        "Here are some key points extracted from the text",
        "Some possible limitations or areas for further investigation include",
        "Some key contributions of the proposed research include",
        "Some potential improvements or follow-up questions based on this text could include"
    ]

    clean_text = text
    for phrase in phrases_to_remove:
        clean_text = clean_text.replace(phrase, "")

    # Sentences talking about the source document rather than the topic.
    meta_patterns = [
        r"The (text|document|paper|provided text) (appears to be|is|discusses|highlights|covers|reviews|presents)",
        r"It (appears|seems) (that|like)",
        r"The (author|authors) (provides|provide|highlights|highlight|discusses|discuss|aims|aim|mentions|mention)",
        r"This (appears|seems) to be",
        r"(From|Based on) (the|this) (text|content|document)"
    ]

    # Sentences that read like definitions or "key part" listings.
    definition_key_patterns = [
        r"is defined as",
        r"refers to",
        r"can be defined as",
        r"is a term used to describe",
        r"means",
        r"the definition of",
        r"key (concept|part|element|component)"
    ]

    drop_patterns = meta_patterns + definition_key_patterns

    filtered_sentences = [
        sentence
        for sentence in split_into_sentences(clean_text)
        if not any(re.search(pattern, sentence, re.IGNORECASE) for pattern in drop_patterns)
    ]

    # Rebuild the text
    clean_text = " ".join(filtered_sentences)

    # Normalise whitespace first so "the   text" is matched by the regex below.
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    # Remove ALL asterisks (both standalone and those used as bullet points)
    clean_text = clean_text.replace("*", "")

    # Remove any remaining references to "the text" or "the document"
    clean_text = re.sub(r'\b(the text|the document)\b', '', clean_text, flags=re.IGNORECASE)

    # The two removals above can leave gaps; normalise once more.
    return re.sub(r'\s+', ' ', clean_text).strip()

def prepare_content_for_slides(section_title, text):
    """Clean section text and regroup it into newline-separated paragraphs.

    The text goes through ``clean_content`` and ``filter_content_by_section``
    and is then split into sentences.  Sentences with fewer than three words
    are dropped.  Sentences are accumulated into a paragraph until the
    paragraph exceeds 20 words or the latest sentence ends with a period.
    """
    cleaned = clean_content(text)
    relevant = filter_content_by_section(section_title, cleaned)

    paragraphs = []
    pending = []  # sentences of the paragraph being built

    for raw_sentence in split_into_sentences(relevant):
        sentence = raw_sentence.strip()

        # Ignore blanks and very short fragments.
        if not sentence or len(sentence.split()) < 3:
            continue

        pending.append(sentence)
        joined = " ".join(pending)

        # Close the paragraph once it is long enough or ends on a full stop.
        if len(joined.split()) > 20 or sentence.endswith('.'):
            paragraphs.append(joined)
            pending = []

    # Flush whatever is left over.
    if pending:
        paragraphs.append(" ".join(pending))

    return "\n".join(paragraphs)

def add_slide(prs, title_text, content_text, image_descriptions, used_images):
    """Add one or more content slides for a section, with at most one image.

    The title is renamed via ``rename_title``.  The section is skipped when
    the title (or the generated short title) contains any
    ``SECTIONS_TO_EXCLUDE`` entry (case-insensitive substring match) or when
    no text survives ``prepare_content_for_slides``.  The remaining text is
    split with ``split_text`` using ``MAX_CHARS_PER_SLIDE``; each chunk becomes
    a slide based on layout index 1, and continuation slides get a "(contd.)"
    title suffix.  The image chosen by ``find_best_image_for_slide`` is placed
    on the first slide only, to the right of a narrowed text box.

    Args:
        prs: python-pptx ``Presentation`` the slides are appended to.
        title_text: Raw section title.
        content_text: Raw section body text.
        image_descriptions: Mapping of image file name -> description text.
        used_images: Set of image names already placed; the selected image
            is added to it in place.

    Returns:
        None.  Slides are added to ``prs`` as a side effect.
    """
    # Apply title renaming
    title_text = rename_title(title_text)

    # Skip sections whose title matches the exclusion list
    title_lower = title_text.lower()
    for exclude in SECTIONS_TO_EXCLUDE:
        if exclude.lower() in title_lower:
            print(f"Skipping slide with title: '{title_text}' (matches exclusion criteria)")
            return

    # Prepare content for slides (clean and format)
    formatted_content = prepare_content_for_slides(title_text, content_text)

    # If no content remains after filtering, skip this slide
    if not formatted_content.strip():
        print(f"Warning: No appropriate content for slide '{title_text}' after filtering")
        return

    # Split content into multiple slides if needed
    slides_data = split_text(formatted_content, MAX_CHARS_PER_SLIDE)

    # Clean title and generate short title
    cleaned_title_text = clean_title(title_text)
    short_title = generate_short_title(cleaned_title_text)

    # Check if the short title contains any exclusion terms
    short_title_lower = short_title.lower()
    for exclude in SECTIONS_TO_EXCLUDE:
        if exclude.lower() in short_title_lower:
            print(f"Skipping slide with short title: '{short_title}' (matches exclusion criteria)")
            return

    if not slides_data:  # Handle empty content case
        return

    # Try to find a single best image for all slides in this section
    best_image = find_best_image_for_slide(short_title, formatted_content, image_descriptions, used_images)
    if best_image:
        # Validate the image assignment
        if validate_image_assignment(short_title, best_image, image_descriptions[best_image]):
            used_images.add(best_image)
            print(f"Selected image '{best_image}' for slide '{short_title}'")
        else:
            best_image = None  # Don't use this image if validation fails

    slide_layout = prs.slide_layouts[1]  # Title & Content Layout
    slide_width = prs.slide_width
    slide_height = prs.slide_height

    for i, slide_text in enumerate(slides_data):
        # Create new slide
        slide = prs.slides.add_slide(slide_layout)

        # Set the title with proper formatting
        title = slide.shapes.title
        slide_title = short_title if i == 0 else f"{short_title} (contd.)"
        title.text = slide_title

        # Format title text - ensure it's visible
        title_text_frame = title.text_frame
        title_text_frame.auto_size = MSO_AUTO_SIZE.TEXT_TO_FIT_SHAPE
        for paragraph in title_text_frame.paragraphs:
            paragraph.font.size = Pt(32)
            paragraph.font.bold = True
            paragraph.alignment = PP_ALIGN.CENTER

        # Get content placeholder (idx 1).  The lookup raises KeyError when the
        # layout has no such placeholder, so the old truthiness check could
        # never fire; handle the real failure mode instead.
        try:
            content_shape = slide.placeholders[1]
        except KeyError:
            print(f"Warning: No content placeholder found for slide {i+1}")
            continue

        # Only the first slide of a section carries the image
        has_image = bool(best_image) and i == 0

        # Lay out text (left) and image (right) so they do not overlap
        if has_image:
            # Read the current geometry before overriding any of it
            original_left = content_shape.left
            original_top = content_shape.top
            original_height = content_shape.height

            # Reduce text width to 55% of slide width to make space for image
            new_text_width = int(slide_width * 0.55)

            # Reposition and resize content shape BEFORE adding text.
            # NOTE(review): the height is written back explicitly as well.  The
            # original stored it but never applied it; with python-pptx,
            # overriding only part of a placeholder's size can leave the other
            # dimension at 0 instead of the inherited value -- confirm against
            # the python-pptx placeholder documentation.
            content_shape.left = original_left
            content_shape.top = original_top
            content_shape.width = new_text_width
            content_shape.height = original_height

            # Calculate image dimensions and position (right side)
            img_width = int(slide_width * 0.38)  # 38% of slide width
            img_left = content_shape.left + content_shape.width + Inches(0.2)  # Spacing between text and image
            img_top = content_shape.top + Inches(0.1)  # Slight offset from content top

        # Process text content with proper sizing
        text_frame = content_shape.text_frame
        text_frame.clear()  # Clear any existing text

        # Control text frame properties
        text_frame.word_wrap = True
        text_frame.auto_size = MSO_AUTO_SIZE.SHAPE_TO_FIT_TEXT

        # Format content as properly formatted paragraphs
        lines = slide_text.strip().split("\n")

        # Only proceed if we have content
        if not lines or not lines[0].strip():
            continue

        for index, line in enumerate(lines):
            line = line.strip()
            if index == 0:
                # The cleared text frame still owns one empty paragraph
                p = text_frame.paragraphs[0]
            elif not line:
                continue
            else:
                p = text_frame.add_paragraph()

            p.text = line
            p.level = 0  # Set indentation level

            # Apply consistent formatting to all paragraphs
            p.font.size = BODY_FONT_SIZE
            p.font.bold = False
            p.space_after = Pt(10)

        # Now add the image if we have one, after text content is set
        if has_image:
            image_path = os.path.join(image_folder, best_image)
            # Add image to the slide if the file exists
            if os.path.exists(image_path):
                # Resolved before the try so the fallback below can always use it
                abs_image_path = os.path.abspath(image_path)
                try:
                    print(f"Adding image from path: {abs_image_path}")

                    # Add the image with the calculated dimensions to prevent overlap
                    picture = slide.shapes.add_picture(
                        abs_image_path,
                        left=img_left,
                        top=img_top,
                        width=img_width
                    )

                    # If the image is too tall, shrink it proportionally
                    if picture.height > (slide_height * 0.7):
                        height_ratio = (slide_height * 0.7) / picture.height
                        picture.height = int(picture.height * height_ratio)
                        picture.width = int(picture.width * height_ratio)

                    print(f"Successfully added image '{best_image}' to slide '{slide_title}'")
                except Exception as e:
                    print(f"Error adding image '{best_image}' to slide: {str(e)}")
                    # Try alternative approach if first method fails
                    try:
                        picture = slide.shapes.add_picture(
                            abs_image_path,
                            left=Inches(5),  # Fixed position as fallback
                            top=Inches(2),
                            width=Inches(4)
                        )
                        print(f"Added image using fallback method")
                    except Exception as e2:
                        print(f"Second attempt failed: {str(e2)}")

def create_presentation(presentation_content, image_descriptions):
    """Build the slide deck and write it to ``pptx_output_path``.

    A fixed title slide is created first (optionally decorated with the
    best-matching image), then one call to ``add_slide`` is made per section
    that survives renaming and the exclusion filter.

    Args:
        presentation_content: Mapping of section title -> section content.
        image_descriptions: Mapping of image filename -> description, used
            to pick images for slides.
    """
    deck = Presentation()

    # Shared across every slide so the same image is never picked twice.
    used_images = set()

    # ---- Title slide -----------------------------------------------------
    deck_title = "Ensemble Methods for High-Performance Classification"
    cover = deck.slides.add_slide(deck.slide_layouts[0])  # layout 0: title slide
    heading = cover.shapes.title
    subheading = cover.placeholders[1]  # subtitle placeholder

    heading.text = deck_title
    subheading.text = "A Study on Adult Soft Tissue Sarcomas"
    heading.text_frame.paragraphs[0].font.size = Pt(44)
    subheading.text_frame.paragraphs[0].font.size = Pt(28)

    # The title slide has no body text, so only the title drives the match.
    cover_image = find_best_image_for_slide(
        deck_title,
        "",
        image_descriptions,
        used_images
    )

    if cover_image:
        used_images.add(cover_image)
        cover_path = os.path.join(image_folder, cover_image)
        if os.path.exists(cover_path):
            # Anchor the picture in the bottom-right corner with a margin.
            pic_width = Inches(3)
            pic_left = deck.slide_width - pic_width - Inches(0.5)
            pic_top = deck.slide_height - Inches(2.5)

            try:
                cover.shapes.add_picture(cover_path, pic_left, pic_top, width=pic_width)
                print(f"Added image '{cover_image}' to title slide")
            except Exception as err:
                print(f"Error adding image to title slide: {err}")

    # ---- Content slides --------------------------------------------------
    for raw_section, content in presentation_content.items():
        section = rename_title(raw_section)

        # Case-insensitive substring match against the exclusion list.
        if any(pattern.lower() in section.lower() for pattern in SECTIONS_TO_EXCLUDE):
            print(f"Skipping section: '{section}' (matches exclusion criteria)")
            continue

        add_slide(deck, section, content, image_descriptions, used_images)

    # ---- Save ------------------------------------------------------------
    try:
        deck.save(pptx_output_path)
        print(f"✅ Presentation saved at: {pptx_output_path}")
    except PermissionError:
        # Target could not be written; retry under an alternate filename.
        fallback_path = pptx_output_path.replace(".pptx", "_new.pptx")
        deck.save(fallback_path)
        print(f"✅ Presentation saved at: {fallback_path}")

def prepare_image_content(folder_path, content_file_path, source_pdf_path=None, force_refresh=False):
    """
    Prepare image descriptions, using cache if appropriate.

    The cache lives next to the content file as
    ``image_descriptions_cache.json`` and is reused only when its version is
    1, the source PDF hash (if one could be computed) matches, and the image
    folder has not been modified since the cache was written.

    Args:
        folder_path: Path to folder containing extracted images
        content_file_path: Path to content file
        source_pdf_path: Path to source PDF (used for cache invalidation)
        force_refresh: Whether to force regeneration of descriptions

    Returns:
        Dictionary of image descriptions
    """
    # hashlib/time are not imported at file level, so import them here.
    import hashlib
    import time

    # Define a cache file path based on the content file path
    cache_dir = os.path.dirname(content_file_path)
    cache_file = os.path.join(cache_dir, "image_descriptions_cache.json")

    # Get folder modification time to detect new images
    try:
        folder_mtime = os.path.getmtime(folder_path)
    except (OSError, TypeError, ValueError):
        # Missing or invalid folder: fall back to "now". Catching specific
        # errors (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        folder_mtime = time.time()

    # Calculate hash of source PDF if available (to detect new source documents)
    source_hash = None
    if source_pdf_path and os.path.exists(source_pdf_path):
        try:
            # Hash the whole file in chunks: hashing only a fixed-size prefix
            # misses revisions that leave the start of the file untouched.
            digest = hashlib.md5()
            with open(source_pdf_path, 'rb') as f:
                for chunk in iter(lambda: f.read(65536), b""):
                    digest.update(chunk)
            source_hash = digest.hexdigest()
        except Exception as e:
            print(f"Warning: Could not hash source PDF: {e}")

    # Try to load cached descriptions if they exist and we're not forcing refresh
    if os.path.exists(cache_file) and not force_refresh:
        # Cache use is best-effort: any failure here falls through to
        # regeneration below.
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)

            # Check cache validity - needs same source document and folder hasn't been modified
            cache_is_valid = True

            # Check cache version
            if cache_data.get('version', 0) != 1:
                cache_is_valid = False
                print("Cache version mismatch - regenerating")

            # Check if source document changed (skipped when no hash is available)
            if source_hash and cache_data.get('source_hash') != source_hash:
                cache_is_valid = False
                print("Source document changed - regenerating descriptions")

            # Check if image folder was modified after cache creation
            if cache_data.get('folder_mtime', 0) < folder_mtime:
                cache_is_valid = False
                print("Image folder modified - regenerating descriptions")

            # Return descriptions if cache is valid
            if cache_is_valid:
                print(f"Loading cached image descriptions from {cache_file}")
                return cache_data.get('descriptions', {})

        except Exception as e:
            print(f"Error loading cache: {e}. Generating new descriptions.")

    # If no cache, invalid cache, or error, generate new descriptions
    print("Generating new image descriptions...")
    image_descriptions = describe_images_in_folder(folder_path)

    # Save the descriptions to cache for future use (failure is non-fatal)
    try:
        cache_data = {
            'version': 1,  # Cache version for future compatibility
            'timestamp': time.time(),
            'folder_mtime': folder_mtime,
            'source_hash': source_hash,
            'descriptions': image_descriptions
        }

        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, indent=2)
            print(f"Image descriptions cached to {cache_file}")
    except Exception as e:
        print(f"Error saving cache: {e}")

    return image_descriptions

# Main entry point; uses prepare_image_content so image descriptions are cached between runs.
def main(source_pdf_path=None, force_refresh_cache=False):
    """
    Run the full pipeline: image descriptions, slide content, then the deck.

    Args:
        source_pdf_path: Path to source PDF document (optional); forwarded to
            prepare_image_content.
        force_refresh_cache: Whether to force regeneration of image descriptions
    """
    # Step 1: image descriptions (served from cache when still valid)
    print("Preparing image descriptions...")
    descriptions = prepare_image_content(
        image_folder,
        content_file_path,
        source_pdf_path=source_pdf_path,
        force_refresh=force_refresh_cache,
    )

    # Step 2: section text for the slides
    print("Reading presentation content...")
    sections = read_presentation_content(content_file_path)

    # Step 3: assemble and save the PowerPoint file
    print("Creating presentation with images...")
    create_presentation(sections, descriptions)

    print("Done!")

# Script entry point: run the pipeline with main()'s default arguments.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pandl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pandl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pandl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preparing image descriptions...
Image folder modified - regenerating descriptions
Generating new image descriptions...
Processing: image_page10_img1.jpeg
Processing: image_page10_img2.jpeg
Processing: image_page11_img1.jpeg
Processing: image_page11_img2.jpeg
Processing: image_page1_img1.jpeg
Processing: image_page1_img2.jpeg
Processing: image_page1_img3.jpeg
Processing: image_page5_img1.jpeg
Processing: image_page7_img1.jpeg
Processing: image_page8_img3.jpeg
Processing: image_page9_img3.jpeg
Image descriptions cached to C:\Users\pandl\OneDrive\Desktop\FYP\image_descriptions_cache.json
Reading presentation content...
Creating presentation with images...
Selected image 'image_page11_img2.jpeg' for slide 'Introduction'
  Section type: introduction, Score: 4.37
Note: Graph/chart image 'image_page11_img2.jpeg' might not be ideal for 'Introduction', but allowing it.
Selected image 'image_page11_img2.jpeg' for slide 'Introduction'
Adding image from path: C:\Users\pandl\OneDrive\Desktop\FYP\Ex