In [1]:
!pip install transformers
!pip install python-pptx

# ======== 1. CORE DEPENDENCIES (versions pinned) ========
!pip install --upgrade pip

!pip install \
  keybert \
  PyMuPDF \
  transformers \
  python-docx \
  Pillow \
  nltk \
  spacy \
  en-core-web-sm \
  scikit-learn \
  torch \
  torchvision \
  sentence-transformers \
  sentencepiece \
  chromadb \
  langchain \
  pytesseract \
  pdfplumber \
  git+https://github.com/openai/CLIP.git


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-ntzhj_t8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-ntzhj_t8
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone


# Normal Model

In [None]:
import nltk
nltk.download('punkt_tab')

In [None]:
import os
import io
import re
import fitz  # PyMuPDF
import docx
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel, pipeline
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.tokenize import sent_tokenize
from google.colab import files
import chromadb
from chromadb.utils import embedding_functions
from typing import List, Dict, Union
import uuid


# -------------------- Constants --------------------

TEXT_MODEL_ID = 'paraphrase-multilingual-MiniLM-L12-v2'  # For KeyBERT only
IMAGE_MODEL_ID = 'openai/clip-vit-base-patch32'
TITLE_WORD_RANGE = (3, 12)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# -------------------- Document Extraction --------------------

class DocumentExtractor:
    def __init__(self, file_path):
        self.file_path = file_path
        self.extension = os.path.splitext(file_path)[-1].lower()

    def extract_text(self):
        if self.extension == '.pdf':
            return self._extract_text_from_pdf()
        elif self.extension == '.docx':
            return self._extract_text_from_docx()
        else:
            raise ValueError("Unsupported file type")

    def extract_images(self):
        if self.extension == '.pdf':
            return self._extract_images_from_pdf()
        elif self.extension == '.docx':
            return self._extract_images_from_docx()
        else:
            raise ValueError("Unsupported file type")

    def _extract_text_from_pdf(self):
        text = ""
        doc = fitz.open(self.file_path)
        for page in doc:
            text += page.get_text()
        return text

    def _extract_text_from_docx(self):
        doc = docx.Document(self.file_path)
        return "\n".join([para.text for para in doc.paragraphs])

    def _extract_images_from_pdf(self):
        doc = fitz.open(self.file_path)
        images = []
        for page in doc:
            for img in page.get_images(full=True):
                try:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image = Image.open(io.BytesIO(base_image["image"])).convert("RGB")
                    images.append(image)
                except Exception as e:
                    print(f"❌ PDF image error: {e}")
        return images

    def _extract_images_from_docx(self):
        doc = docx.Document(self.file_path)
        images = []
        for rel in doc.part.rels.values():
            if "image" in rel.target_ref:
                try:
                    image = Image.open(io.BytesIO(rel.target_part.blob)).convert("RGB")
                    images.append(image)
                except Exception as e:
                    print(f"❌ DOCX image error: {e}")
        return images


# -------------------- Title Extraction --------------------

class TitleExtractor:
    def __init__(self, lines=None, full_text=None, model_id=None,
                 min_words=3, max_words=12,
                 keyphrase_ngram_range=(2,4), stop_words='english'):
        self.lines = lines or []
        self.full_text = full_text or ""
        self.model_id = model_id
        self.model = KeyBERT(model=model_id) if model_id else KeyBERT()

        self.min_words = min_words
        self.max_words = max_words
        self.keyphrase_ngram_range = keyphrase_ngram_range
        self.stop_words = stop_words

    def _filter_lines(self):
        # Filter lines by length and initial uppercase letter
        return [
            line for line in self.lines
            if self.min_words <= len(line.split()) <= self.max_words and re.match(r"^[A-Z]", line)
        ]

    def extract_candidate_titles(self):
        filtered_lines = self._filter_lines()
        # Batch extraction of keywords from lines
        titles = set()
        for line in filtered_lines:
            keywords = self.model.extract_keywords(
                line,
                keyphrase_ngram_range=self.keyphrase_ngram_range,
                stop_words=self.stop_words,
                top_n=1
            )
            if keywords:
                titles.add(keywords[0][0])
        return list(titles)

    def extract_semantic_titles(self, top_n=10):
        sentences = sent_tokenize(self.full_text)
        joined_text = " ".join(sentences)
        keywords = self.model.extract_keywords(
            joined_text,
            keyphrase_ngram_range=self.keyphrase_ngram_range,
            stop_words=self.stop_words,
            use_maxsum=True,
            top_n=top_n
        )
        return [kw[0] for kw in keywords]


# -------------------- CLIP Text Embedding --------------------

class CLIPTextEmbedder:
    def __init__(self, model_id=IMAGE_MODEL_ID, device=DEVICE):
        self.device = device
        self.model = CLIPModel.from_pretrained(model_id).to(device)
        self.processor = CLIPProcessor.from_pretrained(model_id)

    def encode(self, texts, batch_size=32):
        """Encode text using CLIP text encoder"""
        embeddings = []

        # Process in batches to avoid memory issues
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = self.processor(text=batch_texts, return_tensors="pt", padding=True, truncation=True).to(self.device)

            with torch.no_grad():
                text_features = self.model.get_text_features(**inputs)
                text_features /= text_features.norm(p=2, dim=-1, keepdim=True)
                embeddings.append(text_features.cpu().numpy())

        return np.vstack(embeddings)


# -------------------- Image Embedding --------------------
class ImageEmbedder:
    def __init__(self, model_id=IMAGE_MODEL_ID, device=DEVICE):
        self.device = device
        self.model = CLIPModel.from_pretrained(model_id).to(device)
        self.processor = CLIPProcessor.from_pretrained(model_id)

    def encode(self, images):
        embeddings = []
        for img in images:
            inputs = self.processor(images=img, return_tensors="pt").to(self.device)
            with torch.no_grad():
                features = self.model.get_image_features(**inputs)
                features /= features.norm(p=2, dim=-1, keepdim=True)
                embeddings.append(features.cpu().numpy())
        return np.vstack(embeddings)  # Stack the list of arrays into a single array


#---------------Vector DB-------------------------------

class ChromaDBManager:
    def __init__(self, persist_dir: str = "./chroma_db"):
        self.client = chromadb.PersistentClient(path=persist_dir)

        # Only need collections for content we'll retrieve
        self.text_collection = self.client.get_or_create_collection(
            name="document_texts",
            embedding_function=None  # We'll provide pre-computed embeddings
        )
        self.image_collection = self.client.get_or_create_collection(
            name="document_images",
            embedding_function=None
        )

    def store_texts(self, texts: List[str], embeddings: np.ndarray) -> List[str]:
        """Store text embeddings"""
        ids = [str(uuid.uuid4()) for _ in texts]
        self.text_collection.add(
            ids=ids,
            documents=texts,
            embeddings=embeddings.tolist()
        )
        return ids

    def store_images(self, image_indices: List[int], embeddings: np.ndarray) -> List[str]:
        """Store image embeddings"""
        ids = [str(uuid.uuid4()) for _ in image_indices]
        self.image_collection.add(
            ids=ids,
            # uris=image_indices,
            embeddings=embeddings.tolist(),
            metadatas=[{"index": idx} for idx in image_indices]
        )
        return ids


    def query_texts(self, query_embedding: np.ndarray, n_results: int = 5) -> List[str]:
        """Query similar texts"""
        results = self.text_collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=n_results
        )
        return results['documents'][0] if results['documents'] else []

    def query_images(self, query_embedding: np.ndarray, n_results: int = 3) -> List[int]:
        """Query similar images and return their indices"""
        results = self.image_collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=n_results
        )
        if not results['metadatas']:
            return []

        # Extract indices from metadata
        return [meta['index'] for meta in results['metadatas'][0] if 'index' in meta]

# -------------------- Slide Builder --------------------

class SlideBuilder:
    def __init__(self, titles, title_embeddings, lines, images,
                 db_manager: ChromaDBManager,
                 text_embedder: CLIPTextEmbedder,
                 max_bullets_per_slide=5, overlap_bullets=1,
                 summarization_model="facebook/bart-large-cnn"):

        self.titles = titles
        self.title_embeddings = title_embeddings
        self.lines = lines
        self.images = images
        self.db_manager = db_manager
        self.max_bullets = max_bullets_per_slide
        self.overlap_bullets = overlap_bullets
        self.summarizer = pipeline("summarization", model=summarization_model)
        self.text_embedder = text_embedder

    def _summarize_text(self, text):
        """Summarize text into bullet points"""
        if not text or len(text.strip()) < 50:
            return [text.strip()] if text.strip() else []

        try:
            max_input_length = 1024
            if len(text) > max_input_length:
                text = text[:max_input_length]

            summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
            bullets = [b.strip() for b in re.split(r'[.;]\s*', summary) if b.strip()]
            return bullets if bullets else [summary]

        except Exception as e:
            print(f"Summarization error: {e}")
            return sent_tokenize(text)[:self.max_bullets]

    def build_slides(self):
        """Build slides using ChromaDB for all retrieval operations"""
        slides = []
        previous_bullets = []

        for title, title_emb in zip(self.titles, self.title_embeddings):
            # Query relevant text using title embedding
            relevant_texts = self.db_manager.query_texts(title_emb, n_results=20)
            if not relevant_texts:
                slides.append({
                    'title': title,
                    'bullets': [f"No relevant content found for: {title}"],
                    'images': []
                })
                continue

            # Summarize the relevant text
            combined_text = " ".join(relevant_texts)
            bullets = self._summarize_text(combined_text)

            # Split into slides with overlap
            start_idx = 0
            while start_idx < len(bullets):
              if start_idx > 0 and previous_bullets:
                  overlap = previous_bullets[-self.overlap_bullets:]
              else:
                  overlap = []

              end_idx = start_idx + self.max_bullets
              slide_bullets = overlap + bullets[start_idx:end_idx]
              slides.append({
                  'title': title,
                  'bullets': slide_bullets,
                  'images': self._retrieve_images(title_emb)
              })
              previous_bullets = slide_bullets
              start_idx = end_idx


        return slides

    def _retrieve_images(self, query_embedding):
      image_indices = self.db_manager.query_images(query_embedding, n_results=3)
      return [self.images[idx] for idx in image_indices if idx < len(self.images)]


    def _load_image(self, image_index):
        try:
            if isinstance(image_index, int) and 0 <= image_index < len(self.images):
                return self.images[image_index]
        except Exception as e:
            print(f"Error loading image at index {image_index}: {e}")
        return None


    def _image_exists(self, image_index):
        """Check if image index is valid"""
        return isinstance(image_index, int) and 0 <= image_index < len(self.images)

# -------------------- Pipeline Orchestration --------------------
class DocumentProcessingPipeline:
    def __init__(self, file_path, use_semantic=True, persist_db: bool = True, top_titles=20):
        self.file_path = file_path
        self.extractor = DocumentExtractor(file_path)
        self.text_embedder = CLIPTextEmbedder()
        self.image_embedder = ImageEmbedder()
        self.title_extractor = None
        self.use_semantic = use_semantic
        self.top_titles = top_titles
        self.db_manager = ChromaDBManager() if persist_db else None

    def run(self):
        print(f"Extracting content from {self.file_path}")
        raw_text = self.extractor.extract_text()
        images = self.extractor.extract_images()
        lines = [line.strip() for line in raw_text.split('\n') if line.strip()]

        self.title_extractor = TitleExtractor(lines=lines, full_text=raw_text)
        titles = self.title_extractor.extract_semantic_titles(top_n=self.top_titles) if self.use_semantic else self.title_extractor.extract_candidate_titles()

        # Embed all content
        text_embeddings = self.text_embedder.encode(lines)
        image_embeddings = self.image_embedder.encode(images) if images else []
        title_embeddings = self.text_embedder.encode(titles)

        # Store in ChromaDB if enabled
        if self.db_manager:
            self.db_manager.store_texts(lines, text_embeddings)
            if images:
                image_indices = list(range(len(images)))
                self.db_manager.store_images(image_indices, image_embeddings)

        return {
            "titles": titles,
            "title_embeddings": title_embeddings,
            "lines": lines,
            "images": images,
            "db_manager": self.db_manager,
            "text_embeddings": text_embeddings,
            "image_embeddings": image_embeddings,
            "text_embedder": self.text_embedder,
            "image_embedder": self.image_embedder,
            "title_extractor": self.title_extractor,
            "use_semantic": self.use_semantic,
            "raw_text": raw_text,
        }

# New Model

1. *Download Dependencies*

## spaCy & NLTK

In [2]:
# 1) spaCy model
!python -m spacy download en_core_web_sm

# 2) NLTK data
import nltk, os
nltk_data_path = "/content/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)

for pkg in ["punkt", "stopwords", "wordnet", "averaged_perceptron_tagger"]:
    nltk.download(pkg, download_dir=nltk_data_path)

# Tell NLTK to also look in that folder:
nltk.data.path.append(nltk_data_path)

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /content/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /content/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Important Imports

In [3]:
# === Standard Library ===
import os
import io
import uuid
import json
import re
from typing import List, Dict, Tuple, Any, Union, Optional, Literal
from dataclasses import dataclass
import numpy as np
import google.generativeai as genai
import warnings

# === Third-Party Libraries ===

# NLP and Transformers
from transformers import (
    CLIPProcessor,
    CLIPModel,
    BlipProcessor,
    BlipForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
    T5ForConditionalGeneration,
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
)
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from keybert import KeyBERT

# Deep Learning and Vision
import torch
import clip
from torchvision import transforms
from PIL import Image

# PDF, DOCX, and file handling
import fitz            # PyMuPDF
import docx            # python-docx
import pdfplumber
import pandas as pd    # For table handling
from google.colab import files

# Vector DB and text processing
import chromadb
from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
import collections

2025-06-12 22:46:40.985768: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749768401.008600     575 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749768401.015696     575 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### The Model

In [4]:
# -------------------- Constants --------------------
TEXT_MODEL_ID = 'paraphrase-multilingual-MiniLM-L12-v2'  # For KeyBERT only
IMAGE_MODEL_ID = 'openai/clip-vit-base-patch32'
TITLE_WORD_RANGE = (3, 12)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# -------------------- Document Extraction --------------------
class DocumentExtractor:
    def __init__(self):
        self.supported_formats = {
            '.pdf':  self.extract_from_pdf,
            '.docx': self.extract_from_docx,
        }

    def extract(self, path: str) -> Tuple[List[Dict], List[Dict], List[Dict]]:
        ext = os.path.splitext(path)[1].lower()
        if ext not in self.supported_formats:
            raise ValueError(f"Unsupported format: {ext}")
        return self.supported_formats[ext](path)

    def extract_from_pdf(self, pdf_path: str):
        texts, images, tables = [], [], []
        doc = fitz.open(pdf_path)

        # 1a) text & images via PyMuPDF
        for page_num, page in enumerate(doc, start=1):
            texts.append({
                'page': page_num,
                'content': page.get_text() or ""
            })
            try:
                for img_idx, img in enumerate(page.get_images(full=True), start=1):
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha > 3:
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    data = pix.tobytes("png")
                    pil = Image.open(io.BytesIO(data))
                    images.append({
                        'page': page_num,
                        'image_index': img_idx,
                        'image': pil
                    })
            except Exception as e:
                print(f"Error extracting image on page {page_num}: {str(e)}")
                continue

        doc.close()

        # 1b) tables via pdfplumber
        if pdfplumber:
            with pdfplumber.open(pdf_path) as plumber:
                for page_num, page in enumerate(plumber.pages, start=1):
                    for tbl_idx, raw in enumerate(page.extract_tables(), start=1):
                        tables.append({
                            'page': page_num,
                            'table_index': tbl_idx,
                            'table': raw   # List[List[str]]
                        })

        return texts, images, tables

    def extract_from_docx(self, docx_path: str):
        doc = docx.Document(docx_path)
        texts, images, tables = [], [], []

        # 2a) paragraphs
        for i, para in enumerate(doc.paragraphs, start=1):
            texts.append({'paragraph': i, 'content': para.text})

        # 2b) inline images
        for i, shape in enumerate(doc.inline_shapes, start=1):
            rid = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
            part = doc.part.related_parts[rid]
            pil = Image.open(io.BytesIO(part.blob))
            images.append({'paragraph': None, 'image_index': i, 'image': pil})

        # 2c) tables
        for t_i, table in enumerate(doc.tables, start=1):
            rows = [[cell.text for cell in row.cells] for row in table.rows]
            tables.append({
                'table_index': t_i,
                'table': rows
            })

        return texts, images, tables

# -------------------- Title Extraction --------------------
class TitleExtractor:
    def __init__(self, lines=None, full_text=None, model_id=None,
                 min_words=3, max_words=12,
                 keyphrase_ngram_range=(2,4), stop_words='english'):
        self.lines = lines or []
        self.full_text = full_text or ""
        self.model_id = model_id
        self.model = KeyBERT(model=model_id) if model_id else KeyBERT()

        self.min_words = min_words
        self.max_words = max_words
        self.keyphrase_ngram_range = keyphrase_ngram_range
        self.stop_words = stop_words

    def extract_titles(self, option: Literal['fontbased', 'semantic', 'candidate'], pdf_path: str = None, top_n: int = 10) -> List[str]:
        if option == 'fontbased':
            if not pdf_path:
                raise ValueError("pdf_path is required for fontbased title extraction.")
            return self.extract_titles_fontbased(pdf_path, top_n_fonts=2)
        elif option == 'semantic':
            return self.extract_semantic_titles(top_n=top_n)
        elif option == 'candidate':
            return self.extract_candidate_titles()
        else:
            raise ValueError(f"Invalid title extraction option: {option}")

    def _filter_lines(self):
        # Filter lines by length and initial uppercase letter
        return [
            line for line in self.lines
            if self.min_words <= len(line.split()) <= self.max_words and re.match(r"^[A-Z]", line)
        ]

    def extract_candidate_titles(self):
        filtered_lines = self._filter_lines()
        # Batch extraction of keywords from lines
        titles = set()
        for line in filtered_lines:
            keywords = self.model.extract_keywords(
                line,
                keyphrase_ngram_range=self.keyphrase_ngram_range,
                stop_words=self.stop_words,
                top_n=1
            )
            if keywords:
                titles.add(keywords[0][0])
        return list(titles)

    def extract_semantic_titles(self, top_n=10):
        sentences = sent_tokenize(self.full_text)
        joined_text = " ".join(sentences)
        keywords = self.model.extract_keywords(
            joined_text,
            keyphrase_ngram_range=self.keyphrase_ngram_range,
            stop_words=self.stop_words,
            use_maxsum=True,
            top_n=top_n
        )
        return [kw[0] for kw in keywords]

    def extract_titles_fontbased(self, pdf_path: str, top_n_fonts: int = 2) -> List[str]:

        all_spans = []

        # Single pass: extract all spans with size and content
        with fitz.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf):
                blocks = page.get_text("dict")["blocks"]
                for block in blocks:
                    if "lines" in block:
                        for line in block["lines"]:
                            for span in line["spans"]:
                                text = span["text"].strip()
                                if text:
                                    all_spans.append({
                                        "text": text,
                                        "size": span["size"],
                                        "page": page_num + 1,
                                        "bbox": span["bbox"]
                                    })

        # Identify top N font sizes
        font_sizes = np.array([span["size"] for span in all_spans])
        top_fonts = sorted(np.unique(font_sizes))[-top_n_fonts:]

        # Filter and collect titles
        titles = [
            span["text"] for span in all_spans
            if span["size"] in top_fonts
        ]

        # Optional: sort by page and vertical position
        titles_with_position = [
            (span["text"], span["page"], span["bbox"][1])
            for span in all_spans if span["size"] in top_fonts
        ]
        titles_sorted = sorted(titles_with_position, key=lambda x: (x[1], x[2]))

        # Return just the text
        return [text for text, _, _ in titles_sorted]

#----------------Query constructed--------------
class QueryConstructorLM:
    def __init__(self, model_name="google/flan-t5-large"):
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def reconstruct(self, titles: List[str], max_length=64) -> List[str]:
        """
        Rewrites a list of section titles into detailed search queries.

        Args:
            titles: List of section title strings.
            max_length: Max token length for generated queries.

        Returns:
            List of rewritten queries corresponding to each input title.
        """
        # Prepare prompts for all titles
        prompts = [
            "Convert section titles into research questions:\n"
            "Title: Introduction → What is the background and motivation of this study?\n"
            "Title: Evaluation → How was the model's performance evaluated?\n"
            f"Title: {title} →"
            for title in titles
        ]

        # Tokenize all prompts with padding for batch processing
        inputs = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)

        # Generate outputs in batch
        output_ids = self.model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

        # Decode each generated output
        rewritten_queries = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
        # print(f"reconstructed_queries: {rewritten_queries}\n")
        return rewritten_queries

#-------------------------Chunking------------------
class AdaptiveChunker:
    def __init__(self,
                 base_chunk_size: int = 200,
                 chunk_overlap: int = 50):
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=base_chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def chunk_text(self, texts: List[Dict]) -> List[Dict]:
        """Split text into chunks and assign minimal, non-duplicate metadata."""
        chunks = []
        for ti, text_data in enumerate(texts):
            parts = self.splitter.split_text(text_data['content'])
            for ci, part in enumerate(parts):
                metadata = {
                    'page': text_data.get('page', text_data.get('paragraph')),
                    'chunk_index': ci,
                    'total_chunks': len(parts),
                    'chunk_type': "text"
                }
                chunk = {
                    'text': part.strip(),
                    'metadata': metadata
                }

                chunks.append(chunk)
        return chunks

# -------------------- CLIP Text Embedding --------------------
class CLIPTextEmbedder:
    def __init__(self, model_id=IMAGE_MODEL_ID, device=DEVICE):
        self.device = device
        self.model = CLIPModel.from_pretrained(model_id).to(device)
        self.processor = CLIPProcessor.from_pretrained(model_id)

    def encode(self, texts, batch_size=32):
        """Encode text using CLIP text encoder"""
        embeddings = []

        # Process in batches to avoid memory issues
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = self.processor(text=batch_texts, return_tensors="pt", padding=True, truncation=True).to(self.device)

            with torch.no_grad():
                text_features = self.model.get_text_features(**inputs)
                text_features /= text_features.norm(p=2, dim=-1, keepdim=True)
                embeddings.append(text_features.cpu().numpy())

        return np.vstack(embeddings)

# -------------------- Image Embedding --------------------
class ImageEmbedder:
    def __init__(self, model_id=IMAGE_MODEL_ID, device=DEVICE):
        self.device = device
        self.model = CLIPModel.from_pretrained(model_id).to(device)
        self.processor = CLIPProcessor.from_pretrained(model_id)

    def encode(self, images):
        embeddings = []
        for img in images:
            inputs = self.processor(images=img, return_tensors="pt").to(self.device)
            with torch.no_grad():
                features = self.model.get_image_features(**inputs)
                features /= features.norm(p=2, dim=-1, keepdim=True)
                embeddings.append(features.cpu().numpy())
        return np.vstack(embeddings)  # Stack the list of arrays into a single array

#---------------Vector DB-------------------------------
class ChromaDBManager:
    def __init__(self, persist_dir: str = "./chroma_db"):
        self.client = chromadb.PersistentClient(path=persist_dir)

        # Only need collections for content we'll retrieve
        self.text_collection = self.client.get_or_create_collection(
            name="document_texts",
            embedding_function=None  # We'll provide pre-computed embeddings
        )
        self.image_collection = self.client.get_or_create_collection(
            name="document_images",
            embedding_function=None
        )

    def store_texts(self, texts: List[str], embeddings: np.ndarray) -> List[str]:
        """Store text embeddings"""
        ids = [str(uuid.uuid4()) for _ in texts]
        self.text_collection.add(
            ids=ids,
            documents=texts,
            embeddings=embeddings.tolist()
        )
        return ids

    def store_images(self, image_indices: List[int], embeddings: np.ndarray) -> List[str]:
        """Store image embeddings"""
        ids = [str(uuid.uuid4()) for _ in image_indices]
        self.image_collection.add(
            ids=ids,
            # uris=image_indices,
            embeddings=embeddings.tolist(),
            metadatas=[{"index": idx} for idx in image_indices]
        )
        return ids


    def query_texts(self, query_embedding: np.ndarray, n_results: int = 5) -> List[str]:
        """Query similar texts"""
        results = self.text_collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=n_results
        )
        return results['documents'][0] if results['documents'] else []

    def query_images(self, query_embedding: np.ndarray, n_results: int = 3) -> List[int]:
        """Query similar images and return their indices"""
        results = self.image_collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=n_results
        )
        if not results['metadatas']:
            return []

        # Extract indices from metadata
        return [meta['index'] for meta in results['metadatas'][0] if 'index' in meta]

#--------- Summarizer Class----------------------------------
class Summarizer:
    """
    Handles text summarization using multiple backends and prompt strategies.
    Falls back to a local Hugging Face model if the primary API fails.
    """
    def __init__(self, gemini_api_key: str = "AIzaSyDyh7ZgcDa_FGekrfoNMnRt5UESG_iBgfI", hf_model: str = "facebook/bart-large-cnn"):
        self.hf_summarizer = pipeline("summarization", model=hf_model)
        self.gemini_model = None

        if gemini_api_key:
            try:
                genai.configure(api_key=gemini_api_key)
                self.gemini_model = genai.GenerativeModel('gemini-2.0-flash-lite')
                print("INFO: Google Gemini initialized successfully.")
            except Exception as e:
                warnings.warn(f"Failed to initialize Gemini, will use Hugging Face only. Error: {e}")
        else:
            print("INFO: No Gemini API key provided. Using Hugging Face model as default.")

    def _get_zero_shot_prompt(self, text: str) -> str:
        return f"""
        You are an expert document analyst tasked with creating
        a comprehensive summary of the entire document.
        Summarize the following text into a list of concise, factual bullet points.
        Each bullet point should capture a key piece of information.

        TEXT:
        "{text}"

        GUIDLINES:
        Directly output the bullet points without any introductory sentences like "Here is a summary:

        BULLET POINTS:
        """

    def _get_few_shot_prompt(self, text: str) -> str:
        return f"""
        You are an expert document analyst tasked with creating
        a comprehensive summary of the entire document.
        Summarize the following text into a list of concise, factual bullet points.
        Each bullet point should capture a key piece of information.

        GUIDLINES:
        Directly output the bullet points without any introductory sentences like "Here is a summary:

        ---
        Example 1:
        TEXT: "The sun is a star at the center of the Solar System. It is a nearly perfect ball of hot plasma, heated to incandescence by nuclear fusion reactions in its core."
        BULLET POINTS:
        - The sun is the star at the center of our Solar System.
        - It is a ball of hot plasma.
        - Nuclear fusion in its core generates heat.
        ---
        Example 2:
        TEXT: "Mars is the fourth planet from the Sun and the second-smallest planet in the Solar System, being larger than only Mercury. In English, Mars carries the name of the Roman god of war and is often referred to as the 'Red Planet'."
        BULLET POINTS:
        - Mars is the fourth planet from the Sun.
        - It is the second-smallest planet in the Solar System.
        - It is named after the Roman god of war.
        - It is commonly called the "Red Planet".
        ---
        Your Turn:
        TEXT: "{text}"
        BULLET POINTS:
        """

    def _get_cot_prompt(self, text: str) -> str:
        return f"""
        You are an expert document analyst tasked with creating
        a comprehensive summary of the entire document.
        Summarize the following text into a list of concise, factual bullet points.
        Each bullet point should capture a key piece of information.

        GUIDLINES:
        Directly output the bullet points without any introductory sentences like "Here is a summary:


        Summarize the text below by following these steps:
        1. First, identify the 3-5 most important concepts or key takeaways from the text.
        2. Second, based on those key takeaways, generate a list of concise bullet points that accurately represent the text.

        TEXT:
        "{text}"

        RESPONSE:
        """

    def _to_bullets(self, summary_text: str) -> list[str]:
        """Cleans and splits a block of text into a list of bullet points."""
        # Remove markdown-style bullets and split by newlines or punctuation
        cleaned_text = re.sub(r'^\s*[-*]\s*', '', summary_text, flags=re.MULTILINE)
        bullets = [b.strip() for b in re.split(r'\n|[.;]\s*', cleaned_text) if b.strip()]
        return bullets if bullets else [summary_text]

    def summarize(self, text: str, strategy: str = 'zero-shot') -> list[str]:
        """
        Summarizes text into bullet points. Tries Gemini first, then falls back to HF.

        Args:
            text (str): The text to summarize.
            strategy (str): The prompt strategy to use ('zero-shot', 'few-shot', 'cot').
                            This only applies to the Gemini backend.

        Returns:
            list[str]: A list of bullet points.
        """
        if not text or len(text.strip()) < 50:
            return [text.strip()] if text.strip() else []

        # --- Attempt 1: Use Gemini with specified strategy ---
        if self.gemini_model:
            try:
                print(f"INFO: Attempting summarization with Gemini (strategy: {strategy})...")
                prompt_map = {
                    'zero-shot': self._get_zero_shot_prompt,
                    'few-shot': self._get_few_shot_prompt,
                    'cot': self._get_cot_prompt,
                }
                prompt_func = prompt_map.get(strategy, self._get_zero_shot_prompt)
                prompt = prompt_func(text)

                response = self.gemini_model.generate_content(prompt)
                summary = response.text
                return self._to_bullets(summary)

            except Exception as e:
                warnings.warn(f"Gemini summarization failed: {e}. Falling back to Hugging Face model.")

        # --- Attempt 2: Fallback to Hugging Face ---
        print("INFO: Using fallback Hugging Face model for summarization...")
        try:
            # Truncate text for models with limited input size
            max_input_length = 1024
            truncated_text = text[:max_input_length] if len(text) > max_input_length else text

            summary_result = self.hf_summarizer(truncated_text, max_length=150, min_length=40, do_sample=False)
            summary = summary_result[0]['summary_text']
            return self._to_bullets(summary)

        except Exception as e:
            warnings.warn(f"Hugging Face summarization also failed: {e}")
            # Final fallback: just split the original text into sentences
            from nltk.tokenize import sent_tokenize
            return sent_tokenize(text)[:5]

#----------------SlideBuilder-----------------------------
class SlideBuilder:
    def __init__(self, titles, title_embeddings, lines, images: list, # Type hint clarifies it's a list
                 db_manager: ChromaDBManager,
                 summarizer: Summarizer,
                 max_bullets_per_slide=5, overlap_bullets=1):

        self.titles = titles
        self.title_embeddings = title_embeddings
        self.lines = lines
        self.images = images # This is now a list
        self.db_manager = db_manager
        self.summarizer = summarizer
        self.max_bullets = max_bullets_per_slide
        self.overlap_bullets = overlap_bullets

        # REMOVED: self.image_indices = list(images.keys())
        # This line was incorrect for a list and has been removed.

    def build_slides(self, summarization_strategy: str = 'zero-shot', max_image_occurrences: int = 1):
        """Builds slides with controlled image deduplication."""
        slides = []
        previous_bullets = []
        image_usage_counter = collections.Counter()

        for title, title_emb in zip(self.titles, self.title_embeddings):
            relevant_texts = self.db_manager.query_texts(title_emb, n_results=20)
            if not relevant_texts:
                slides.append({'title': title, 'bullets': [f"No content for: {title}"], 'images': []})
                continue

            combined_text = " ".join(relevant_texts)
            bullets = self.summarizer.summarize(combined_text, strategy=summarization_strategy)

            start_idx = 0
            while start_idx < len(bullets):
                overlap = previous_bullets[-self.overlap_bullets:] if start_idx > 0 and previous_bullets else []
                end_idx = start_idx + self.max_bullets - len(overlap)
                slide_bullets = overlap + bullets[start_idx:end_idx]

                selected_image_indices = self._retrieve_images(
                    query_embedding=title_emb,
                    image_usage_counter=image_usage_counter,
                    max_occurrences=max_image_occurrences
                )

                for img_idx in selected_image_indices:
                    image_usage_counter[img_idx] += 1

                # This line works correctly for a list, e.g., self.images[0]
                slide_images = [self.images[idx] for idx in selected_image_indices]

                slides.append({
                    'title': title,
                    'bullets': slide_bullets,
                    'images': slide_images
                })

                previous_bullets = slide_bullets
                start_idx = end_idx
                if not bullets[start_idx:end_idx]:
                    break
        return slides

    def _retrieve_images(self, query_embedding, image_usage_counter, max_occurrences, n_results=3, candidate_pool_size=20):
        """Retrieves relevant and non-overused images from a list."""
        selected_indices = []
        candidate_indices = self.db_manager.query_images(query_embedding, n_results=candidate_pool_size)

        for idx in candidate_indices:
            if len(selected_indices) >= n_results:
                break

            # The logic relies on the corrected _image_exists method
            if self._image_exists(idx) and image_usage_counter[idx] < max_occurrences:
                if idx not in selected_indices:
                    selected_indices.append(idx)

        return selected_indices

    # --- THIS IS THE CRITICAL CHANGE ---
    def _image_exists(self, image_index):
        """Checks if an image index is valid for the images list."""
        # An index is valid if it's an integer and falls within the list's bounds.
        return isinstance(image_index, int) and 0 <= image_index < len(self.images)

# -------------------- Pipeline Orchestration --------------------
class DocumentProcessingPipeline:
    def __init__(
        self,
        file_path,
        title_option: Literal['fontbased', 'semantic', 'candidate'] = 'fontbased',
        persist_db: bool = True,
        top_titles: int = 20,
        extractor: Optional[DocumentExtractor] = None,
        extractor_config: dict = None,
        query_constructor: Optional[QueryConstructorLM] = None,
        query_constructor_config: dict = None,
        text_embedder: Optional[CLIPTextEmbedder] = None,
        text_embedder_config: dict = None,
        image_embedder: Optional[ImageEmbedder] = None,
        image_embedder_config: dict = None,
        chunker: Optional[AdaptiveChunker] = None,
        chunker_config: dict = None,
        db_manager: Optional[ChromaDBManager] = None,
        db_manager_config: dict = None,
        summarizer: Optional[Summarizer] = None,
        summarizer_config: dict = None,
    ):
        self.file_path = file_path
        self.title_option = title_option
        self.top_titles = top_titles

        self.extractor = extractor or DocumentExtractor(**(extractor_config or {}))
        self.query_constructor = query_constructor or QueryConstructorLM(**(query_constructor_config or {}))
        self.text_embedder = text_embedder or CLIPTextEmbedder(**(text_embedder_config or {}))
        self.image_embedder = image_embedder or ImageEmbedder(**(image_embedder_config or {}))
        self.chunker = chunker or AdaptiveChunker(**(chunker_config or {}))
        self.db_manager = db_manager or (ChromaDBManager(**(db_manager_config or {})) if persist_db else None)
        self.summarizer = summarizer or Summarizer(**(summarizer_config or {}))

    def run(self):
          print(f"Extracting content from {self.file_path}")

          raw_text, images, tables = self.extractor.extract(self.file_path)
          raw_text_cp = raw_text.copy()
          raw_text =  [item["content"] for item in raw_text]
          raw_text = "".join(raw_text)

          images =  [item["image"] for item in images]



          # lines = [line.strip() for line in raw_text.split('\n') if line.strip()]       # for chunking of lines
          chunks = self.chunker.chunk_text(raw_text_cp)
          lines = [item['text'] for item in chunks]                                     # for Recursive(Adaptive) chunks



          self.title_extractor = TitleExtractor(lines=lines, full_text=raw_text)
          titles = self.title_extractor.extract_titles(
              option=self.title_option,
              pdf_path=self.file_path,
              top_n=self.top_titles
          )



          # constructed_queries = self.query_constructor.reconstruct(
          #     [title for title in titles]
          # )




          # Embed all content
          text_embeddings = self.text_embedder.encode(lines)
          image_embeddings = self.image_embedder.encode(images) if images else []
          title_embeddings = self.text_embedder.encode(titles)

          # Store in ChromaDB if enabled
          if self.db_manager:
              self.db_manager.store_texts(lines, text_embeddings)
              if images:
                  image_indices = list(range(len(images)))
                  self.db_manager.store_images(image_indices, image_embeddings)

          return {
              "titles": [title.capitalize() for title in titles],
              # "constructed_queries": constructed_queries,
              "title_embeddings": title_embeddings,
              "lines": lines,
              "images": images,
              "db_manager": self.db_manager,
              "text_embeddings": text_embeddings,
              "image_embeddings": image_embeddings,
              "text_embedder": self.text_embedder,
              "image_embedder": self.image_embedder,
              "title_extractor": self.title_extractor,
              "title_option": self.title_option,
              "raw_text": raw_text,
              "tables": tables,
              "chunks": chunks,
              "summarizer": self.summarizer

          }

In [5]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Presentation Builder

In [6]:
# very good deep
import os
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from PIL import Image
import textwrap

class PresentationBuilder:
    def __init__(self, title="Presentation Title", template_path=None, font_name="Arial", font_size=20):
        self.title = title
        self.presentation = Presentation(template_path) if template_path else Presentation()

        # Set standard 16:9 size (13.333" x 7.5") if no template
        if not template_path:
            self.presentation.slide_width = Inches(13.333)
            self.presentation.slide_height = Inches(7.5)

        self.font_name = font_name
        self.font_size = Pt(font_size)
        self.title_font_size = Pt(font_size + 10)
        self.slide_width = self.presentation.slide_width
        self.slide_height = self.presentation.slide_height

        # Margins for 16:9 slides
        self.left_margin = Inches(1.0)
        self.right_margin = Inches(1.0)
        self.top_margin = Inches(0.5)
        self.bottom_margin = Inches(0.5)

        # Increased title area for image-only slides
        self.title_height = Inches(1.5)  # Increased from 1.0 to prevent overlap

        self._add_title_slide()

    def _add_title_slide(self):
        slide_layout = self.presentation.slide_layouts[0]
        slide = self.presentation.slides.add_slide(slide_layout)
        title = slide.shapes.title
        title.text = self.title
        title.text_frame.paragraphs[0].font.size = Pt(44)
        title.text_frame.paragraphs[0].font.name = self.font_name
        title.text_frame.paragraphs[0].font.bold = True

    def _calculate_text_height(self, bullets):
        if not bullets:
            return Inches(0)

        avg_line_height = self.font_size.pt * 1.2 / 72  # in inches
        total_lines = sum(max(1, len(textwrap.wrap(bullet, width=70))) for bullet in bullets)
        return Inches(total_lines * avg_line_height + len(bullets) * 0.1)

    def _add_bullet_points(self, slide, bullets):
        left = self.left_margin
        top = self.top_margin + self.title_height
        width = self.slide_width - self.left_margin - self.right_margin
        height = self._calculate_text_height(bullets)

        textbox = slide.shapes.add_textbox(left, top, width, height)
        tf = textbox.text_frame
        tf.word_wrap = True

        for idx, bullet in enumerate(bullets):
            p = tf.add_paragraph() if idx > 0 else tf.paragraphs[0]
            p.text = bullet
            p.level = 0
            p.font.name = self.font_name
            p.font.size = self.font_size
            p.space_after = Pt(12)

        return top + height

    def _calculate_image_size(self, img, max_width, max_height):
        img_width, img_height = img.size
        img_ratio = img_width / img_height

        max_width_px = max_width * 96
        max_height_px = max_height * 96

        if (max_width_px / img_ratio) <= max_height_px:
            width = max_width_px
            height = width / img_ratio
        else:
            height = max_height_px
            width = height * img_ratio

        return Inches(width / 96), Inches(height / 96)

    def _add_images_to_slide(self, slide, images, content_bottom, slide_title=""):
        available_width = (self.slide_width - self.left_margin - self.right_margin) / Inches(1)
        available_height = (self.slide_height - content_bottom - self.bottom_margin) / Inches(1)

        images_added = 0
        current_top = content_bottom + Inches(0.2)

        for idx, img in enumerate(images):
            width, height = self._calculate_image_size(img, available_width, available_height)

            if current_top + height > self.slide_height - self.bottom_margin:
                self._create_image_slides(images[idx:], slide_title)
                break

            left_pos = (self.slide_width - width) / 2

            temp_path = f"temp_img_{idx}.png"
            try:
                img.save(temp_path)
                slide.shapes.add_picture(
                    temp_path,
                    left_pos,
                    current_top,
                    width,
                    height
                )
                images_added += 1
                current_top += height + Inches(0.2)
            except Exception as e:
                print(f"Error adding image: {e}")
            finally:
                if os.path.exists(temp_path):
                    os.remove(temp_path)

        return images_added

    def _create_image_slides(self, images, base_title):
        """Create dedicated slides for images with expanded title area"""
        # Calculate available space for image (with increased title area)
        content_top = self.top_margin + self.title_height
        max_width = (self.slide_width - self.left_margin - self.right_margin) / Inches(1)
        max_height = (self.slide_height - content_top - self.bottom_margin) / Inches(1)

        for idx, img in enumerate(images):
            slide = self.presentation.slides.add_slide(self.presentation.slide_layouts[5])

            # Add title with expanded space
            title = f"{base_title} - Image {idx+1}" if base_title else f"Image {idx+1}"
            self._add_expanded_slide_title(slide, title)

            # Calculate image size
            width, height = self._calculate_image_size(img, max_width, max_height)

            # Center image in remaining space
            left = (self.slide_width - width) / 2
            top = content_top + (self.slide_height - content_top - height - self.bottom_margin) / 2

            # Add image
            temp_path = f"temp_full_img_{idx}.png"
            try:
                img.save(temp_path)
                slide.shapes.add_picture(
                    temp_path,
                    left,
                    top,
                    width,
                    height
                )
            except Exception as e:
                print(f"Error adding full image: {e}")
            finally:
                if os.path.exists(temp_path):
                    os.remove(temp_path)

    def _add_expanded_slide_title(self, slide, title):
        """Add title with more vertical space on image-only slides"""
        title_shape = slide.shapes.add_textbox(
            left=self.left_margin,
            top=self.top_margin,
            width=self.slide_width - self.left_margin - self.right_margin,
            height=self.title_height  # Using the increased height
        )
        tf = title_shape.text_frame
        p = tf.paragraphs[0]
        p.text = title
        p.font.name = self.font_name
        p.font.size = self.title_font_size
        p.font.bold = True
        p.alignment = 1  # Center aligned

    def _add_slide_title(self, slide, title):
        if hasattr(slide.shapes, 'title'):
            title_shape = slide.shapes.title
            title_shape.text = title
            tf = title_shape.text_frame
            p = tf.paragraphs[0]
            p.font.name = self.font_name
            p.font.size = self.title_font_size
            p.font.bold = True
        else:
            self._add_expanded_slide_title(slide, title)

    def add_slide(self, slide_data):
        title = slide_data.get("title", "")
        bullets = slide_data.get("bullets", [])
        images = slide_data.get("images", [])

        pil_images = []
        for img in images:
            if isinstance(img, str):
                try:
                    pil_images.append(Image.open(img))
                except Exception as e:
                    print(f"Error loading image {img}: {e}")
            elif isinstance(img, Image.Image):
                pil_images.append(img)

        layout = 1 if bullets else 5
        slide = self.presentation.slides.add_slide(self.presentation.slide_layouts[layout])
        self._add_slide_title(slide, title)

        content_bottom = self.top_margin + self.title_height
        if bullets:
            content_bottom = self._add_bullet_points(slide, bullets)

        if pil_images:
            added = self._add_images_to_slide(slide, pil_images, content_bottom, title)
            # print(f"Added {added} image(s) to slide")

    def build_from_list(self, slides):
        for slide_data in slides:
            self.add_slide(slide_data)

    def save(self, filename="output_presentation.pptx"):
        self.presentation.save(filename)
        print(f"Presentation saved to {filename}")

## Example

#### Extract titles Phase

In [7]:
# # Usage example - now both will be 512-dimensional
# file_path = "/kaggle/input/sample1/2107.07382v1.pdf" # or "your_document.docx"


# doc_pipeline = DocumentProcessingPipeline(file_path, "semantic")
# results = doc_pipeline.run()

# # Both text_embeddings and title_embeddings will now be 512-dimensional from CLIP
# title_embeddings = doc_pipeline.text_embedder.encode(results["titles"])
# print(f"Embedded {len(results['titles'])} titles.")



#### Slides JSON Format for Selected Titles

In [8]:
# slide_builder = SlideBuilder(
#     titles=[title.capitalize() for title in results["titles"]],
#     # titles=results["constructed_queries"],
#     lines=results["lines"],
#     images=results["images"],
#     db_manager=results["db_manager"],
#     # text_embedder = results["text_embedder"],
#     title_embeddings = results["title_embeddings"],
#     summarizer = results["summarizer"],
#     # title_embeddings = results["text_embedder"].encode(results["constructed_queries"]),
# )

# slides = slide_builder.build_slides(summarization_strategy="cot", max_image_occurrences=1)

#### Build the pptx file using the slides generated

In [9]:
# builder = PresentationBuilder(
#         title="LLM survey",
#         font_name="Arial",
#         font_size=20,
#         # template_path="/kaggle/working/template_2.pptx"
#     )

# builder.build_from_list(slides)
# builder.save("/kaggle/working/LLM_survey.pptx")

# GA

In [10]:
import random
import copy
from typing import List, Dict, Any, Tuple
from collections import defaultdict

import tempfile
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
from pptx.enum.shapes import MSO_SHAPE_TYPE
from pptx.dml.color import RGBColor
from pptx.enum.dml import MSO_THEME_COLOR

class SlideLayout:
    """Represents a single slide layout (chromosome)"""

    def __init__(self, genes: Dict[str, Any] = None):
        if genes is None:
            self.genes = self._generate_random_genes()
        else:
            self.genes = genes
        self.fitness_score = 0

    def _generate_random_genes(self) -> Dict[str, Any]:
        """Generate random layout genes"""
        return {
            "title_position": random.choice(["top", "center", "left", "right"]),
            "bullet_columns": random.choice([1, 2]),
            "bullet_font_size": random.randint(16, 36),
            "image_position": random.choice(["left", "right", "top", "bottom", "grid", "none"]),
            "image_size": random.choice(["small", "medium", "large"]),
            "image_layout": random.choice(["single", "grid", "horizontal", "vertical"]),
            "theme": random.choice(["default", "dark", "light", "modern"]),
            "margin_size": random.choice(["small", "medium", "large"]),
            "title_font_size": random.randint(24, 48),
            "background_color": random.choice(["white", "light_gray", "dark_blue", "light_blue"]),
            "text_color": random.choice(["black", "white", "dark_gray", "blue"]),
            "image_text_balance": random.choice(["text_heavy", "balanced", "image_heavy"])
        }

    def mutate(self, mutation_rate: float = 0.1):
        """Mutate the layout genes"""
        gene_options = {
            "title_position": ["top", "center", "left", "right"],
            "bullet_columns": [1, 2],
            "bullet_font_size": list(range(16, 37)),
            "image_position": ["left", "right", "top", "bottom", "grid", "none"],
            "image_size": ["small", "medium", "large"],
            "image_layout": ["single", "grid", "horizontal", "vertical"],
            "theme": ["default", "dark", "light", "modern"],
            "margin_size": ["small", "medium", "large"],
            "title_font_size": list(range(24, 49)),
            "background_color": ["white", "light_gray", "dark_blue", "light_blue"],
            "text_color": ["black", "white", "dark_gray", "blue"],
            "image_text_balance": ["text_heavy", "balanced", "image_heavy"]
        }

        for gene_name in self.genes:
            if random.random() < mutation_rate:
                self.genes[gene_name] = random.choice(gene_options[gene_name])

    def crossover(self, other: 'SlideLayout') -> 'SlideLayout':
        """Create offspring through crossover"""
        child_genes = {}
        for gene_name in self.genes:
            child_genes[gene_name] = random.choice([self.genes[gene_name], other.genes[gene_name]])
        return SlideLayout(child_genes)

    def __str__(self):
        return f"Layout(fitness={self.fitness_score:.2f}, genes={self.genes})"


class ImageAnalyzer:
    """Utility class for analyzing PIL Image objects"""

    @staticmethod
    def get_image_properties(image: Image.Image) -> Dict[str, Any]:
        """Extract properties from PIL Image"""
        width, height = image.size
        aspect_ratio = width / height

        return {
            'width': width,
            'height': height,
            'aspect_ratio': aspect_ratio,
            'orientation': 'landscape' if aspect_ratio > 1.2 else 'portrait' if aspect_ratio < 0.8 else 'square',
            'total_pixels': width * height,
            'is_large': (width * height) > 500000,  # > 0.5MP
            'is_wide': aspect_ratio > 1.5,
            'is_tall': aspect_ratio < 0.7
        }

    @staticmethod
    def analyze_image_collection(images: List[Image.Image]) -> Dict[str, Any]:
        """Analyze a collection of images"""
        if not images:
            return {
                'count': 0,
                'has_images': False,
                'mixed_orientations': False,
                'avg_aspect_ratio': 1.0,
                'total_area': 0,
                'complexity_score': 0
            }

        properties = [ImageAnalyzer.get_image_properties(img) for img in images]
        orientations = [prop['orientation'] for prop in properties]
        aspect_ratios = [prop['aspect_ratio'] for prop in properties]
        total_pixels = sum(prop['total_pixels'] for prop in properties)

        return {
            'count': len(images),
            'has_images': True,
            'mixed_orientations': len(set(orientations)) > 1,
            'avg_aspect_ratio': np.mean(aspect_ratios),
            'dominant_orientation': max(set(orientations), key=orientations.count),
            'total_area': total_pixels,
            'has_large_images': any(prop['is_large'] for prop in properties),
            'complexity_score': len(images) + (0.5 if len(set(orientations)) > 1 else 0)
        }


class SlideContent:
    """Represents slide content with PIL Images"""

    def __init__(self, title: str, bullets: List[str], images: List[Image.Image]):
        self.title = title
        self.bullets = bullets
        self.images = images
        self.word_count = len(title.split()) + sum(len(bullet.split()) for bullet in bullets)

        # Analyze images
        self.image_analysis = ImageAnalyzer.analyze_image_collection(images)

        # Content density metrics
        self.content_density = self._calculate_content_density()
        self.text_image_ratio = self._calculate_text_image_ratio()

    def _calculate_content_density(self) -> float:
        """Calculate overall content density"""
        text_density = self.word_count / 100.0  # Normalize
        image_density = self.image_analysis['complexity_score']
        return text_density + image_density

    def _calculate_text_image_ratio(self) -> str:
        """Determine if content is text-heavy, image-heavy, or balanced"""
        if not self.images:
            return "text_only"
        elif len(self.images) >= 3 or self.image_analysis['total_area'] > 1000000:
            return "image_heavy"
        elif self.word_count > 100:
            return "text_heavy"
        else:
            return "balanced"


class FitnessEvaluator:
    """Handles fitness evaluation for slide layouts"""

    def __init__(self):
        self.color_contrast_map = {
            ("black", "white"): 1.0,
            ("black", "light_gray"): 0.8,
            ("white", "dark_blue"): 0.9,
            ("white", "light_blue"): 0.6,
            ("dark_gray", "white"): 0.9,
            ("blue", "white"): 0.8,
            ("black", "dark_blue"): 0.3,  # Poor contrast
            ("white", "light_gray"): 0.4   # Poor contrast
        }

        # Slide dimensions (standard 16:9 aspect ratio) in inches
        self.slide_width = 10.0  # inches
        self.slide_height = 7.5  # inches

        # Margin size mappings (in inches)
        self.margin_sizes = {
            'small': 0.3,
            'medium': 0.5,
            'large': 0.8
        }

        # Font size to height conversion (approximate)
        self.font_height_ratio = 1.2  # font height = font_size * ratio
        self.points_to_inches = 1/72  # 72 points = 1 inch

        # Position mappings for layout calculation
        self.title_positions = {
            'top': {'x': 0.5, 'y': 0.1},  # centered at top
            'center': {'x': 0.5, 'y': 0.5},  # centered
            'left': {'x': 0.2, 'y': 0.2},  # left side
            'right': {'x': 0.8, 'y': 0.2}  # right side
        }

        # Image position boundaries
        self.image_positions = {
            'left': {'x': 0.0, 'y': 0.3, 'max_width': 0.4},
            'right': {'x': 0.6, 'y': 0.3, 'max_width': 0.4},
            'top': {'x': 0.1, 'y': 0.0, 'max_height': 0.4},
            'bottom': {'x': 0.1, 'y': 0.6, 'max_height': 0.4},
            'grid': {'x': 0.1, 'y': 0.3, 'max_width': 0.8, 'max_height': 0.6}
        }

        # Size mappings for images (as fraction of slide)
        self.image_size_ratios = {
            'small': 0.2,
            'medium': 0.35,
            'large': 0.5
        }

    def evaluate_fitness(self, layout: SlideLayout, content: SlideContent,
                        previous_layout: SlideLayout = None) -> float:
        """Comprehensive fitness evaluation"""
        score = 100  # Start with base score

        # 1. Text Readability
        score += self._evaluate_text_readability(layout, content)

        # 2. Bullet List Management
        score += self._evaluate_bullet_management(layout, content)

        # 3. Image Handling
        score += self._evaluate_image_handling_advanced(layout, content)

        # 4. Visual Balance
        score += self._evaluate_visual_balance_advanced(layout, content)

        # 5. Color Contrast
        score += self._evaluate_color_contrast(layout)

        # 6. Consistency
        score += self._evaluate_consistency(layout, previous_layout)

        # 7. Content Appropriateness
        score += self._evaluate_content_appropriateness(layout, content)

        # 8. Accessibility
        score += self._evaluate_accessibility(layout)

        # 9. Image-Text Balance
        score += self._evaluate_image_text_balance(layout, content)

        # 10. Boundary Constraints
        score += self._evaluate_boundary_constraints(layout, content)

        return max(0, score)  # Ensure non-negative score

    def _evaluate_text_readability(self, layout: SlideLayout, content: SlideContent) -> float:
        """Evaluate text readability factors"""
        score = 0

        # Bullet font size
        bullet_size = layout.genes['bullet_font_size']
        if 16 <= bullet_size <= 28:
            score += 10
        elif 13 <= bullet_size <= 32:
            score += 5
        else:
            score -= 15

        # Title font size
        title_size = layout.genes['title_font_size']
        if 30 <= title_size <= 40:
            score += 8
        elif 24 <= title_size <= 44:
            score += 4
        else:
            score -= 10

        # Font size relationship (title should be larger than bullets)
        if title_size > bullet_size:
            score += 5
        else:
            score -= 8

        return score

    def _evaluate_bullet_management(self, layout: SlideLayout, content: SlideContent) -> float:
        """Evaluate bullet point organization"""
        score = 0
        num_bullets = len(content.bullets)
        columns = layout.genes['bullet_columns']

        # Optimal column usage
        if num_bullets <= 4:
            if columns == 1:
                score += 10
            else:
                score -= 5
        elif 5 <= num_bullets <= 8:
            if columns == 2:
                score += 12
            else:
                score -= 3
        else:  # >8 bullets
            if columns == 2:
                score += 8
            else:
                score -= 10

        # Bullet density penalty
        avg_bullet_length = np.mean([len(bullet.split()) for bullet in content.bullets])
        if avg_bullet_length > 15 and columns == 1:
            score -= 8

        # Too many bullets penalty
        if num_bullets > 7:
            score -= (num_bullets - 7) * 2

        return score

    def _evaluate_image_handling_advanced(self, layout: SlideLayout, content: SlideContent) -> float:
        """Advanced image handling evaluation using PIL image analysis"""
        score = 0
        img_analysis = content.image_analysis

        # Basic image presence handling
        if img_analysis['has_images'] and layout.genes['image_position'] != "none":
            score += 15
        elif not img_analysis['has_images'] and layout.genes['image_position'] == "none":
            score += 10
        elif not img_analysis['has_images'] and layout.genes['image_position'] != "none":
            score -= 25
        else:
            score -= 15

        # Advanced image-specific evaluations
        if img_analysis['has_images']:
            image_count = img_analysis['count']

            # Image layout appropriateness
            if image_count == 1:
                if layout.genes['image_layout'] == "single":
                    score += 10
                else:
                    score -= 5
            elif image_count == 2:
                if layout.genes['image_layout'] in ["horizontal", "vertical"]:
                    score += 8
                else:
                    score -= 3
            elif image_count >= 3:
                if layout.genes['image_layout'] == "grid":
                    score += 12
                else:
                    score -= 6

            # Image size based on orientation and count
            if img_analysis['dominant_orientation'] == 'landscape':
                if layout.genes['image_position'] in ["top", "bottom"]:
                    score += 6
                elif layout.genes['image_position'] in ["left", "right"] and layout.genes['image_size'] != "large":
                    score += 4
            elif img_analysis['dominant_orientation'] == 'portrait':
                if layout.genes['image_position'] in ["left", "right"]:
                    score += 6
                elif layout.genes['image_position'] in ["top", "bottom"] and layout.genes['image_size'] == "small":
                    score += 4

            # Mixed orientations penalty for simple layouts
            if img_analysis['mixed_orientations'] and layout.genes['image_layout'] != "grid":
                score -= 8

            # Large images consideration
            if img_analysis['has_large_images']:
                if layout.genes['image_size'] == "large" and len(content.bullets) <= 3:
                    score += 8
                elif layout.genes['image_size'] == "small":
                    score -= 6

        return score

    def _evaluate_visual_balance_advanced(self, layout: SlideLayout, content: SlideContent) -> float:
        """Enhanced visual balance considering actual image properties"""
        score = 0
        img_analysis = content.image_analysis

        # Basic margin and spacing
        margin = layout.genes['margin_size']
        content_density = content.content_density

        if content_density <= 2:
            if margin in ["medium", "large"]:
                score += 8
        elif content_density <= 5:
            if margin == "medium":
                score += 10
        else:
            if margin == "small":
                score += 6
            else:
                score -= 5

        # Image-text balance based on actual content
        balance = layout.genes['image_text_balance']
        actual_ratio = content.text_image_ratio

        if actual_ratio == "text_only" and balance == "text_heavy":
            score += 10
        elif actual_ratio == "image_heavy" and balance == "image_heavy":
            score += 12
        elif actual_ratio == "balanced" and balance == "balanced":
            score += 15
        else:
            score -= 8

        # Title position considering image presence and position
        title_pos = layout.genes['title_position']
        if img_analysis['has_images']:
            img_pos = layout.genes['image_position']
            if title_pos == "top" and img_pos in ["left", "right", "bottom"]:
                score += 8
            elif title_pos == "center" and img_pos in ["top", "bottom"]:
                score += 6
        else:
            if title_pos == "top":
                score += 8

        return score

    def _evaluate_color_contrast(self, layout: SlideLayout) -> float:
        """Evaluate color contrast for readability"""
        score = 0

        text_color = layout.genes['text_color']
        bg_color = layout.genes['background_color']

        contrast_key = (text_color, bg_color)
        if contrast_key in self.color_contrast_map:
            contrast_ratio = self.color_contrast_map[contrast_key]
            if contrast_ratio >= 0.8:
                score += 12
            elif contrast_ratio >= 0.6:
                score += 6
            else:
                score -= 15
        else:
            # Unknown combination, assume poor contrast
            score -= 10

        return score

    def _evaluate_consistency(self, layout: SlideLayout, previous_layout: SlideLayout) -> float:
        """Evaluate consistency with previous slide"""
        if previous_layout is None:
            return 0

        score = 0

        # Theme consistency
        if layout.genes['theme'] == previous_layout.genes['theme']:
            score += 10

        # Title position consistency
        if layout.genes['title_position'] == previous_layout.genes['title_position']:
            score += 5

        # Color scheme consistency
        if (layout.genes['background_color'] == previous_layout.genes['background_color'] and
            layout.genes['text_color'] == previous_layout.genes['text_color']):
            score += 6

        return score

    def _evaluate_content_appropriateness(self, layout: SlideLayout, content: SlideContent) -> float:
        """Evaluate if layout matches content type and density"""
        score = 0

        # Content density vs layout complexity
        total_content = len(content.title.split()) + sum(len(b.split()) for b in content.bullets)

        if total_content < 30:  # Light content
            if layout.genes['margin_size'] in ["medium", "large"]:
                score += 6
        elif total_content > 80:  # Heavy content
            if (layout.genes['bullet_columns'] == 2 and
                layout.genes['margin_size'] == "small"):
                score += 8
            else:
                score -= 5

        # Title length vs title font size
        title_words = len(content.title.split())
        title_font = layout.genes['title_font_size']

        if title_words > 6 and title_font > 36:
            score -= 6  # Large font for long title
        elif title_words <= 3 and title_font >= 32:
            score += 4  # Good emphasis for short title

        return score

    def _evaluate_accessibility(self, layout: SlideLayout) -> float:
        """Evaluate accessibility considerations"""
        score = 0

        # Minimum font sizes for accessibility
        if layout.genes['bullet_font_size'] >= 16:
            score += 5
        if layout.genes['title_font_size'] >= 28:
            score += 3

        # High contrast combinations get bonus
        text_color = layout.genes['text_color']
        bg_color = layout.genes['background_color']

        high_contrast_pairs = [
            ("black", "white"), ("white", "dark_blue"), ("dark_gray", "white")
        ]

        if (text_color, bg_color) in high_contrast_pairs:
            score += 8

        return score

    def _evaluate_image_text_balance(self, layout: SlideLayout, content: SlideContent) -> float:
        """Evaluate the balance between images and text"""
        score = 0
        img_analysis = content.image_analysis

        if not img_analysis['has_images']:
            return 0

        # Check if layout respects content type
        balance_setting = layout.genes['image_text_balance']
        actual_balance = content.text_image_ratio

        # Reward matching balance settings
        if actual_balance == "text_heavy" and balance_setting == "text_heavy":
            score += 12
        elif actual_balance == "image_heavy" and balance_setting == "image_heavy":
            score += 12
        elif actual_balance == "balanced" and balance_setting == "balanced":
            score += 15
        else:
            score -= 6

        # Consider image complexity
        complexity = img_analysis['complexity_score']
        if complexity > 3 and balance_setting != "image_heavy":
            score -= 8
        elif complexity < 1.5 and balance_setting == "image_heavy":
            score -= 5

        return score

    def _evaluate_boundary_constraints(self, layout: SlideLayout, content: SlideContent) -> float:
        """Evaluate if content stays within slide boundaries"""
        score = 0
        penalties = 0

        # Get layout parameters
        margin = self.margin_sizes[layout.genes['margin_size']]

        # Calculate usable area
        usable_width = self.slide_width - (2 * margin)
        usable_height = self.slide_height - (2 * margin)

        # 1. Check title boundaries
        title_penalty = self._check_title_boundaries(layout, content, margin, usable_width, usable_height)
        penalties += title_penalty

        # 2. Check bullet text boundaries
        bullet_penalty = self._check_bullet_boundaries(layout, content, margin, usable_width, usable_height)
        penalties += bullet_penalty

        # 3. Check image boundaries
        if content.image_analysis['has_images']:
            image_penalty = self._check_image_boundaries(layout, content, margin, usable_width, usable_height)
            penalties += image_penalty

        # Apply penalties (negative score for boundary violations)
        score -= penalties

        # Bonus for good boundary management
        if penalties == 0:
            score += 10  # Perfect boundary adherence bonus
        elif penalties < 5:
            score += 5   # Minor violations only

        return score

    def _check_title_boundaries(self, layout: SlideLayout, content: SlideContent,
                               margin: float, usable_width: float, usable_height: float) -> float:
        """Check if title stays within boundaries"""
        penalty = 0

        # Get title properties
        title_font_size = layout.genes['title_font_size']
        title_height = (title_font_size * self.font_height_ratio * self.points_to_inches)
        title_chars = len(content.title)

        # Estimate title width (rough approximation: avg 0.6 of height per character)
        title_width = title_chars * (title_font_size * 0.6 * self.points_to_inches)

        # Get title position
        title_pos = layout.genes['title_position']
        pos_data = self.title_positions[title_pos]

        # Calculate actual position
        title_x = margin + (pos_data['x'] * usable_width)
        title_y = margin + (pos_data['y'] * usable_height)

        # Check horizontal boundaries
        if title_pos in ['center', 'top']:
            # Centered text
            left_edge = title_x - (title_width / 2)
            right_edge = title_x + (title_width / 2)
        elif title_pos == 'left':
            left_edge = title_x
            right_edge = title_x + title_width
        else:  # right
            left_edge = title_x - title_width
            right_edge = title_x

        # Check if title exceeds boundaries
        if left_edge < margin:
            penalty += abs(left_edge - margin) * 10  # 10 points per inch outside
        if right_edge > (self.slide_width - margin):
            penalty += abs(right_edge - (self.slide_width - margin)) * 10

        # Check vertical boundaries
        if title_y + title_height > (self.slide_height - margin):
            penalty += abs(title_y + title_height - (self.slide_height - margin)) * 10

        # Extra penalty for very long titles with large fonts
        if title_width > usable_width * 0.9 and title_font_size > 36:
            penalty += 15  # Encourage smaller font for long titles

        return penalty

    def _check_bullet_boundaries(self, layout: SlideLayout, content: SlideContent,
                                margin: float, usable_width: float, usable_height: float) -> float:
        """Check if bullet points stay within boundaries"""
        penalty = 0

        if not content.bullets:
            return 0

        # Get bullet properties
        bullet_font_size = layout.genes['bullet_font_size']
        bullet_height = (bullet_font_size * self.font_height_ratio * self.points_to_inches)
        num_columns = layout.genes['bullet_columns']

        # Calculate space needed for bullets
        bullets_per_column = len(content.bullets) / num_columns
        total_bullet_height = bullets_per_column * bullet_height * 1.5  # 1.5 for line spacing

        # Reserve space for title
        title_space = 1.5  # inches
        available_height = usable_height - title_space

        # Check if bullets exceed vertical space
        if total_bullet_height > available_height:
            penalty += (total_bullet_height - available_height) * 15

        # Check horizontal space for each bullet
        column_width = usable_width / num_columns

        for bullet in content.bullets:
            # Estimate bullet width
            bullet_chars = len(bullet)
            bullet_width = bullet_chars * (bullet_font_size * 0.5 * self.points_to_inches)

            # Check if bullet exceeds column width
            if bullet_width > column_width * 0.95:  # 95% to leave some padding
                penalty += (bullet_width - column_width * 0.95) * 8

                # Extra penalty for very long bullets in single column
                if num_columns == 1 and bullet_chars > 100:
                    penalty += 10

        return penalty

    def _check_image_boundaries(self, layout: SlideLayout, content: SlideContent,
                              margin: float, usable_width: float, usable_height: float) -> float:
        """Check if images stay within boundaries"""
        penalty = 0

        if not content.image_analysis['has_images']:
            return 0

        image_position = layout.genes['image_position']
        image_size = layout.genes['image_size']
        image_layout_type = layout.genes['image_layout']

        if image_position == 'none':
            return 0

        # Get image area constraints
        if image_position in self.image_positions:
            pos_constraints = self.image_positions[image_position]
        else:
            pos_constraints = {'x': 0.1, 'y': 0.3, 'max_width': 0.8, 'max_height': 0.6}

        # Calculate image dimensions based on size
        size_ratio = self.image_size_ratios[image_size]

        # Check based on layout type
        if image_layout_type == 'single' and len(content.images) > 0:
            # Single image
            img = content.images[0]
            img_props = ImageAnalyzer.get_image_properties(img)

            # Calculate scaled dimensions
            if image_position in ['left', 'right']:
                max_img_width = usable_width * pos_constraints.get('max_width', 0.4)
                max_img_height = usable_height * 0.7  # 70% of usable height

                # Calculate width and height based on aspect ratio
                if img_props['aspect_ratio'] > 1:  # Landscape
                    img_width = max_img_width
                    img_height = img_width / img_props['aspect_ratio']
                    if img_height > max_img_height:
                        img_height = max_img_height
                        img_width = img_height * img_props['aspect_ratio']
                else:  # Portrait
                    img_height = max_img_height
                    img_width = img_height * img_props['aspect_ratio']
                    if img_width > max_img_width:
                        img_width = max_img_width
                        img_height = img_width / img_props['aspect_ratio']

                # Apply size ratio
                img_width *= size_ratio * 1.5  # Adjust based on size setting
                img_height *= size_ratio * 1.5

                # Check if exceeds boundary
                if img_width > max_img_width:
                    penalty += (img_width - max_img_width) * 12
                if img_height > max_img_height:
                    penalty += (img_height - max_img_height) * 12

            elif image_position in ['top', 'bottom']:
                max_img_width = usable_width * 0.8  # 80% of usable width
                max_img_height = usable_height * pos_constraints.get('max_height', 0.4)

                # Similar logic for top/bottom positions
                if img_props['aspect_ratio'] > 1:  # Landscape
                    img_width = max_img_width
                    img_height = img_width / img_props['aspect_ratio']
                    if img_height > max_img_height:
                        img_height = max_img_height
                        img_width = img_height * img_props['aspect_ratio']
                else:  # Portrait
                    img_height = max_img_height
                    img_width = img_height * img_props['aspect_ratio']
                    if img_width > max_img_width:
                        img_width = max_img_width
                        img_height = img_width / img_props['aspect_ratio']

                # Apply size ratio
                img_width *= size_ratio * 1.5
                img_height *= size_ratio * 1.5

                # Check if exceeds boundary
                if img_width > max_img_width:
                    penalty += (img_width - max_img_width) * 12
                if img_height > max_img_height:
                    penalty += (img_height - max_img_height) * 12

        elif image_layout_type == 'grid' and content.images:
            # Grid layout
            num_images = len(content.images)
            cols = min(int(num_images**0.5) + 1, 3)  # Estimate grid columns
            rows = (num_images + cols - 1) // cols     # Ceiling division for rows

            # Calculate available space
            max_grid_width = usable_width * pos_constraints.get('max_width', 0.8)
            max_grid_height = usable_height * pos_constraints.get('max_height', 0.6)

            # Calculate per-image dimensions
            cell_width = max_grid_width / cols
            cell_height = max_grid_height / rows

            # Calculate approximate image sizes
            for img in content.images:
                img_props = ImageAnalyzer.get_image_properties(img)

                # Calculate dimensions based on aspect ratio and size
                if img_props['aspect_ratio'] > 1:  # Landscape
                    img_width = cell_width * 0.9  # 90% of cell width
                    img_height = img_width / img_props['aspect_ratio']
                else:  # Portrait
                    img_height = cell_height * 0.9  # 90% of cell height
                    img_width = img_height * img_props['aspect_ratio']

                # Apply size ratio
                img_width *= size_ratio
                img_height *= size_ratio

                # Check if any image exceeds its cell
                if img_width > cell_width:
                    penalty += (img_width - cell_width) * 8
                if img_height > cell_height:
                    penalty += (img_height - cell_height) * 8

        elif image_layout_type in ['horizontal', 'vertical'] and len(content.images) > 1:
            # Horizontal or vertical layout
            num_images = len(content.images)

            if image_layout_type == 'horizontal':
                # Images side by side
                max_total_width = usable_width * 0.9  # 90% of usable width
                width_per_image = max_total_width / num_images
                max_height = usable_height * 0.4  # 40% of usable height

                for img in content.images:
                    img_props = ImageAnalyzer.get_image_properties(img)

                    # Calculate dimensions
                    img_width = width_per_image * 0.9  # 90% of allocated width
                    img_height = img_width / img_props['aspect_ratio']

                    # Apply size ratio
                    img_width *= size_ratio
                    img_height *= size_ratio

                    # Check vertical overflow
                    if img_height > max_height:
                        penalty += (img_height - max_height) * 10

            else:  # vertical
                # Images stacked
                max_total_height = usable_height * 0.8  # 80% of usable height
                height_per_image = max_total_height / num_images
                max_width = usable_width * 0.4  # 40% of usable width

                for img in content.images:
                    img_props = ImageAnalyzer.get_image_properties(img)

                    # Calculate dimensions
                    img_height = height_per_image * 0.9  # 90% of allocated height
                    img_width = img_height * img_props['aspect_ratio']

                    # Apply size ratio
                    img_width *= size_ratio
                    img_height *= size_ratio

                    # Check horizontal overflow
                    if img_width > max_width:
                        penalty += (img_width - max_width) * 10

        # Apply extra penalty for large images with small margins
        if image_size == 'large' and layout.genes['margin_size'] == 'small':
            penalty += 8  # Discourage this combination

        # Extra penalty for trying to fit too many images in a small area
        if len(content.images) > 4 and image_size != 'small':
            penalty += (len(content.images) - 4) * 5

        return penalty


class GeneticAlgorithmLayoutOptimizer:
    """Enhanced GA class for optimizing slide layouts"""

    def __init__(self, population_size: int = 50, generations: int = 100,
                 mutation_rate: float = 0.1, elite_size: int = 10):
        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate
        self.elite_size = elite_size
        self.fitness_evaluator = FitnessEvaluator()
        self.best_layouts_history = []

    def optimize_slide_layouts(self, slides_content: List[SlideContent]) -> List[SlideLayout]:
        """Optimize layouts for multiple slides """
        optimized_layouts = []

        for i, slide_content in enumerate(slides_content):
            print(f"Optimizing layout for slide {i+1}/{len(slides_content)}: '{slide_content.title[:40]}...'")
            print(f"  Content: {len(slide_content.bullets)} bullets, {len(slide_content.images)} images")

            previous_layout = optimized_layouts[-1] if optimized_layouts else None
            best_layout = self._optimize_single_slide(slide_content, previous_layout)
            optimized_layouts.append(best_layout)

            print(f"  Best fitness: {best_layout.fitness_score:.2f}")

        return optimized_layouts

    def _optimize_single_slide(self, content: SlideContent,
                              previous_layout: SlideLayout = None) -> SlideLayout:
        """Optimize layout for a single slide"""

        # Initialize population with image-aware constraints
        population = self._initialize_smart_population(content)

        best_fitness_history = []

        for generation in range(self.generations):
            # Evaluate fitness
            for layout in population:
                layout.fitness_score = self.fitness_evaluator.evaluate_fitness(
                    layout, content, previous_layout
                )

            # Sort by fitness (descending)
            population.sort(key=lambda x: x.fitness_score, reverse=True)

            # Track progress
            best_fitness_history.append(population[0].fitness_score)

            if generation % 25 == 0:
                print(f"    Generation {generation}: Best fitness = {population[0].fitness_score:.2f}")

            # Create next generation
            new_population = []

            # Keep elite
            new_population.extend(copy.deepcopy(population[:self.elite_size]))

            # Generate offspring
            while len(new_population) < self.population_size:
                parent1 = self._tournament_selection(population)
                parent2 = self._tournament_selection(population)

                child = parent1.crossover(parent2)
                child.mutate(self.mutation_rate)

                # Apply image-specific constraints
                self._apply_image_constraints(child, content)

                new_population.append(child)

            population = new_population

        # Final evaluation
        for layout in population:
            layout.fitness_score = self.fitness_evaluator.evaluate_fitness(
                layout, content, previous_layout
            )

        population.sort(key=lambda x: x.fitness_score, reverse=True)
        return population[0]

    def _initialize_smart_population(self, content: SlideContent) -> List[SlideLayout]:
        """Initialize population with image-aware heuristics"""
        population = []
        img_analysis = content.image_analysis

        # Generate some layouts with smart defaults
        smart_layouts_count = self.population_size // 3

        for _ in range(smart_layouts_count):
            layout = SlideLayout()

            # Apply image-based heuristics
            if img_analysis['has_images']:
                if img_analysis['count'] == 1:
                    layout.genes['image_layout'] = 'single'
                    layout.genes['image_position'] = random.choice(['left', 'right', 'top'])
                elif img_analysis['count'] == 2:
                    layout.genes['image_layout'] = random.choice(['horizontal', 'vertical'])
                elif img_analysis['count'] >= 3:
                    layout.genes['image_layout'] = 'grid'
                    layout.genes['image_position'] = 'grid'

                # Set balance based on content analysis
                layout.genes['image_text_balance'] = content.text_image_ratio

                # Adjust image size based on orientation
                if img_analysis['dominant_orientation'] == 'landscape':
                    layout.genes['image_size'] = random.choice(['medium', 'large'])
                else:
                    layout.genes['image_size'] = random.choice(['small', 'medium'])
            else:
                layout.genes['image_position'] = 'none'
                layout.genes['image_text_balance'] = 'text_heavy'

            population.append(layout)

        # Fill rest with random layouts
        while len(population) < self.population_size:
            population.append(SlideLayout())

        return population

    def _apply_image_constraints(self, layout: SlideLayout, content: SlideContent):
        """Apply constraints based on image analysis"""
        img_analysis = content.image_analysis

        # If no images, force image position to none
        if not img_analysis['has_images']:
            layout.genes['image_position'] = 'none'
            layout.genes['image_text_balance'] = 'text_heavy'
        else:
            # Ensure image layout matches image count
            if img_analysis['count'] == 1 and layout.genes['image_layout'] != 'single':
                layout.genes['image_layout'] = 'single'
            elif img_analysis['count'] >= 3 and layout.genes['image_layout'] not in ['grid']:
                layout.genes['image_layout'] = 'grid'

    def _tournament_selection(self, population: List[SlideLayout],
                             tournament_size: int = 5) -> SlideLayout:
        """Tournament selection for parent selection"""
        tournament = random.sample(population, min(tournament_size, len(population)))
        return max(tournament, key=lambda x: x.fitness_score)

    def get_top_layouts(self, content: SlideContent, top_k: int = 3,
                       previous_layout: SlideLayout = None) -> List[SlideLayout]:
        """Get top K layout suggestions for a slide"""

        population = self._initialize_smart_population(content)
        population.extend([SlideLayout() for _ in range(self.population_size)])

        for generation in range(self.generations // 2):
            for layout in population:
                layout.fitness_score = self.fitness_evaluator.evaluate_fitness(
                    layout, content, previous_layout
                )

            population.sort(key=lambda x: x.fitness_score, reverse=True)
            population = population[:self.population_size]

            new_population = copy.deepcopy(population)

            while len(new_population) < self.population_size * 2:
                parent1 = self._tournament_selection(population)
                parent2 = self._tournament_selection(population)
                child = parent1.crossover(parent2)
                child.mutate(self.mutation_rate)
                self._apply_image_constraints(child, content)
                new_population.append(child)

            population = new_population

        for layout in population:
            layout.fitness_score = self.fitness_evaluator.evaluate_fitness(
                layout, content, previous_layout
            )

        population.sort(key=lambda x: x.fitness_score, reverse=True)
        return self._select_diverse_layouts(population, top_k)

    def _select_diverse_layouts(self, population: List[SlideLayout],
                               top_k: int) -> List[SlideLayout]:
        """Select diverse layouts from top performers"""
        selected = [population[0]]

        for layout in population[1:]:
            if len(selected) >= top_k:
                break

            is_diverse = True
            for selected_layout in selected:
                if self._layouts_too_similar(layout, selected_layout):
                    is_diverse = False
                    break

            if is_diverse:
                selected.append(layout)

        while len(selected) < top_k and len(selected) < len(population):
            for layout in population:
                if layout not in selected:
                    selected.append(layout)
                    break

        return selected

    def _layouts_too_similar(self, layout1: SlideLayout,
                            layout2: SlideLayout, threshold: float = 0.8) -> bool:
        """Check if two layouts are too similar"""
        same_genes = 0
        total_genes = len(layout1.genes)

        for gene_name in layout1.genes:
            if layout1.genes[gene_name] == layout2.genes[gene_name]:
                same_genes += 1

        similarity = same_genes / total_genes
        return similarity >= threshold




class PowerPointGenerator:
    """Generates PowerPoint slides from GA-optimized layouts"""

    def __init__(self):
        self.presentation = None
        self.slide_width = Inches(10)
        self.slide_height = Inches(7.5)

        # Define layout dimensions (as fractions of slide dimensions)
        self.layout_areas = {
            'title': {'top': 0.05, 'left': 0.05, 'width': 0.9, 'height': 0.15},
            'content': {'top': 0.25, 'left': 0.05, 'width': 0.9, 'height': 0.7},
            'image_left': {'top': 0.25, 'left': 0.05, 'width': 0.4, 'height': 0.7},
            'image_right': {'top': 0.25, 'left': 0.55, 'width': 0.4, 'height': 0.7},
            'image_top': {'top': 0.25, 'left': 0.05, 'width': 0.9, 'height': 0.35},
            'image_bottom': {'top': 0.6, 'left': 0.05, 'width': 0.9, 'height': 0.35},
            'text_left': {'top': 0.25, 'left': 0.05, 'width': 0.45, 'height': 0.7},
            'text_right': {'top': 0.25, 'left': 0.5, 'width': 0.45, 'height': 0.7},
            'text_top': {'top': 0.25, 'left': 0.05, 'width': 0.9, 'height': 0.35},
            'text_bottom': {'top': 0.6, 'left': 0.05, 'width': 0.9, 'height': 0.35}
        }

        # Color mappings
        self.color_map = {
            'black': RGBColor(0, 0, 0),
            'white': RGBColor(255, 255, 255),
            'dark_gray': RGBColor(64, 64, 64),
            'light_gray': RGBColor(240, 240, 240),
            'blue': RGBColor(0, 112, 192),
            'dark_blue': RGBColor(0, 32, 96),
            'light_blue': RGBColor(173, 216, 230)
        }

        # Margin size mappings (in inches)
        self.margin_sizes = {
            'small': 0.3,
            'medium': 0.5,
            'large': 0.8
        }

    def create_presentation_from_layouts(self, slides_content: List[SlideContent],
                                       optimized_layouts: List[SlideLayout],
                                       output_path: str = "optimized_presentation.pptx") -> str:
        """Create a complete PowerPoint presentation from optimized layouts"""

        print(f"🎯 Generating PowerPoint presentation with {len(slides_content)} slides...")

        # Create new presentation
        self.presentation = Presentation()

        # Remove default slide layout
        if len(self.presentation.slides) > 0:
            slide_to_remove = self.presentation.slides[0]
            rId = self.presentation.slides._sldIdLst[0].rId
            self.presentation.part.drop_rel(rId)
            del self.presentation.slides._sldIdLst[0]

        # Generate each slide
        for i, (content, layout) in enumerate(zip(slides_content, optimized_layouts)):
            print(f"  📄 Creating slide {i+1}: '{content.title[:40]}...'")
            self._create_slide(content, layout)

        # Save presentation
        self.presentation.save(output_path)
        print(f"✅ Presentation saved as: {output_path}")

        return output_path

    def _create_slide(self, content: SlideContent, layout: SlideLayout):
        """Create a single slide based on content and optimized layout"""

        # Add blank slide
        blank_slide_layout = self.presentation.slide_layouts[6]  # Blank layout
        slide = self.presentation.slides.add_slide(blank_slide_layout)

        # Apply background color
        self._apply_background(slide, layout)

        # Add title
        self._add_title(slide, content, layout)

        # Add content based on layout
        if content.images:
            self._add_content_with_images(slide, content, layout)
        else:
            self._add_text_only_content(slide, content, layout)

        return slide

    def _apply_background(self, slide, layout: SlideLayout):
        """Apply background color to slide"""
        bg_color = layout.genes['background_color']

        if bg_color != 'white':  # Default is white
            background = slide.background
            fill = background.fill
            fill.solid()
            fill.fore_color.rgb = self.color_map[bg_color]

    def _add_title(self, slide, content: SlideContent, layout: SlideLayout):
        """Add title to slide based on layout specifications"""

        title_pos = layout.genes['title_position']
        title_font_size = layout.genes['title_font_size']
        text_color = layout.genes['text_color']
        margin = self.margin_sizes[layout.genes['margin_size']]

        # Determine title position and size
        if title_pos == 'top':
            left = Inches(margin)
            top = Inches(margin)
            width = self.slide_width - Inches(2 * margin)
            height = Inches(1.2)
            alignment = PP_ALIGN.LEFT
        elif title_pos == 'center':
            left = Inches(margin)
            top = Inches(2.5)
            width = self.slide_width - Inches(2 * margin)
            height = Inches(1.2)
            alignment = PP_ALIGN.CENTER
        elif title_pos == 'left':
            left = Inches(margin)
            top = Inches(margin)
            width = self.slide_width * 0.4
            height = Inches(1.2)
            alignment = PP_ALIGN.LEFT
        else:  # right
            left = self.slide_width * 0.6
            top = Inches(margin)
            width = self.slide_width * 0.35
            height = Inches(1.2)
            alignment = PP_ALIGN.RIGHT

        # Add title textbox
        title_box = slide.shapes.add_textbox(left, top, width, height)
        title_frame = title_box.text_frame
        title_frame.clear()
        title_frame.margin_left = Inches(0.1)
        title_frame.margin_right = Inches(0.1)
        title_frame.margin_top = Inches(0.1)
        title_frame.margin_bottom = Inches(0.1)

        # Add title text
        p = title_frame.paragraphs[0]
        p.text = content.title
        p.alignment = alignment

        # Format title
        font = p.font
        font.size = Pt(title_font_size)
        font.bold = True
        font.color.rgb = self.color_map[text_color]

        return title_box

    def _add_text_only_content(self, slide, content: SlideContent, layout: SlideLayout):
        """Add text content when no images are present"""

        bullet_columns = layout.genes['bullet_columns']
        bullet_font_size = layout.genes['bullet_font_size']
        text_color = layout.genes['text_color']
        margin = self.margin_sizes[layout.genes['margin_size']]

        # Calculate content area
        content_top = Inches(1.8)  # Below title
        content_left = Inches(margin)
        content_width = self.slide_width - Inches(2 * margin)
        content_height = self.slide_height - content_top - Inches(margin)

        if bullet_columns == 1:
            # Single column
            self._add_bullet_textbox(
                slide, content.bullets,
                content_left, content_top, content_width, content_height,
                bullet_font_size, text_color
            )
        else:
            # Two columns
            col_width = (content_width - Inches(0.5)) / 2
            mid_point = len(content.bullets) // 2

            # Left column
            left_bullets = content.bullets[:mid_point]
            self._add_bullet_textbox(
                slide, left_bullets,
                content_left, content_top, col_width, content_height,
                bullet_font_size, text_color
            )

            # Right column
            right_bullets = content.bullets[mid_point:]
            self._add_bullet_textbox(
                slide, right_bullets,
                content_left + col_width + Inches(0.5), content_top, col_width, content_height,
                bullet_font_size, text_color
            )

    def _add_content_with_images(self, slide, content: SlideContent, layout: SlideLayout):
        """Add content with images based on layout specifications"""

        image_position = layout.genes['image_position']
        image_layout = layout.genes['image_layout']
        image_size = layout.genes['image_size']
        bullet_font_size = layout.genes['bullet_font_size']
        text_color = layout.genes['text_color']
        margin = self.margin_sizes[layout.genes['margin_size']]

        if image_position == 'grid':
            self._add_grid_layout(slide, content, layout)
        elif image_position in ['left', 'right']:
            self._add_side_by_side_layout(slide, content, layout)
        elif image_position in ['top', 'bottom']:
            self._add_stacked_layout(slide, content, layout)
        else:
            # Fallback to text-only
            self._add_text_only_content(slide, content, layout)

    def _add_side_by_side_layout(self, slide, content: SlideContent, layout: SlideLayout):
        """Add side-by-side image and text layout"""

        image_position = layout.genes['image_position']
        bullet_font_size = layout.genes['bullet_font_size']
        text_color = layout.genes['text_color']
        margin = self.margin_sizes[layout.genes['margin_size']]

        content_top = Inches(1.8)
        content_height = self.slide_height - content_top - Inches(margin)

        # Determine image and text areas based on image size
        size_ratios = {'small': 0.3, 'medium': 0.45, 'large': 0.6}
        img_ratio = size_ratios[layout.genes['image_size']]
        text_ratio = 1 - img_ratio - 0.05  # 5% gap

        if image_position == 'left':
            # Image on left, text on right
            img_left = Inches(margin)
            img_width = self.slide_width * img_ratio

            text_left = img_left + img_width + Inches(0.3)
            text_width = self.slide_width * text_ratio
        else:  # right
            # Text on left, image on right
            text_left = Inches(margin)
            text_width = self.slide_width * text_ratio

            img_left = text_left + text_width + Inches(0.3)
            img_width = self.slide_width * img_ratio

        # Add images
        self._add_images_to_area(slide, content.images, img_left, content_top,
                               img_width, content_height, layout)

        # Add text
        self._add_bullet_textbox(slide, content.bullets, text_left, content_top,
                               text_width, content_height, bullet_font_size, text_color)

    def _add_stacked_layout(self, slide, content: SlideContent, layout: SlideLayout):
        """Add stacked (top/bottom) image and text layout"""

        image_position = layout.genes['image_position']
        bullet_font_size = layout.genes['bullet_font_size']
        text_color = layout.genes['text_color']
        margin = self.margin_sizes[layout.genes['margin_size']]

        content_top = Inches(1.8)
        content_width = self.slide_width - Inches(2 * margin)
        total_height = self.slide_height - content_top - Inches(margin)

        # Determine image and text areas
        size_ratios = {'small': 0.3, 'medium': 0.5, 'large': 0.7}
        img_ratio = size_ratios[layout.genes['image_size']]
        text_ratio = 1 - img_ratio - 0.05

        if image_position == 'top':
            # Image on top, text below
            img_top = content_top
            img_height = total_height * img_ratio

            text_top = img_top + img_height + Inches(0.2)
            text_height = total_height * text_ratio
        else:  # bottom
            # Text on top, image below
            text_top = content_top
            text_height = total_height * text_ratio

            img_top = text_top + text_height + Inches(0.2)
            img_height = total_height * img_ratio

        # Add images
        self._add_images_to_area(slide, content.images, Inches(margin), img_top,
                               content_width, img_height, layout)

        # Add text
        self._add_bullet_textbox(slide, content.bullets, Inches(margin), text_top,
                               content_width, text_height, bullet_font_size, text_color)

    def _add_grid_layout(self, slide, content: SlideContent, layout: SlideLayout):
        """Add grid layout for multiple images"""

        bullet_font_size = layout.genes['bullet_font_size']
        text_color = layout.genes['text_color']
        margin = self.margin_sizes[layout.genes['margin_size']]

        content_top = Inches(1.8)
        content_width = self.slide_width - Inches(2 * margin)
        content_height = self.slide_height - content_top - Inches(margin)

        # Split area between images and text
        if len(content.images) >= 3:
            img_height = content_height * 0.6
            text_height = content_height * 0.35

            # Images in grid at top
            self._add_images_to_area(slide, content.images, Inches(margin), content_top,
                                   content_width, img_height, layout)

            # Text at bottom
            text_top = content_top + img_height + Inches(0.2)
            self._add_bullet_textbox(slide, content.bullets, Inches(margin), text_top,
                                   content_width, text_height, bullet_font_size, text_color)
        else:
            # Fallback to side-by-side for fewer images
            self._add_side_by_side_layout(slide, content, layout)

    def _add_images_to_area(self, slide, images: List[Image.Image], left, top, width, height, layout: SlideLayout):
        """Add PIL images to specified area"""

        if not images:
            return

        image_layout = layout.genes['image_layout']

        if len(images) == 1:
            self._add_single_image(slide, images[0], left, top, width, height)
        elif len(images) == 2:
            if image_layout == 'horizontal':
                self._add_two_images_horizontal(slide, images, left, top, width, height)
            else:  # vertical
                 self._add_two_images_vertical(slide, images, left, top, width, height)
        else:  # 3+ images
            self._add_multiple_images_grid(slide, images, left, top, width, height)

    def _add_single_image(self, slide, image: Image.Image, left, top, width, height):
        """Add a single image with aspect ratio preservation"""

        # Save PIL image to temporary file
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
            image.save(temp_file.name, 'PNG')
            temp_path = temp_file.name

        try:
            # Calculate dimensions preserving aspect ratio
            img_width, img_height = image.size
            aspect_ratio = img_width / img_height

            # Fit image within available space
            if width / height > aspect_ratio:
                # Height is limiting factor
                final_height = height
                final_width = height * aspect_ratio
                img_left = left + (width - final_width) / 2
                img_top = top
            else:
                # Width is limiting factor
                final_width = width
                final_height = width / aspect_ratio
                img_left = left
                img_top = top + (height - final_height) / 2

            # Add image to slide
            slide.shapes.add_picture(temp_path, img_left, img_top, final_width, final_height)

        finally:
            # Clean up temporary file
            try:
                os.unlink(temp_path)
            except:
                pass

    def _add_two_images_horizontal(self, slide, images: List[Image.Image], left, top, width, height):
        """Add two images side by side"""

        img_width = (width - Inches(0.2)) / 2  # 0.2" gap between images

        # Add first image
        self._add_single_image(slide, images[0], left, top, img_width, height)

        # Add second image
        self._add_single_image(slide, images[1], left + img_width + Inches(0.2), top, img_width, height)

    def _add_two_images_vertical(self, slide, images: List[Image.Image], left, top, width, height):
        """Add two images stacked vertically"""

        img_height = (height - Inches(0.2)) / 2  # 0.2" gap between images

        # Add first image
        self._add_single_image(slide, images[0], left, top, width, img_height)

        # Add second image
        self._add_single_image(slide, images[1], left, top + img_height + Inches(0.2), width, img_height)

    def _add_multiple_images_grid(self, slide, images: List[Image.Image], left, top, width, height):
        """Add multiple images in a grid layout"""

        num_images = len(images)

        # Determine grid layout
        if num_images <= 4:
            cols = 2
            rows = (num_images + 1) // 2
        elif num_images <= 6:
            cols = 3
            rows = 2
        else:
            cols = 3
            rows = 3

        # Calculate image dimensions
        gap = Inches(0.1)
        img_width = (width - gap * (cols - 1)) / cols
        img_height = (height - gap * (rows - 1)) / rows

        # Add images to grid
        for i, image in enumerate(images[:rows * cols]):
            row = i // cols
            col = i % cols

            img_left = left + col * (img_width + gap)
            img_top = top + row * (img_height + gap)

            self._add_single_image(slide, image, img_left, img_top, img_width, img_height)

    def _add_bullet_textbox(self, slide, bullets: List[str], left, top, width, height,
                       font_size: int, text_color: str):
      """Add bullet points in a text box with simple bullet formatting"""

      if not bullets:
          return

      # Add textbox
      textbox = slide.shapes.add_textbox(left, top, width, height)
      text_frame = textbox.text_frame
      text_frame.clear()
      text_frame.margin_left = Inches(0.2)
      text_frame.margin_right = Inches(0.1)
      text_frame.margin_top = Inches(0.1)
      text_frame.margin_bottom = Inches(0.1)
      text_frame.word_wrap = True

      # Add bullets with simple formatting
      for i, bullet_text in enumerate(bullets):
          if i == 0:
              p = text_frame.paragraphs[0]
          else:
              p = text_frame.add_paragraph()

          # Add bullet character explicitly
          p.text = f"• {bullet_text}"
          p.alignment = PP_ALIGN.LEFT

          # Format font
          font = p.font
          font.size = Pt(font_size)
          font.color.rgb = self.color_map[text_color]
          font.name = 'Calibri'

      return textbox


class SlideTemplateManager:
    """Manages different slide templates and themes"""

    def __init__(self):
        self.templates = {
            'default': {
                'master_background': 'white',
                'accent_color': 'blue',
                'title_font': 'Calibri',
                'body_font': 'Calibri'
            },
            'dark': {
                'master_background': 'dark_blue',
                'accent_color': 'light_blue',
                'title_font': 'Calibri',
                'body_font': 'Calibri'
            },
            'modern': {
                'master_background': 'light_gray',
                'accent_color': 'dark_gray',
                'title_font': 'Segoe UI',
                'body_font': 'Segoe UI'
            },
            'light': {
                'master_background': 'white',
                'accent_color': 'dark_gray',
                'title_font': 'Arial',
                'body_font': 'Arial'
            }
        }

    def apply_template(self, presentation: Presentation, template_name: str):
        """Apply a template to the presentation"""
        if template_name not in self.templates:
            template_name = 'default'

        template = self.templates[template_name]

        # Apply template-specific formatting
        # This would involve modifying the slide master, but for simplicity
        # we'll handle this at the individual slide level
        return template



class IntegratedPresentationGenerator:
    """Main class that integrates GA optimization with PowerPoint generation"""

    def __init__(self, ga_params: Dict[str, Any] = None):
        self.ga_optimizer = GeneticAlgorithmLayoutOptimizer(
            **(ga_params or {
                'population_size': 40,
                'generations': 60,
                'mutation_rate': 0.12,
                'elite_size': 6
            })
        )
        self.ppt_generator = PowerPointGenerator()
        self.template_manager = SlideTemplateManager()

    def generate_presentation_from_content(self, slides_data: List[Dict[str, Any]],
                                         output_path: str = "ga_optimized_presentation.pptx",
                                         template: str = "default") -> Dict[str, Any]:
        """Complete pipeline: content -> GA optimization -> PowerPoint generation"""

        print("🚀 Starting Integrated Presentation Generation Pipeline")
        print("=" * 60)

        # Step 1: Convert input data to SlideContent objects
        print("📊 Step 1: Processing slide content...")
        slides_content = []
        for i, slide_data in enumerate(slides_data):
            content = SlideContent(
                title=slide_data['title'],
                bullets=slide_data['bullets'],
                images=slide_data.get('images', [])
            )
            slides_content.append(content)
            print(f"  Slide {i+1}: {len(content.bullets)} bullets, {len(content.images)} images")

        # Step 2: Optimize layouts using GA
        print(f"\n🧬 Step 2: Optimizing layouts with Genetic Algorithm...")
        optimized_layouts = self.ga_optimizer.optimize_slide_layouts(slides_content)

        # Step 3: Generate PowerPoint presentation
        print(f"\n📋 Step 3: Generating PowerPoint presentation...")
        ppt_path = self.ppt_generator.create_presentation_from_layouts(
            slides_content, optimized_layouts, output_path
        )

        print(f"\n✅ Pipeline Complete!")
        print(f"📄 PowerPoint file: {ppt_path}")

        return {
            'ppt_path': ppt_path,
            'optimized_layouts': optimized_layouts,
        }

#### GA Calling Function

In [11]:
def GA_pptx_generation(slides:list[Dict],output_path:str):
    demo_slides = slides
    generator = IntegratedPresentationGenerator({
        'population_size': 500,
        'generations': 100,
        'mutation_rate': 0.12,
        'elite_size': 10
    })
    results = generator.generate_presentation_from_content(
        demo_slides,
        output_path=output_path # Write here your output path
    )
    return results

# Endpoints using FastAPI

In [12]:
!pip install fastapi uvicorn pyngrok nest_asyncio python-multipart



In [22]:
import nest_asyncio
nest_asyncio.apply()

from fastapi import FastAPI, File, UploadFile, Query,Path, Body
from pyngrok import ngrok
from typing import Dict, List,Any
import uvicorn
import shutil
from tempfile import NamedTemporaryFile
import base64
from io import BytesIO
from fastapi.responses import JSONResponse, FileResponse
from PIL import Image

app = FastAPI()

import uuid

# Global in-memory store for demo (replace with a real DB or persistent storage)
DOCUMENT_STORE = {}


In [23]:
# add the file and save the processed data of the file in DOCUMENT_STORE with uuid
@app.post("/process_document")
async def process_document(file: UploadFile = File(...)):
    with NamedTemporaryFile(delete=False, suffix=f"_{file.filename}") as tmp:
        shutil.copyfileobj(file.file, tmp)
        temp_file_path = tmp.name

    doc_id = str(uuid.uuid4())

    # Store file path and filename in DOCUMENT_STORE
    DOCUMENT_STORE[doc_id] = {
        "filename": file.filename,
        "filepath": temp_file_path
    }

    # Process document normally
    doc_pipeline = DocumentProcessingPipeline(temp_file_path,"semantic")
    results = doc_pipeline.run()

    # Cache results in DOCUMENT_STORE so you can reuse embeddings & titles
    DOCUMENT_STORE[doc_id]["results"] = results

    return {
        "doc_id": doc_id,
        "titles": results["titles"],
    }


In [24]:
@app.get("/slides/{doc_id}")
async def get_slides_json(
    doc_id: str = Path(..., title="The ID of the document"),
    titles: Dict[str, List[int]] = Body(...)
):
    if doc_id not in DOCUMENT_STORE:
        return JSONResponse(status_code=404, content={"error": "Document ID not found"})

    results = DOCUMENT_STORE[doc_id]["results"]
    selected_indices = titles.get("titles", [])

    # slide_builder = SlideBuilder(
#     titles=[title.capitalize() for title in results["titles"]],
#     # titles=results["constructed_queries"],
#     lines=results["lines"],
#     images=results["images"],
#     db_manager=results["db_manager"],
#     # text_embedder = results["text_embedder"],
#     title_embeddings = results["title_embeddings"],
#     summarizer = results["summarizer"],
#     # title_embeddings = results["text_embedder"].encode(results["constructed_queries"]),
# )

# slides = slide_builder.build_slides(summarization_strategy="cot", max_image_occurrences=1)
    selected_titles = [results["titles"][i] for i in selected_indices]

    slide_builder = SlideBuilder(
        titles=[title.capitalize() for title in selected_titles],
        # titles=results["constructed_queries"],
        lines=results["lines"],
        images=results["images"],
        db_manager=results["db_manager"],
        # text_embedder = results["text_embedder"],
        title_embeddings = results["title_embeddings"],
        summarizer = results["summarizer"],
        # title_embeddings = results["text_embedder"].encode(results["constructed_queries"]),
    )

    # slides = slide_builder.build_slides()
    slides = slide_builder.build_slides(summarization_strategy="cot", max_image_occurrences=1)
    # print(slides)
    for slide in slides:
        serialized_images = []
        for img in slide.get("images", []):
            if isinstance(img, Image.Image):
                buf = io.BytesIO()
                img.save(buf, format="PNG")
                image_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
                serialized_images.append(image_b64)
            else:
                serialized_images.append(None)
        slide["images"] = serialized_images

    return JSONResponse(content={"slides": slides})


In [25]:
@app.post("/Defualt_generate_pptx/{doc_id}")
async def generate_pptx_from_slides(
    doc_id: str = Path(..., title="The ID of the document"),
    slides: List[Dict[str, Any]] = Body(...)
):
    if doc_id not in DOCUMENT_STORE:
        return JSONResponse(status_code=404, content={"error": "Document ID not found"})

    # Decode base64 images back to PIL.Image objects
    for slide in slides:
        if "images" in slide and isinstance(slide["images"], list):
            decoded_images = []
            for img_b64 in slide["images"]:
                if isinstance(img_b64, str):
                    img_bytes = base64.b64decode(img_b64)
                    img = Image.open(io.BytesIO(img_bytes))
                    decoded_images.append(img)
                else:
                    decoded_images.append(None)
            slide["images"] = decoded_images

    file_name = DOCUMENT_STORE[doc_id]["filename"].replace(".pdf", "")
    file_path = f"/kaggle/working/{file_name}.pptx"
    DOCUMENT_STORE[doc_id]["defualt_pptx_path"] = file_path
    
    # calling the builder
    builder = PresentationBuilder(
            title=file_name,
            font_name="Arial",
            font_size=20,
            # template_path="/kaggle/working/template_2.pptx"
        )
    
    # build from list takes [{title,images,bulltes},{title,images,bulltes},{title,images,bulltes}]
    builder.build_from_list(slides)
    # saving the file
    builder.save(file_path)
    
    # ensure that the file saved successfully to return it.
    if not os.path.exists(file_path):
        return JSONResponse(status_code=500, content={"error": "Failed to generate PowerPoint"})
    
    return FileResponse(
        path=file_path,
        media_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
        filename=f"{file_name}.pptx"
    )


In [26]:
@app.post("/GA_generate_pptx/{doc_id}")
async def generate_pptx_from_slides(
    doc_id: str = Path(..., title="The ID of the document"),
    slides: List[Dict[str, Any]] = Body(...)
):
    if doc_id not in DOCUMENT_STORE:
        return JSONResponse(status_code=404, content={"error": "Document ID not found"})

    # Decode base64 images back to PIL.Image objects
    for slide in slides:
        if "images" in slide and isinstance(slide["images"], list):
            decoded_images = []
            for img_b64 in slide["images"]:
                if isinstance(img_b64, str):
                    img_bytes = base64.b64decode(img_b64)
                    img = Image.open(io.BytesIO(img_bytes))
                    decoded_images.append(img)
                else:
                    decoded_images.append(None)
            slide["images"] = decoded_images

    file_name = DOCUMENT_STORE[doc_id]["filename"].replace(".pdf", "")
    file_path = f"/kaggle/working/{file_name}.pptx"
    DOCUMENT_STORE[doc_id]["ga_pptx_path"] = file_path

    GA_pptx_generation(slides, file_path)

    if not os.path.exists(file_path):
        return JSONResponse(status_code=500, content={"error": "Failed to generate PowerPoint"})

    return FileResponse(
        path=file_path,
        media_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
        filename=f"{file_name}.pptx"
    )


In [27]:
@app.get("/defualt_doc/{doc_id}")
async def download_pptx( doc_id: str = Path(..., title="The ID of the document")):
    if doc_id not in DOCUMENT_STORE:
        return JSONResponse(status_code=404, content={"error": "Document ID not found"})
    file_path=DOCUMENT_STORE[doc_id]["defualt_pptx_path"]
    if not os.path.exists(file_path):
        return JSONResponse(status_code=404, content={"error": "File not found"})
    file_name = DOCUMENT_STORE[doc_id]["filename"].replace(".pdf","")
    return FileResponse(
        path=file_path,
        media_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
        filename=f"{file_name}.pptx"
    )
    

In [28]:
@app.get("/ga_doc/{doc_id}")
async def download_pptx( doc_id: str = Path(..., title="The ID of the document")):
    if doc_id not in DOCUMENT_STORE:
        return JSONResponse(status_code=404, content={"error": "Document ID not found"})
    file_path=DOCUMENT_STORE[doc_id]["ga_pptx_path"]
    if not os.path.exists(file_path):
        return JSONResponse(status_code=404, content={"error": "File not found"})
    file_name = DOCUMENT_STORE[doc_id]["filename"].replace(".pdf","")
    return FileResponse(
        path=file_path,
        media_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
        filename=f"{file_name}.pptx"
    )
    

In [39]:

import requests
import json
def update_gist(url):
    gist_id = "79ec449dc37d6264520ed2f92c88ff4c"
    github_token = "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"
    gist_api_url = f"https://api.github.com/gists/{gist_id}"
    
    headers = {
        "Authorization": f"token {github_token}",
        "Accept": "application/vnd.github.v3+json"
    }

    payload = {
        "files": {
            "ngrok_url.json": {
                "content": json.dumps({"ngrok_url": url.public_url})
            }
        }
    }

    try:
        response = requests.patch(gist_api_url, headers=headers, json=payload)
        response.raise_for_status()
        print(f"✅ Successfully updated Gist with URL: {url.public_url}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to update Gist: {e}")


In [None]:
# Conneceting the Server of Ngrok for hosting
# Set your actual ngrok auth token here
ngrok.set_auth_token("2yCOBdqsmCFkgIA5vGqo3kl9rn4_7kRvgddoeXhcH8h5g9koj")

# Open a tunnel on port 8000
public_url = ngrok.connect(8000)
update_gist(public_url)
print(f"🌐 BaseURL: {public_url.public_url}")
print(f"🌐 Public URL: {public_url.public_url}/process_document /POST") # send doc
print(f"🌐 Public URL: {public_url.public_url}/selected_titles/doc_id /POST") # get titles
print(f"🌐 Public URL: {public_url.public_url}/download/doc_id /GET") # download slides

# Run the server normally (blocking call)
uvicorn.run(app, host="0.0.0.0", port=8000)

✅ Successfully updated Gist with URL: https://0463-104-155-145-222.ngrok-free.app
🌐 BaseURL: https://0463-104-155-145-222.ngrok-free.app
🌐 Public URL: https://0463-104-155-145-222.ngrok-free.app/process_document /POST
🌐 Public URL: https://0463-104-155-145-222.ngrok-free.app/selected_titles/doc_id /POST
🌐 Public URL: https://0463-104-155-145-222.ngrok-free.app/download/doc_id /GET


INFO:     Started server process [575]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


In [37]:
ngrok.kill()  # closes all existing tunnels before opening a new one
