In [1]:
from section_generator import generate_slide_content
import json
import pptx

In [9]:
from document_parser import extract_text_and_tables

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from multi_document_rag import MultiDocumentRAG

In [12]:
document_path = "docs/somatosensory.pdf"

In [11]:
rag = MultiDocumentRAG()


In [13]:
rag.process_documents([document_path])


Processing document: docs/somatosensory.pdf
Started parsing the file under job_id 3b6b623b-f917-4274-8f90-e8985f643ed3
**************************************************
JSON data has been written to document_parsed.json
**************************************************
> Images for page 1: []
> Images for page 2: [{'name': 'img_p1_1.png', 'height': 233, 'width': 400, 'x': 226.7717, 'y': 85.37800000000001, 'original_width': 400, 'original_height': 233, 'ocr': [{'x': 76, 'y': 6, 'w': 54, 'h': 52, 'confidence': 0.44079593907240605, 'text': 'Iml'}, {'x': 148, 'y': 8, 'w': 24, 'h': 46, 'confidence': 0.8859970557807628, 'text': 'h'}, {'x': 34, 'y': 126, 'w': 32, 'h': 74, 'confidence': 0.9346045750761256, 'text': '1'}]}]
> Images for page 3: []
> Images for page 4: []
**************************************************
Images have been extracted and saved to ./images
**************************************************
Renamed 3b6b623b-f917-4274-8f90-e8985f643ed3-img_p1_1.png to img_p1_1.png


In [46]:
with open('document_parsed.json', 'r') as f:
    json_result = json.load(f)

In [50]:
# 3. Extract text and tables
print("📝 Extracting text and tables from parsed document...")
text = extract_text_and_tables(json_result)

📝 Extracting text and tables from parsed document...


In [54]:
from text_chunker import chunk_text

In [55]:
min_chunk_size = 1000
max_chunk_size = 5000

In [56]:
# 4. Chunk the text
print("✂️ Chunking text...")
text_chunks = chunk_text(text, min_chunk_size=min_chunk_size, max_chunk_size=max_chunk_size)
print(f"Generated {len(text_chunks)} text chunks.")

✂️ Chunking text...
Generated 6 text chunks.


In [57]:
text_chunks

['MAY 2023 AIRBNB: THE FIRST FIVE YEARS Pablo Picasso once said, “It took me four years to paint like Raphael, but a lifetime to paint like a child.” I think you must always live and think like a child. Or have that childlike curiosity and wonder… What’s the next thing? I like to imagine the world five years from now. Or imagine what I want the world to look like five years from now. And when I think back to when we started Airbnb, we were trying to challenge the status quo. Now we’re trying to challenge ourselves.1\n\n― Brian Chesky\n\nIn June 2021, Airbnb was the world’s leading lodging marketplace. With four million hosts offering about seven million listings in over 220 countries and regions2 and profitable third quarters in each of the years 2018, 2019 and 20203 (in spite of the Covid‑19 pandemic), Airbnb went public in a highly successful IPO. The company was founded in 2007 by Brian Chesky, Joe Gebbia and Nathan Blecharczyk. Gebbia tells the story of early Airbnb in this video. 

In [6]:
import json
# Read content from section_content.json
with open('slide_content.json', 'r') as f:
    slide_content = json.load(f)


In [1]:
from pptx import Presentation

template_path = "available_templates/A.pptx"
prs = Presentation(template_path)


In [2]:
def get_layout_specs(prs):
    layout_specs = []
    for i, layout in enumerate(prs.slide_layouts):
        supported = set()
        for shape in layout.placeholders:
            name = shape.name.lower()
            if 'title' in name:
                supported.add('title')
            if 'content' in name:
                supported.add('bullets')
            if 'picture' in name or 'image' in name:
                supported.add('image_path')
            if 'caption' in name:
                supported.add('caption')
            if 'notes' in name:
                supported.add('speaker_notes')

        layout_specs.append({
            "layout_id": i,
            "name": layout.name,
            "supports": list(supported)
        })
    return layout_specs


In [5]:
layout_specs = get_layout_specs(prs)
layout_specs

[{'layout_id': 0, 'name': 'Title Slide', 'supports': ['title']},
 {'layout_id': 1,
  'name': 'Title and content with image right',
  'supports': ['bullets', 'title', 'image_path']},
 {'layout_id': 2,
  'name': 'Title + subtitle + picture',
  'supports': ['title', 'image_path']},
 {'layout_id': 3, 'name': 'Title + subtitle', 'supports': ['title']},
 {'layout_id': 4,
  'name': 'Title and content 2',
  'supports': ['bullets', 'title', 'image_path']},
 {'layout_id': 5,
  'name': 'Title + picture',
  'supports': ['title', 'image_path']},
 {'layout_id': 6,
  'name': 'Content + picture ',
  'supports': ['bullets', 'title', 'image_path']},
 {'layout_id': 7,
  'name': 'Two content light blue',
  'supports': ['bullets', 'title']},
 {'layout_id': 8,
  'name': 'Title and two content',
  'supports': ['bullets', 'title']},
 {'layout_id': 9,
  'name': 'Two content white',
  'supports': ['bullets', 'title']},
 {'layout_id': 10, 'name': 'Table', 'supports': ['title']},
 {'layout_id': 11,
  'name': 'Two

In [43]:
from openai import OpenAI
import json

outputs = []

for slide in slide_content:
    print(slide)

    prompt = f"""
        You are an expert presentation assistant. Based on the following available slide layouts, choose the best layout and return structured JSON output.

        Here are the layout options:
        {json.dumps(layout_specs, indent=2)}

        Slide content to reformat:
        {json.dumps(slide, indent=2)}
    """

    client = OpenAI()

    response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You format slides based on layout specifications."},
        {"role": "user", "content": prompt}
    ],
    functions=[
        {
            "name": "choose_slide_layout_and_format",
            "description": "Choose the best layout and reformat the content accordingly",
            "parameters": {
                "type": "object",
                "properties": {
                    "slide_number": {"type": "integer"},
                    "layout_id": {"type": "integer"},
                    "layout_name": {"type": "string"},
                    "title": {"type": "string"},
                    "content": {
                        "type": "object",
                        "properties": {
                            "bullets": {"type": "array", "items": {"type": "string"}},
                            "image_path": {"type": "string"},
                            "speaker_notes": {"type": "string"},
                            "caption": {"type": "string"}
                        },
                        "additionalProperties": False
                    }
                },
                "required": ["slide_number", "layout_id", "layout_name", "title", "content"]
            }
        }
    ],
    function_call={"name": "choose_slide_layout_and_format"}
)
    
    # To extract the structured output:
    function_args = response.choices[0].message.function_call.arguments
    parsed_output = json.loads(function_args)

    outputs.append(parsed_output)

{'slide': 1, 'slide_title': 'The Early Days of Airbnb: Disrupting the Lodging Industry', 'slide_content': {'bullets': ['In October 2007, Brian Chesky and Joe Gebbia start Airbnb by renting airbeds in their San Francisco apartment during a design conference to fund their rent.', 'The addition of Nathan Blecharczyk as a technical lead in February 2008 and the successful launch of AirBed&Breakfast during the Democratic National Convention in August 2008 marked significant milestones.', "Chesky and Gebbia's focus on high-quality property photos significantly increased conversion rates in New York City, leading to doubled revenues and enhanced user experiences.", "Airbnb's early success was attributed to the founders' determination to provide unique guest experiences and personal connections."], 'speaker_notes': "This slide sets the stage for Airbnb's inception and early growth, emphasizing the pivotal moments that shaped the company's disruptive approach to lodging.", 'image_paths': ['airb

In [44]:
outputs

[{'slide_number': 1,
  'layout_id': 1,
  'layout_name': 'Title and content with image right',
  'title': 'The Early Days of Airbnb: Disrupting the Lodging Industry',
  'content': {'bullets': ['In October 2007, Brian Chesky and Joe Gebbia start Airbnb by renting airbeds in their San Francisco apartment during a design conference to fund their rent.',
    'The addition of Nathan Blecharczyk as a technical lead in February 2008 and the successful launch of AirBed&Breakfast during the Democratic National Convention in August 2008 marked significant milestones.',
    "Chesky and Gebbia's focus on high-quality property photos significantly increased conversion rates in New York City, leading to doubled revenues and enhanced user experiences.",
    "Airbnb's early success was attributed to the founders' determination to provide unique guest experiences and personal connections."],
   'image_path': 'airbnb_early_timeline_infographic.jpg',
   'speaker_notes': "This slide sets the stage for Airb

In [45]:
# Write outputs to a JSON file
with open('slide_layouts.json', 'w') as f:
    json.dump(outputs, f, indent=2)
print(f"✅ Saved {len(outputs)} slide layouts to slide_layouts.json")


✅ Saved 10 slide layouts to slide_layouts.json


In [59]:
from slide_content_generator import generate_slide_content

In [60]:
with open('presentation_data.json', 'r') as f:
    presentation_data = json.load(f)

slide_content = generate_slide_content(presentation_data)

In [61]:
slide_content

[Slide(slide=1, slide_title='Airbnb: The Journey of Disruption', slide_content=SlideContent(bullets=["Introduction to Airbnb's Founding and Growth", 'Key Milestones and Challenges Faced', 'Importance of Innovation and Resilience', 'Impact of Covid-19 and IPO Success'], speaker_notes="This slide sets the stage for the presentation, highlighting Airbnb's beginnings, key achievements, and the hurdles overcome, leading up to its IPO success despite the challenges faced.", image_paths=[])),
 Slide(slide=2, slide_title='Early Days of Airbnb: Innovation Sparks Vision', slide_content=SlideContent(bullets=['Innovative Idea of Offering Airbeds in San Francisco', 'Introduction of Nathan Blecharczyk and Launch of AirBed&Breakfast', 'Success at Democratic National Convention in 2008', "Foundation for Airbnb's Disruptive Impact on Hospitality Industry"], speaker_notes="This slide delves into the innovative beginnings of Airbnb, highlighting key events like the airbed concept and the platform's early

In [63]:
# Convert Slide objects to dictionaries before JSON serialization
slide_content_dict = [slide.dict() for slide in slide_content]
with open('slide_content.json', 'w') as f:
    json.dump(slide_content_dict, f, indent=2)

C:\Users\dell\AppData\Local\Temp\ipykernel_39040\193362915.py:2: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  slide_content_dict = [slide.dict() for slide in slide_content]


In [2]:
import json

In [3]:
with open('slide_content.json', 'r') as f:
    slide_content = json.load(f)


In [4]:
from layout_generator3 import generate_layout

In [5]:
print("Generating slide layouts...")

layout_output = generate_layout(slide_content)

Generating slide layouts...
{'slide': 1, 'slide_title': 'Airbnb: A Journey of Innovation and Disruption', 'slide_content': {'bullets': ['Founded in 2007 by Brian Chesky, Joe Gebbia, and Nathan Blecharczyk', "Evolution into the world's leading lodging marketplace by June 2021", 'Innovative ideas like offering airbeds during a design conference sparked the vision for Airbnb', 'Key milestones include the addition of Nathan Blecharczyk as a technical lead and the launch during the Democratic National Convention in August 2008'], 'speaker_notes': 'Introduce the audience to the inception and early successes of Airbnb, highlighting the innovative approach taken by the founders.', 'image_paths': ['airbnb_growth_over_the_years.jpg']}}
{'slide': 2, 'slide_title': 'Exploring Early Airbnb Experiences', 'slide_content': {'bullets': ['First three Airbnb guests sign up during a design conference in San Francisco', 'Guests include individuals older and closer to mainstream than expected', 'Experience 

KeyboardInterrupt: 

In [1]:
import json

In [2]:
with open('presentation_data.json', 'r') as f:
    presentation_data = json.load(f)

In [3]:
from slide_content_generator import generate_slide_content

In [4]:
slides = generate_slide_content(
    presentation_data=presentation_data,
    minimum_slides=7
)

In [5]:
slides[1]

PresentationMetadata(title='Airbnb: A Journey of Innovation and Growth', subtitle="Exploring the Evolution of the World's Leading Lodging Marketplace")

In [5]:
slide_contents = []
for slide in slides[0]:
    slide_content = {}
    slide_content['slide_number'] = slide.slide
    slide_content['slide_title'] = slide.slide_title
    slide_content['bullets'] = slide.slide_content.bullets
    slide_content['speaker_notes'] = slide.slide_content.speaker_notes
    slide_content['image_paths'] = slide.slide_content.image_paths

    from PIL import Image
    import os

    # Get image dimensions for each image path
    image_dimensions = []
    for image_path in slide_content['image_paths']:
        try:
            full_path = os.path.join('images', image_path)
            with Image.open(full_path) as img:
                width, height = img.size
                image_dimensions.append({
                    'path': image_path,
                    'width': width,
                    'height': height
                })
        except Exception as e:
            print(f"Error getting dimensions for {image_path}: {str(e)}")
            image_dimensions.append({
                'path': image_path,
                'width': None,
                'height': None
            })
    
    slide_content['image_dimensions'] = image_dimensions

    print(json.dumps(slide_content, indent=2))
    slide_contents.append(slide_content)

{
  "slide_number": 1,
  "slide_title": "Early Days of Airbnb: Revolutionizing Travel",
  "bullets": [
    "Founded in 2007 by Brian Chesky, Joe Gebbia, and Nathan Blecharczyk",
    "Launch of AirBed&Breakfast in August 2008 at the Democratic National Convention",
    "Unique experience of staying with locals and lower prices compared to hotels"
  ],
  "speaker_notes": "The early success of Airbnb stemmed from challenging assumptions and creating a new lodging model.",
  "image_paths": [
    "airbnb_launch_at_democratic_national_convention.jpg"
  ],
  "image_dimensions": [
    {
      "path": "airbnb_launch_at_democratic_national_convention.jpg",
      "width": 1200,
      "height": 623
    }
  ]
}
{
  "slide_number": 2,
  "slide_title": "Financial Creativity and Growth Challenges",
  "bullets": [
    "Designed and sold Obama O's and Cap'n McCain's cereal boxes in 2008",
    "Struggles to secure funding due to skepticism and economic downturn",
    "Renewed commitment after contemplati

In [6]:
import json
# Write slide_contents to a local JSON file
with open('slide_contents.json', 'w') as f:
    json.dump(slide_contents, f, indent=2)

In [7]:
import json
# Read slide contents from JSON file
with open('slide_contents.json', 'r') as f:
    slide_contents = json.load(f)


In [8]:
from slide_content_generator import build_prompt_with_placeholder_indices_and_dimensions,get_llm_friendly_layouts

In [9]:
layout_specs = get_llm_friendly_layouts("available_templates/A.pptx")

In [12]:
prompt = build_prompt_with_placeholder_indices_and_dimensions(slide_contents[4],layout_specs)

In [13]:
print(prompt)


You are an expert presentation assistant. Consider the font size to be 24pt.

Your task is to:
1. Choose the most appropriate slide layout from the list below.
2. Assign each content element (e.g., title, bullets, image, speaker notes) to the best-fitting placeholder within that layout.

Each layout includes:
- `layout_id`: an integer used to reference the layout.
- `layout_name`: name of the layout.
- `placeholders`: a list of available content slots, each described with:
  - `name`: the internal name of the placeholder.
  - `placeholder_type`: one of TITLE, BODY, PICTURE, SUBTITLE, SLIDE_NUMBER, etc.
  - `index`: the internal ID used to refer to this placeholder.
  - `position`: its top-left location on the slide (in inches).
  - `size`: its width and height (in inches).

Please choose a layout and map each content element to a placeholder by matching its `placeholder_type`. Use the `size` to decide which layout has BODY that is most suitable to fit the bullets.

Respond with a JSON

In [None]:
def get_layout_mapping(prompt):
    client = OpenAI()
    
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a presentation expert who maps content to appropriate slide layouts."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=1000
    )

    layout_mapping = response.choices[0].message.content.strip()
    layout_mapping = layout_mapping.replace('```json', '').replace('```', '').rstrip(',')
    layout_mapping = json.loads(layout_mapping)
    
    return layout_mapping

# Example usage:
layout_mapping = get_layout_mapping(prompt)
print(json.dumps(layout_mapping, indent=2))


{
  "slide_number": 5,
  "layout_id": 1,
  "layout_name": "Title and content with image right",
  "mapping": [
    {
      "content_type": "title",
      "value": "Investment Milestone",
      "placeholder_type": "TITLE",
      "placeholder_index": 0
    },
    {
      "content_type": "bullets",
      "value": [
        "Securing $7.2 million Series A funding from Greylock Partners and Sequoia Capital",
        "Facing competition from Wimdu and strategic decisions to maintain competitive edge",
        "Balancing growth and mission-driven focus"
      ],
      "placeholder_type": "OBJECT",
      "placeholder_index": 1
    },
    {
      "content_type": "image_path",
      "value": "reid_hoffman's_investment_reflections.jpg",
      "placeholder_type": "PICTURE",
      "placeholder_index": 13
    },
    {
      "content_type": "speaker_notes",
      "value": "Investment challenges and competitive dynamics shaped Airbnb's strategic direction.",
      "placeholder_type": "SLIDE_NUMBER",
 