In [37]:
!pip install boto3 python-dotenv



In [None]:
import os
import time
import boto3
import pandas as pd
from dotenv import load_dotenv
from collections import defaultdict
import matplotlib.pyplot as plt
from PIL import Image


load_dotenv()

class TextractProcessor:
    def __init__(self, base_dir, bucket_name=None):
        self.textract = boto3.client(
            'textract',
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name=os.getenv("AWS_DEFAULT_REGION")
        )
        self.s3 = boto3.client(
            's3',
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name=os.getenv("AWS_DEFAULT_REGION")
        )
        self.bucket_name = bucket_name or os.getenv("AWS_BUCKET_NAME")
        self.base_dir = base_dir

    def process_pdf(self, file_path, save_to_disk=True):
        file_name_no_ext = os.path.splitext(os.path.basename(file_path))[0]
        output_folder = os.path.join(self.base_dir, file_name_no_ext)
        os.makedirs(output_folder, exist_ok=True)

        # Upload to S3
        s3_key = f'{os.path.basename(self.base_dir)}/{os.path.basename(file_path)}'
        self.s3.upload_file(file_path, self.bucket_name, s3_key)
        print(f"Uploaded to S3: {s3_key}")

        # Start Textract job
        response = self.textract.start_document_analysis(
            DocumentLocation={'S3Object': {'Bucket': self.bucket_name, 'Name': s3_key}},
            FeatureTypes=['LAYOUT', 'TABLES']
        )
        job_id = response['JobId']
        print(f"Textract Job Started: {job_id}")

        # Poll for completion
        while True:
            result = self.textract.get_document_analysis(JobId=job_id)
            status = result['JobStatus']
            print(f"Job status: {status}")
            if status in ['SUCCEEDED', 'FAILED']:
                if status == 'FAILED':
                    raise Exception("Textract job failed.")
                break
            time.sleep(5)

        # Retrieve all results
        results = []
        next_token = None
        while True:
            response = self.textract.get_document_analysis(JobId=job_id, NextToken=next_token) if next_token else self.textract.get_document_analysis(JobId=job_id)
            results.append(response)
            next_token = response.get('NextToken')
            if not next_token:
                break

        # Extract layout and tables
        layout_df = self.extract_layout(results)
        tables = self.extract_tables(results)

        # Optionally save
        if save_to_disk:
            layout_df.to_csv(os.path.join(output_folder, "layout.csv"), index=False)
            for i, table in enumerate(tables, 1):
                with open(os.path.join(output_folder, f"table-{i}.csv"), "w") as f:
                    f.write(table)

        return layout_df, tables

    def generate_layout_figures(self, results, output_folder):
        """Save PNG images per page with layout bounding boxes."""
        page_images = []
        for page_num, page in enumerate(results, start=1):
            blocks = page['Blocks']
            layout_blocks = [b for b in blocks if b['BlockType'].startswith('LAYOUT')]
            image_block = next((b for b in blocks if b['BlockType'] == 'PAGE'), None)
            width, height = 1000, 1300  # fixed size for visualization

            # Create blank white image
            img = Image.new('RGB', (width, height), color='white')
            fig, ax = plt.subplots()
            ax.imshow(img)

            for block in layout_blocks:
                box = block['Geometry']['BoundingBox']
                left = box['Left'] * width
                top = box['Top'] * height
                box_width = box['Width'] * width
                box_height = box['Height'] * height

                rect = plt.Rectangle((left, top), box_width, box_height,
                                    linewidth=2, edgecolor='blue', facecolor='none')
                ax.add_patch(rect)
                ax.text(left, top - 10, block['BlockType'].replace('LAYOUT_', ''), color='blue', fontsize=8)

            ax.axis('off')
            fig.tight_layout(pad=0)
            fig.canvas.draw()

            # Save image
            img_path = os.path.join(output_folder, f"layout_page_{page_num}.png")
            fig.savefig(img_path, bbox_inches='tight', pad_inches=0)
            plt.close(fig)

            page_images.append(img_path)

        return page_images

    def extract_layout(self, results):
        rows = []
        layout_counters = defaultdict(int)
        reading_order = 0
        line_map = {}
        output_folder = os.path.join(self.base_dir, "figures")
        os.makedirs(output_folder, exist_ok=True)
        page_figures = self.generate_layout_figures(results, output_folder)

        for page_num, page in enumerate(results, start=1):
            line_map.update({b['Id']: b for b in page['Blocks'] if b['BlockType'] == 'LINE'})
            layout_blocks = [b for b in page['Blocks'] if b['BlockType'].startswith('LAYOUT')]

            for block in layout_blocks:
                layout_key = block['BlockType'].replace('LAYOUT_', '').capitalize()
                layout_counters[layout_key] += 1
                layout_label = f"{layout_key} {layout_counters[layout_key]}"

                line_text = ''
                for rel in block.get('Relationships', []):
                    if rel.get('Type') == 'CHILD':
                        line_text = ' '.join(line_map.get(i, {}).get('Text', '') for i in rel.get('Ids', []) if i in line_map)

                rows.append({
                    'Page number': block.get('Page', 1),
                    'Layout': layout_label,
                    'Text': line_text.strip(),
                    'Reading Order': reading_order,
                    'Confidence score % (Layout)': block['Confidence'],
                    'Figure': page_figures[page_num - 1]
                })
                reading_order += 1

        return pd.DataFrame(rows)


    def extract_tables(self, results):
        table_index = 1
        tables_output = []
        for page in results:
            blocks = page['Blocks']
            blocks_map = {b['Id']: b for b in blocks}
            table_blocks = [b for b in blocks if b['BlockType'] == 'TABLE']
            for table in table_blocks:
                rows, scores = self.get_table_rows(table, blocks_map)
                table_id = f"Table_{table_index}"
                table_csv = f"Table: {table_id}\n\n"

                col_count = 0
                for row in sorted(rows):
                    cols = rows[row]
                    col_count = len(cols)
                    table_csv += ','.join(cols[col] for col in sorted(cols)) + '\n'

                table_csv += '\n\n Confidence Scores % (Table Cell) \n'
                for i, score in enumerate(scores, 1):
                    table_csv += score + (',' if i % col_count else '\n')

                tables_output.append(table_csv)
                table_index += 1
        return tables_output

    def get_text(self, cell, blocks_map):
        text = ''
        for rel in cell.get('Relationships', []):
            if rel['Type'] == 'CHILD':
                for cid in rel['Ids']:
                    word = blocks_map.get(cid, {})
                    if word['BlockType'] == 'WORD':
                        t = word['Text']
                        text += f'"{t}" ' if "," in t and t.replace(",", "").isnumeric() else f"{t} "
                    elif word['BlockType'] == 'SELECTION_ELEMENT' and word.get('SelectionStatus') == 'SELECTED':
                        text += 'X '
        return text.strip()

    def get_table_rows(self, table, blocks_map):
        rows, scores = {}, []
        for rel in table.get('Relationships', []):
            if rel['Type'] == 'CHILD':
                for cid in rel['Ids']:
                    cell = blocks_map.get(cid)
                    if cell and cell['BlockType'] == 'CELL':
                        row, col = cell['RowIndex'], cell['ColumnIndex']
                        rows.setdefault(row, {})[col] = self.get_text(cell, blocks_map)
                        scores.append(str(cell['Confidence']))
        return rows, scores

In [25]:
processor = TextractProcessor(base_dir="output")
layout_df = processor.process_pdf("test_info_extract.pdf")

# Display the final DataFrame
print("\n--- Final Extracted Layout ---")
print(layout_df.to_string())

Uploaded to S3: output/test_info_extract.pdf
Textract Job Started: 91a8ff8f1353db66563e2c64779ae51d70890b480495ee2f425397ba5a490b7d
Job status: IN_PROGRESS
Job status: SUCCEEDED

Extracting figure images from PDF...
Extracted and saved 12 figures.

Saved final layout CSV to: output\test_info_extract\layout.csv

--- Final Extracted Layout ---
    Page number     Layout                                                                                                                                                                                                     Text  Reading Order  Confidence score % (Layout)
0             1    Title 1                                                                                                                                                                          Home Energy Report: electricity              0                    30.859375
1             1     Text 1                                                                                      

In [27]:
import os
import time
import boto3
import json
from dotenv import load_dotenv

load_dotenv()

class TextractJobRunner:
    def __init__(self, base_dir, bucket_name=None):
        self.textract = boto3.client(
            'textract',
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name=os.getenv("AWS_DEFAULT_REGION")
        )
        self.s3 = boto3.client(
            's3',
            aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
            region_name=os.getenv("AWS_DEFAULT_REGION")
        )
        self.bucket_name = bucket_name or os.getenv("AWS_BUCKET_NAME")
        self.base_dir = base_dir

    def run_job_and_save_response(self, file_path):
        """
        Takes a PDF, runs Textract analysis, and saves the raw JSON output.
        """
        file_name_no_ext = os.path.splitext(os.path.basename(file_path))[0]
        output_folder = os.path.join(self.base_dir, file_name_no_ext)
        os.makedirs(output_folder, exist_ok=True)

        # Upload to S3
        s3_key = f'textract-analysis/{os.path.basename(file_path)}'
        self.s3.upload_file(file_path, self.bucket_name, s3_key)
        print(f"Uploaded to S3: {s3_key}")

        # Start Textract job
        # Note: We are still asking for LAYOUT and TABLES to get the most detailed JSON
        response = self.textract.start_document_analysis(
            DocumentLocation={'S3Object': {'Bucket': self.bucket_name, 'Name': s3_key}},
            FeatureTypes=['LAYOUT', 'TABLES']
        )
        job_id = response['JobId']
        print(f"Textract Job Started: {job_id}")

        # Poll for completion
        while True:
            result = self.textract.get_document_analysis(JobId=job_id)
            status = result['JobStatus']
            print(f"Job status: {status}")
            if status in ['SUCCEEDED', 'FAILED']:
                if status == 'FAILED':
                    raise Exception("Textract job failed.")
                break
            time.sleep(5)

        # Retrieve all results using pagination
        results = []
        next_token = None
        while True:
            response = self.textract.get_document_analysis(JobId=job_id, NextToken=next_token) if next_token else self.textract.get_document_analysis(JobId=job_id)
            results.append(response)
            next_token = response.get('NextToken')
            if not next_token:
                break
        
        print(f"Retrieved {len(results)} pages of results.")

        # Save the raw results to a JSON file
        json_path = os.path.join(output_folder, "raw_textract_response.json")
        print(f"\nSaving raw Textract response to: {json_path}")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
            
        print("Raw JSON response saved successfully.")
        return json_path

# --- How to run this script ---
if __name__ == '__main__':
    # Initialize the class with your base output directory
    processor = TextractJobRunner(base_dir="output")
    
    # Run the process for your PDF
    processor.run_job_and_save_response("test_info_extract.pdf")

Uploaded to S3: textract-analysis/test_info_extract.pdf
Textract Job Started: 26042474975f6211068256a399faa8d7a88b57c2d1df8046d5c5019e6203eb96
Job status: IN_PROGRESS
Job status: SUCCEEDED
Retrieved 1 pages of results.

Saving raw Textract response to: output\test_info_extract\raw_textract_response.json
Raw JSON response saved successfully.


In [41]:
import os
import json
import pandas as pd
from collections import defaultdict
import fitz

def save_layout(results, output_folder):
    """
    This is your original function, refined to ensure correct reading order.
    """
    rows = []
    layout_counters = defaultdict(int)
    reading_order = 0
    line_map = {}
    
    # Your original loop structure is preserved.
    for page in results:
        # A line map is created for each page's blocks.
        line_map.update({b['Id']: b for b in page['Blocks'] if b['BlockType'] == 'LINE'})
        layout_blocks = [b for b in page['Blocks'] if b['BlockType'].startswith('LAYOUT')]

        for block in layout_blocks:
            layout_key = block['BlockType'].replace('LAYOUT_', '').capitalize()
            layout_counters[layout_key] += 1
            layout_label = f"{layout_key} {layout_counters[layout_key]}"

            line_text = ''
            for rel in block.get('Relationships', []):
                if rel.get('Type') == 'CHILD':
                    line_text = ' '.join(line_map.get(i, {}).get('Text', '') for i in rel.get('Ids', []) if i in line_map)

            rows.append({
                'Page number': block.get('Page', 1),
                'Layout': layout_label,
                'Text': line_text.strip(),
                'Reading Order': reading_order,
                'Confidence score % (Layout)': block.get('Confidence', 0.0)
            })
            reading_order += 1

    layout_path = os.path.join(output_folder, 'layout.csv')
    pd.DataFrame(rows).to_csv(layout_path, index=False)
    print(f"Saved layout to {layout_path}")

def save_figures(results, pdf_path, output_folder):
    """
    REFINED: This function now correctly receives the data it needs to run.
    """
    print("\nExtracting and saving figures...")
    os.makedirs(output_folder, exist_ok=True)
    
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF '{pdf_path}': {e}")
        return

    all_blocks = [block for page in results for block in page.get('Blocks', [])]
    figure_blocks = [b for b in all_blocks if b.get('BlockType') == 'LAYOUT_FIGURE']

    for i, block in enumerate(figure_blocks, 1):
        try:
            page_num = block.get('Page')
            page = doc.load_page(page_num - 1)
            box = block['Geometry']['BoundingBox']
            
            clip_rect = fitz.Rect(
                box['Left'] * page.rect.width, box['Top'] * page.rect.height,
                (box['Left'] + box['Width']) * page.rect.width,
                (box['Top'] + box['Height']) * page.rect.height
            )
            
            pix = page.get_pixmap(clip=clip_rect, dpi=200)
            
            output_path = os.path.join(output_folder, f"figure_{i}.png")
            pix.save(output_path)
        except Exception as e:
            print(f"Could not save figure_{i}.png. Error: {e}")
            
    doc.close()
    print(f"Saved {len(figure_blocks)} figures to '{output_folder}'.")

# --- Example of how to run this function ---
if __name__ == '__main__':
    # Define the names of your main output folder and the subfolder
    base_output_dir = "output"
    document_folder = "test_info_extract"
    pdf_file_path = "test_info_extract.pdf"
    output_folder = os.path.join(base_output_dir, document_folder, "figures")

    # 1. Build the path to the output folder where the JSON is located
    #    This creates "output/test_info_extract"
    output_folder_path = os.path.join(base_output_dir, document_folder)
    
    # 2. Build the full path to your raw JSON response file
    json_file_path = os.path.join(output_folder_path, "raw_textract_response.json")

    with open(json_file_path, 'r', encoding='utf-8') as f:
        results_data = json.load(f)
    
    # 4. Call your refined function to create the CSV
    save_layout(results_data, output_folder_path)
    save_figures(results_data, pdf_file_path, output_folder)

Saved layout to output\test_info_extract\layout.csv

Extracting and saving figures...
Saved 12 figures to 'output\test_info_extract\figures'.


In [43]:
import pandas as pd
import os
import re

def create_markdown_file(dataframe, output_folder):
    """
    Generates a markdown file from the final, corrected DataFrame.
    """
    print("\nCreating Markdown file...")
    md_content = []
    for index, row in dataframe.iterrows():
        layout_type = str(row.get('Layout', '')).split(' ')[0]
        text = str(row.get('Text', ''))

        if not text:
            continue

        if layout_type == 'Title':
            md_content.append(f"# {text}\n")
        elif layout_type == 'Header':
            md_content.append(f"## {text}\n")
        elif layout_type == 'Figure':
            # Assumes the 'Text' column contains the relative path to the figure
            md_content.append(f"![{text}]({text})\n")
        else: # For 'Text', 'List', etc.
            md_content.append(f"{text}\n")
    
    # Save the final markdown content to a file
    md_path = os.path.join(output_folder, 'final_layout.md')
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(md_content))
    print(f"Successfully created Markdown file: {md_path}")


def integrate_figures_and_create_outputs(csv_path, figures_folder):
    """
    Reads a layout.csv, integrates figure filenames, saves the corrected CSV,
    and then generates a corresponding markdown file.
    """
    print(f"Reading layout CSV: '{csv_path}'")
    try:
        df = pd.read_csv(csv_path, na_filter=False)
    except FileNotFoundError:
        print(f"Error: The file '{csv_path}' was not found.")
        return

    # 1. Find and sort all figure image files
    print(f"Finding figure images in: '{figures_folder}'")
    try:
        figure_files = sorted(
            [f for f in os.listdir(figures_folder) if f.startswith('figure_') and f.endswith('.png')],
            key=lambda x: int(re.search(r'figure_(\d+)\.png', x).group(1))
        )
        print(f"Found {len(figure_files)} figure files.")
    except (FileNotFoundError, AttributeError, TypeError):
        print(f"Error: Could not find or parse figure files in '{figures_folder}'.")
        return

    # 2. Update the 'Text' column for each matched figure
    figure_rows_indices = df[df['Layout'].str.startswith('Figure', na=False)].index
    for i, df_index in enumerate(figure_rows_indices):
        if i < len(figure_files):
            # Create a relative path to the figure for the CSV
            figure_path = os.path.join(os.path.basename(figures_folder), figure_files[i])
            df.loc[df_index, 'Text'] = figure_path
            print(f"Updating '{df.loc[df_index, 'Layout']}' with text: '{figure_path}'")

    # 3. Save the updated DataFrame to a new CSV file
    output_dir = os.path.dirname(csv_path) or '.'
    output_csv_path = os.path.join(output_dir, 'layout_with_figures.csv')
    df.to_csv(output_csv_path, index=False)
    print(f"\nSuccessfully created new CSV with integrated figure filenames: {output_csv_path}")

    # 4. Generate the Markdown file from the now-corrected DataFrame
    create_markdown_file(df, output_dir)


# --- How to Run This Script ---
if __name__ == '__main__':
    # --- Configuration ---
    
    # 1. The path to the layout.csv file created by the previous script
    CSV_PATH = 'output/test_info_extract/layout.csv'
    
    # 2. The path to the folder where your figures were saved
    FIGURES_FOLDER_PATH = 'output/test_info_extract/figures'
    
    # ---------------------

    integrate_figures_and_create_outputs(CSV_PATH, FIGURES_FOLDER_PATH)

Reading layout CSV: 'output/test_info_extract/layout.csv'
Finding figure images in: 'output/test_info_extract/figures'
Found 12 figure files.
Updating 'Figure 1' with text: 'figures\figure_1.png'
Updating 'Figure 2' with text: 'figures\figure_2.png'
Updating 'Figure 3' with text: 'figures\figure_3.png'
Updating 'Figure 4' with text: 'figures\figure_4.png'
Updating 'Figure 5' with text: 'figures\figure_5.png'
Updating 'Figure 6' with text: 'figures\figure_6.png'
Updating 'Figure 7' with text: 'figures\figure_7.png'
Updating 'Figure 8' with text: 'figures\figure_8.png'
Updating 'Figure 9' with text: 'figures\figure_9.png'
Updating 'Figure 10' with text: 'figures\figure_10.png'
Updating 'Figure 11' with text: 'figures\figure_11.png'
Updating 'Figure 12' with text: 'figures\figure_12.png'

Successfully created new CSV with integrated figure filenames: output/test_info_extract\layout_with_figures.csv

Creating Markdown file...
Successfully created Markdown file: output/test_info_extract\fin

## Summarize the data

Create a summary of each element extracted from the PDF. This summary will be vectorized and used in the retrieval process.

### Text and Table summaries

We don't need a multimodal model to generate the summaries of the tables and the text. I will use open source models available on Groq.

### Image summaries

We will use gpt-4o-mini to produce the image summaries.

In [1]:
%pip install -Uq langchain_openai

Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.3.25 requires langsmith<0.4,>=0.1.17, but you have langsmith 0.4.6 which is incompatible.
langchain-community 0.3.23 requires langsmith<0.4,>=0.1.125, but you have langsmith 0.4.6 which is incompatible.


In [45]:
import pandas as pd

df = pd.read_csv('output/test_info_extract/layout_with_figures.csv')

Unnamed: 0,Page number,Layout,Text,Reading Order,Confidence score % (Layout)
0,1,Title 1,Home Energy Report: electricity,0,30.859375
1,1,Text 1,March report Account number: 954137 Service ad...,1,26.977539
2,1,Text 2,"Dear JILL DOE, here is your usage analysis for...",2,84.863281
3,1,Text 3,Your electric use:,3,88.769531
4,1,Text 4,Above typical use,4,79.882812
5,1,Text 5,18% more than similar nearby homes,5,86.328125
6,1,Figure 1,figures\figure_1.png,6,89.599609
7,1,Text 6,Nearby homes are defined as,7,82.373047
8,1,Figure 2,figures\figure_2.png,8,66.259766
9,1,Text 7,Other homes with electricity,9,94.580078


In [55]:
import base64
import os
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

load_dotenv()

def encode_image_to_base64(image_path):
    """Encodes an image file to a base64 string."""
    try:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    except FileNotFoundError:
        print(f"Warning: Image file not found at {image_path}. Skipping.")
        return None

# --- 1. Load the Layout Data ---
csv_path = 'output/test_info_extract/layout_with_figures.csv'
df = pd.read_csv(csv_path)

# --- 2. Prepare the Contextual Prompts and Image Data ---
batch_input = []
print("Preparing contextual prompts for each figure...")
for i, row in df.iterrows():
    if str(row.get('Layout')).startswith('Figure'):
        image_path = row['Text']
        image_path = os.path.join("output", "test_info_extract", image_path)  # Ensure the path is correct
        # Get context from the row before the figure
        text_before = df.loc[i - 1, 'Text'] if i > 0 else "No text before."
        
        # Get context from the row after the figure
        text_after = df.loc[i + 1, 'Text'] if i < len(df) - 1 else "No text after."
        
        # Create a dynamic prompt with the context for this specific image
        dynamic_prompt = f"""Describe the image in detail. It is part of a home energy report.

CONTEXT BEFORE IMAGE: "{text_before}"
CONTEXT AFTER IMAGE: "{text_after}"

Based on the context above, analyze the image. Be specific about graphs, such as bar plots. If it's a simple icon with no data, just say 'icon' - no other explanation.
"""
        
        # Encode the image and add it to our batch list
        encoded_image = encode_image_to_base64(image_path)
        if encoded_image:
            batch_input.append({
                "image": encoded_image,
                "prompt_text": dynamic_prompt,
                "original_path": os.path.basename(image_path) # For printing results
            })

# --- 3. Define the LangChain Prompt Template and Chain ---
# The prompt template now includes a variable for our dynamic text
messages = [
    (
        "user",
        [
            {"type": "text", "text": "{prompt_text}"},
            {
                "type": "image_url",
                "image_url": {"url": "data:image/jpeg;base64,{image}"},
            },
        ],
    )
]

prompt = ChatPromptTemplate.from_messages(messages)

# Initialize the model and create the chain
# NOTE: Ensure your OPENAI_API_KEY environment variable is set
model = ChatOpenAI(model="gpt-4o-mini")
chain = prompt | model | StrOutputParser()

# --- 4. Run the Batch Process and Print Results ---
if batch_input:
    print(f"\nProcessing {len(batch_input)} images with unique contexts...")
    
    # The input is the list of dictionaries we prepared
    image_summaries = chain.batch(batch_input, {"max_concurrency": 5})

    print("\n--- Figure Descriptions with Context ---")
    for i, summary in enumerate(image_summaries):
        # Get the original filename from our input list for context
        filename = batch_input[i]['original_path']
        print(f"File: {filename}")
        print(f"Description: {summary}\n")
else:
    print("No figures found in the CSV to process.")

Preparing contextual prompts for each figure...

Processing 12 images with unique contexts...

--- Figure Descriptions with Context ---
File: figure_1.png
Description: The image features a bar graph that compares energy usage in kilowatt-hours (kWh) among three categories:

1. **You**: Represented by a large blue bar indicating a usage of 125 kWh.
2. **Similar nearby homes**: Displayed as an orange bar, showing a usage of 103 kWh.
3. **Efficient nearby homes**: Illustrated with a green bar, demonstrating the lowest usage at 49 kWh.

Each category is clearly labeled, and the graph visually emphasizes that the energy consumption from "You" is 18% higher than that of "Similar nearby homes." The layout allows for easy comparison among the three groups based on their respective energy usage.

File: figure_2.png
Description: icon

File: figure_3.png
Description: icon

File: figure_4.png
Description: icon

File: figure_5.png
Description: The image features a woman holding a colorful chart or 

In [58]:
import pandas as pd
import os

def update_csv_with_summaries(csv_path, summaries_list):
    """
    Updates the 'Text' column of figure rows in a CSV with provided summaries.
    """
    print(f"Reading layout CSV: '{csv_path}'")
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: The file '{csv_path}' was not found.")
        return

    # Find the indices of all rows where the 'Layout' is a figure
    figure_indices = df[df['Layout'].str.startswith('Figure', na=False)].index.tolist()

    # Check for a mismatch between the number of figures and summaries
    if len(figure_indices) != len(summaries_list):
        print(f"Warning: Mismatch found!")
        print(f"  - CSV has {len(figure_indices)} figure rows.")
        print(f"  - You provided {len(summaries_list)} summaries.")
        print("  - The script will update as many as it can match.")

    # Iterate through the figure rows and update the 'Text' column
    summary_counter = 0
    for idx in figure_indices:
        if summary_counter < len(summaries_list):
            original_layout_label = df.loc[idx, 'Layout']
            new_text = summaries_list[summary_counter]
            
            df.loc[idx, 'Text'] = new_text
            print(f"Updated '{original_layout_label}' with new summary.")
            
            summary_counter += 1

    # Save the final DataFrame to a new file
    output_dir = os.path.dirname(csv_path) or '.'
    output_path = os.path.join(output_dir, 'layout_with_summaries_2.csv')
    df.to_csv(output_path, index=False)
    
    print(f"\nSuccessfully created final CSV with summaries: {output_path}")

# --- How to Run This Script ---
if __name__ == '__main__':
    # --- Configuration ---
    
    # 1. This is the path to your CSV file that has figure filenames
    CSV_WITH_FIGURES_PATH = 'output/test_info_extract/layout_with_figures.csv'

    # ---------------------

    update_csv_with_summaries(CSV_WITH_FIGURES_PATH, image_summaries)

Reading layout CSV: 'output/test_info_extract/layout_with_figures.csv'
Updated 'Figure 1' with new summary.
Updated 'Figure 2' with new summary.
Updated 'Figure 3' with new summary.
Updated 'Figure 4' with new summary.
Updated 'Figure 5' with new summary.
Updated 'Figure 6' with new summary.
Updated 'Figure 7' with new summary.
Updated 'Figure 8' with new summary.
Updated 'Figure 9' with new summary.
Updated 'Figure 10' with new summary.
Updated 'Figure 11' with new summary.
Updated 'Figure 12' with new summary.

Successfully created final CSV with summaries: output/test_info_extract\layout_with_summaries_2.csv


### Create the vectorstore

In [60]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-win_amd64.whl.metadata (7.1 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp310-cp310-win_amd64.whl.metadata (8.7 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp310-cp310-win_amd64.whl.metadata (5.1 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.35.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelem

In [None]:
import os
import uuid
import pandas as pd
from dotenv import load_dotenv

# LangChain Imports
from langchain.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

# Load environment variables (for OPENAI_API_KEY)
load_dotenv()

def create_and_store_chunks(csv_path, collection_name, max_chars=500):
    """
    Reads a layout CSV, creates custom chunks, and stores their
    embeddings in a Chroma vector store.
    """
    # --- 1. Load and sort the data ---
    print(f"Loading data from '{csv_path}'...")
    try:
        df = pd.read_csv(csv_path, na_filter=False)
        df_sorted = df.sort_values(by='Reading Order').reset_index(drop=True)
    except (FileNotFoundError, KeyError) as e:
        print(f"Error loading or sorting CSV: {e}")
        return

    # --- 2. Create chunks with your custom logic ---
    print("Creating custom text chunks...")
    chunks = []
    current_chunk_text = ""
    for index, row in df_sorted.iterrows():
        text_to_add = f"{row['Layout']}: {row['Text']}"

        if not current_chunk_text:
            current_chunk_text = text_to_add
            continue

        if len(current_chunk_text) + len(text_to_add) + 1 > max_chars:
            chunks.append(current_chunk_text)
            current_chunk_text = text_to_add
        else:
            current_chunk_text += "\n" + text_to_add

    if current_chunk_text:
        chunks.append(current_chunk_text)

    print(f"Created {len(chunks)} chunks based on a {max_chars} character limit.")

    # --- 3. Convert chunks to LangChain Document objects ---
    documents_to_embed = [Document(page_content=chunk) for chunk in chunks]

    # --- 4. Embed and store the documents in ChromaDB ---
    print(f"\nEmbedding and storing documents in Chroma collection: '{collection_name}'...")
    vectorstore = Chroma.from_documents(
        documents=documents_to_embed,
        embedding=OpenAIEmbeddings(),
        collection_name=collection_name
    )

    print("\nSuccessfully created and stored embeddings in ChromaDB.")
    return vectorstore

# --- How to Run ---
if __name__ == '__main__':
    # 1. Path to your final CSV file with summaries
    CSV_PATH = 'output/test_info_extract/layout_with_summaries_2.csv'
    
    # 2. Name for your ChromaDB collection
    COLLECTION_NAME = "multimodal_rag_collection"

    # 3. Create the chunks, embed them, and store them
    vector_store = create_and_store_chunks(
        csv_path=CSV_PATH,
        collection_name=COLLECTION_NAME,
        max_chars=1000
    )

    # You can now use the 'vector_store' object to run similarity searches
    if vector_store:
        query = "What was the energy usage compared to neighbors?"
        results = vector_store.similarity_search(query)
        print(f"\n--- Example Similarity Search for: '{query}' ---")
        for doc in results:
            print(doc.page_content)
            print("-" * 30)

Loading data from 'output/test_info_extract/layout_with_summaries_2.csv'...
Creating custom text chunks...
Created 5 chunks based on a 1000 character limit.

Embedding and storing documents in Chroma collection: 'multimodal_rag_collection'...

Successfully created and stored embeddings in ChromaDB.

--- Example Similarity Search for: 'What was the energy usage compared to neighbors?' ---
Figure 1: The image features a bar graph that compares energy usage in kilowatt-hours (kWh) among three categories:

1. **You**: Represented by a large blue bar indicating a usage of 125 kWh.
2. **Similar nearby homes**: Displayed as an orange bar, showing a usage of 103 kWh.
3. **Efficient nearby homes**: Illustrated with a green bar, demonstrating the lowest usage at 49 kWh.

Each category is clearly labeled, and the graph visually emphasizes that the energy consumption from "You" is 18% higher than that of "Similar nearby homes." The layout allows for easy comparison among the three groups based on 

In [29]:
!pip install rank_bm25

Collecting rank_bm25
  Using cached rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Using cached rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [80]:
import os
import uuid
import base64
import pandas as pd
from dotenv import load_dotenv

# LangChain Imports
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.storage import InMemoryStore

load_dotenv()

# --- Encode image ---
def encode_image_base64(path):
    """
    CORRECTED: This function now expects a full, valid path.
    """
    if pd.isna(path) or not os.path.exists(path):
        print(f"Warning: Image file not found at '{path}'.")
        return ""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

# --- 1. Load CSVs ---
df_full = pd.read_csv("output/test_info_extract/layout_with_figures.csv")
df_sum = pd.read_csv("output/test_info_extract/layout_with_summaries_2.csv")
assert len(df_full) == len(df_sum), "Row counts must match"

# --- 2. Build Parent & Child Docs (Corrected Loop) ---
parent_docs, child_docs, ids = [], [], []

# Define the base directory where your output is stored
base_dir = "output/test_info_extract"

for full, summ in zip(df_full.itertuples(), df_sum.itertuples()):
    doc_id = str(uuid.uuid4())
    image_str = ""  # CORRECTION: Initialize image_str for every row

    # Check if the row represents a figure
    if hasattr(full, 'Layout') and isinstance(full.Layout, str) and full.Layout.startswith("Figure"):
        if hasattr(full, 'Text') and pd.notna(full.Text):
            
            # CORRECTION: Construct the full, correct path to the image file
            # The 'Text' column contains a relative path like 'figures\figure_1.png'
            relative_image_path = full.Text.replace("\\", "/") # Normalize path separators
            figure_path = os.path.join(base_dir, relative_image_path)
            
            image_base64 = encode_image_base64(figure_path)
            
            if image_base64:
                image_str = f"\n[Image base64]: {image_base64}"

    # For figure rows, use the summary text. For text rows, use the original text.
    parent_text_content = summ.Text if "Figure" in full.Layout else full.Text
    summary_text_content = summ.Text

    # Construct the final content strings
    parent_text = f"{full.Layout}: {parent_text_content}{image_str}"
    summary_text = f"{summ.Layout}: {summary_text_content}{image_str}"

    parent_docs.append(Document(page_content=parent_text, metadata={"doc_id": doc_id}))
    child_docs.append(Document(page_content=summary_text, metadata={"doc_id": doc_id}))
    ids.append(doc_id)

print(f"Processed {len(parent_docs)} documents.")

# --- 3. Vectorstore + BM25 ---
emb = OpenAIEmbeddings()
vectorstore = Chroma(collection_name="multimodal_rag_summaries", embedding_function=emb)
vectorstore.add_documents(child_docs)

bm25 = BM25Retriever.from_documents(child_docs)
bm25.k = 5

semantic = vectorstore.as_retriever(search_kwargs={"k": 5})
hybrid = EnsembleRetriever(retrievers=[bm25, semantic], weights=[0.5, 0.5])

# --- 4. MultiVectorRetriever ---
store = InMemoryStore()
store.mset(list(zip(ids, parent_docs)))

multi_retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key="doc_id")

print("\nSuccessfully built the MultiVectorRetriever and EnsembleRetriever.")

Processed 38 documents.

Successfully built the MultiVectorRetriever and EnsembleRetriever.


In [84]:
import os
import uuid
import pandas as pd
from dotenv import load_dotenv
import boto3

# LangChain Imports
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.storage import InMemoryStore

load_dotenv()

# --- S3 Upload Function (Corrected with Pre-signed URL) ---
def upload_image_to_s3(local_path, bucket_name, s3_client):
    """
    Uploads a local image file to an S3 bucket and returns a pre-signed URL
    that is valid for 1 hour.
    """
    if not os.path.exists(local_path):
        print(f"Warning: Image file not found at '{local_path}'.")
        return ""
    
    s3_key = f"figures/{os.path.basename(local_path)}"
    
    try:
        s3_client.upload_file(local_path, bucket_name, s3_key)
        
        # --- THIS IS THE FIX ---
        # Generate a temporary, secure URL instead of a public one.
        # The URL will be valid for 3600 seconds (1 hour).
        url = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucket_name, 'Key': s3_key},
            ExpiresIn=3600
        )
        print(f"Successfully uploaded {local_path} and generated pre-signed URL.")
        return url
    except Exception as e:
        print(f"Error during S3 operation: {e}")
        return ""

# --- 1. Load CSVs and Initialize S3 Client ---
df_full = pd.read_csv("output/test_info_extract/layout_with_figures.csv")
df_sum = pd.read_csv("output/test_info_extract/layout_with_summaries_2.csv")
assert len(df_full) == len(df_sum), "Row counts must match"

s3_client = boto3.client('s3')
bucket_name = os.getenv("AWS_BUCKET_NAME")
if not bucket_name:
    raise ValueError("AWS_BUCKET_NAME not set in .env file.")

# --- 2. Build Parent & Child Docs with S3 URLs ---
parent_docs, child_docs, ids = [], [], []
base_dir = "output/test_info_extract"

for full, summ in zip(df_full.itertuples(), df_sum.itertuples()):
    doc_id = str(uuid.uuid4())
    image_url_str = ""

    if hasattr(full, 'Layout') and isinstance(full.Layout, str) and full.Layout.startswith("Figure"):
        if hasattr(full, 'Text') and pd.notna(full.Text):
            relative_image_path = full.Text.replace("\\", "/")
            figure_path = os.path.join(base_dir, relative_image_path)
            
            image_url = upload_image_to_s3(figure_path, bucket_name, s3_client)
            
            if image_url:
                image_url_str = f"\n[Image URL]: {image_url}"

    parent_text_content = summ.Text if "Figure" in full.Layout else full.Text
    summary_text_content = summ.Text

    parent_text = f"{full.Layout}: {parent_text_content}{image_url_str}"
    summary_text = f"{summ.Layout}: {summary_text_content}"

    parent_docs.append(Document(page_content=parent_text, metadata={"doc_id": doc_id}))
    child_docs.append(Document(page_content=summary_text, metadata={"doc_id": doc_id}))
    ids.append(doc_id)

print(f"\nProcessed {len(parent_docs)} documents.")

# --- 3. Vectorstore + BM25 ---
emb = OpenAIEmbeddings()
vectorstore = Chroma(collection_name="multimodal_rag_s3", embedding_function=emb)
vectorstore.add_documents(child_docs)

bm25 = BM25Retriever.from_documents(child_docs)
bm25.k = 5

semantic = vectorstore.as_retriever(search_kwargs={"k": 5})
hybrid = EnsembleRetriever(retrievers=[bm25, semantic], weights=[0.5, 0.5])

# --- 4. MultiVectorRetriever ---
store = InMemoryStore()
store.mset(list(zip(ids, parent_docs)))

multi_retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key="doc_id")

print("\nSuccessfully built the MultiVectorRetriever and EnsembleRetriever.")

Successfully uploaded output/test_info_extract\figures/figure_1.png and generated pre-signed URL.
Successfully uploaded output/test_info_extract\figures/figure_2.png and generated pre-signed URL.
Successfully uploaded output/test_info_extract\figures/figure_3.png and generated pre-signed URL.
Successfully uploaded output/test_info_extract\figures/figure_4.png and generated pre-signed URL.
Successfully uploaded output/test_info_extract\figures/figure_5.png and generated pre-signed URL.
Successfully uploaded output/test_info_extract\figures/figure_6.png and generated pre-signed URL.
Successfully uploaded output/test_info_extract\figures/figure_7.png and generated pre-signed URL.
Successfully uploaded output/test_info_extract\figures/figure_8.png and generated pre-signed URL.
Successfully uploaded output/test_info_extract\figures/figure_9.png and generated pre-signed URL.
Successfully uploaded output/test_info_extract\figures/figure_10.png and generated pre-signed URL.
Successfully upload

In [85]:
import os
import uuid
import base64
from dotenv import load_dotenv
from base64 import b64decode

import pandas as pd
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore

from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever

# --- Load environment and data ---
load_dotenv()

# --- Utility: Convert image file to base64-encoded string ---
def encode_image_to_base64(image_path):
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode("utf-8")

# --- Hybrid search logic with parent document fetching ---
def hybrid_search_with_parent(query: str, k: int = 5):
    child_hits = hybrid.invoke(query)
    seen_ids = []
    for doc in child_hits:
        doc_id = doc.metadata.get("doc_id")
        if doc_id and doc_id not in seen_ids:
            seen_ids.append(doc_id)
        if len(seen_ids) >= k:
            break
    parent_hits = store.mget(seen_ids)
    return [doc for doc in parent_hits if doc is not None]

retriever = RunnableLambda(lambda q: hybrid_search_with_parent(q))

# --- Parser: split base64-encoded images from regular text ---
def parse_docs(docs):
    b64, text = [], []
    for doc in docs:
        try:
            b64decode(doc.page_content)
            b64.append(doc.page_content)
        except Exception:
            text.append(doc)
    return {"images": b64, "texts": text}

# --- Prompt builder: includes image_urls from base64-encoded strings ---
def build_prompt(kwargs):
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]
    context_text = "".join(doc.page_content for doc in docs_by_type["texts"])

    prompt_template = f"""
    Answer the question based only on the following context, which can include text, tables, and images.
    Context: {context_text}
    Question: {user_question}
    """
    prompt_content = [{"type": "text", "text": prompt_template}]
    for image in docs_by_type["images"]:
        prompt_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image}"},
        })
    print(ChatPromptTemplate.from_messages([HumanMessage(content=prompt_content)]))
    return ChatPromptTemplate.from_messages([HumanMessage(content=prompt_content)])

# --- Main RAG Chain ---
llm = ChatOpenAI(model="gpt-4o")

chain = (
    {
        "context": retriever | RunnableLambda(parse_docs),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(build_prompt)
    | llm
    | StrOutputParser()
)

# --- Optional: Chain with sources ---
chain_with_sources = {
    "context": retriever | RunnableLambda(parse_docs),
    "question": RunnablePassthrough(),
} | RunnablePassthrough().assign(
    response=RunnableLambda(build_prompt) | llm | StrOutputParser()
)

In [86]:
query = "How much energy did I consume compared to my neighbors?"
response = chain.invoke(query)
print(response)

# Or with sources:
full_result = chain_with_sources.invoke(query)
print(full_result["response"])
print(full_result["context"])  # includes both base64 images and text Document objects


input_variables=[] input_types={} partial_variables={} messages=[HumanMessage(content=[{'type': 'text', 'text': '\n    Answer the question based only on the following context, which can include text, tables, and images.\n    Context: Text 22: Evaluate your energy efficiencyText 12: Waiting until you have a full load to run your laundry can save up to 6% of your energy use.Text 13: Watch this space for new ways to save energy each month.\n    Question: How much energy did I consume compared to my neighbors?\n    '}, {'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,Figure 10: The image is a line graph that compares electricity usage in kilowatt-hours (kWh) over several months, specifically from April to March, among three categories: "You," "Similar Homes," and "Efficient Homes."\r\n\r\n### Key Features:\r\n\r\n- **Axis**: \r\n  - The vertical axis (y-axis) represents electricity usage in kilowatt-hours (kWh), ranging from 0 to 200 kWh.\r\n  - The horizontal axis (x-axi

BadRequestError: Error code: 400 - {'error': {'message': "You uploaded an unsupported image. Please make sure your image has of one the following formats: ['png', 'jpeg', 'gif', 'webp'].", 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_image_format'}}

In [87]:
import os
import uuid
import pandas as pd
from dotenv import load_dotenv
import boto3 # Added for S3

# LangChain Imports
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.storage import InMemoryStore
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage

load_dotenv()

# --- S3 Upload Function ---
def upload_image_to_s3(local_path, bucket_name, s3_client):
    """
    Uploads a local image file to S3 and returns a pre-signed URL.
    """
    if not os.path.exists(local_path):
        print(f"Warning: Image file not found at '{local_path}'.")
        return ""
    
    s3_key = f"figures/{os.path.basename(local_path)}"
    
    try:
        s3_client.upload_file(local_path, bucket_name, s3_key)
        # Generate a temporary, secure URL valid for 1 hour
        url = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucket_name, 'Key': s3_key},
            ExpiresIn=3600
        )
        return url
    except Exception as e:
        print(f"Error during S3 upload: {e}")
        return ""

# --- 1. Load CSVs and Initialize S3 ---
df_full = pd.read_csv("output/test_info_extract/layout_with_figures.csv")
df_sum = pd.read_csv("output/test_info_extract/layout_with_summaries_2.csv")
assert len(df_full) == len(df_sum), "Row counts must match"

s3_client = boto3.client('s3')
bucket_name = os.getenv("AWS_BUCKET_NAME")
if not bucket_name:
    raise ValueError("AWS_BUCKET_NAME not set in .env file.")

# --- 2. Build Parent & Child Docs with S3 URLs ---
parent_docs, child_docs, ids = [], [], []
base_dir = "output/test_info_extract"

print("Processing documents and uploading figures to S3...")
for full, summ in zip(df_full.itertuples(), df_sum.itertuples()):
    doc_id = str(uuid.uuid4())
    image_url_str = ""

    if hasattr(full, 'Layout') and isinstance(full.Layout, str) and full.Layout.startswith("Figure"):
        if hasattr(full, 'Text') and pd.notna(full.Text):
            relative_image_path = full.Text.replace("\\", "/")
            figure_path = os.path.join(base_dir, relative_image_path)
            
            # Upload image and get the URL
            image_url = upload_image_to_s3(figure_path, bucket_name, s3_client)
            
            if image_url:
                # Use a clear marker for the URL
                image_url_str = f"\n[IMAGE_URL_START]\n{image_url}\n[IMAGE_URL_END]"

    parent_text_content = summ.Text if "Figure" in full.Layout else full.Text
    summary_text_content = summ.Text

    parent_text = f"{full.Layout}: {parent_text_content}{image_url_str}"
    summary_text = f"{summ.Layout}: {summary_text_content}" # Searchable summary is text-only

    parent_docs.append(Document(page_content=parent_text, metadata={"doc_id": doc_id}))
    child_docs.append(Document(page_content=summary_text, metadata={"doc_id": doc_id}))
    ids.append(doc_id)

# --- 3. Setup Retrievers (Unchanged) ---
emb = OpenAIEmbeddings()
vectorstore = Chroma(collection_name="multimodal_rag_s3_final", embedding_function=emb)
vectorstore.add_documents(child_docs)

bm25 = BM25Retriever.from_documents(child_docs)
bm25.k = 5
semantic = vectorstore.as_retriever(search_kwargs={"k": 5})
hybrid = EnsembleRetriever(retrievers=[bm25, semantic], weights=[0.5, 0.5])

store = InMemoryStore()
store.mset(list(zip(ids, parent_docs)))
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key="doc_id")
print("\nRetriever setup complete.")

# --- 4. Define the RAG Chain with Corrected Parsers ---

def parse_docs_for_urls(docs):
    """
    Parses retrieved documents by looking for the explicit image URL marker.
    """
    image_urls = []
    text_content = []
    
    for doc in docs:
        parts = doc.page_content.split("[IMAGE_URL_START]")
        text_content.append(parts[0])
        
        if len(parts) > 1:
            url_part = parts[1].split("[IMAGE_URL_END]")[0].strip()
            image_urls.append(url_part)
            
    return {"images": image_urls, "texts": text_content}

def build_prompt_with_urls(inputs):
    """
    Dynamically builds the prompt, adding image_url dicts if URLs are present.
    """
    context = inputs["context"]
    question = inputs["question"]
    
    context_text = "\n".join(context['texts']).strip()
    
    prompt_content = [{
        "type": "text",
        "text": f"""Answer the question based only on the following context, which can include text and images.

Context:
---
{context_text}
---

Question: {question}
"""
    }]
    
    # Add image URLs if any were found
    for url in context["images"]:
        prompt_content.append({
            "type": "image_url",
            "image_url": {"url": url}, # Directly use the URL
        })
        
    return [HumanMessage(content=prompt_content)]

# --- Main RAG Chain ---
llm = ChatOpenAI(model="gpt-4o", temperature=0)

chain = (
    {
        "context": retriever | RunnableLambda(parse_docs_for_urls),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(build_prompt_with_urls)
    | llm
    | StrOutputParser()
)

# --- Example Usage ---
if __name__ == "__main__":
    query = "Show me the graph comparing my energy usage to my neighbors and describe it."
    answer = chain.invoke(query)
    
    print("\n--- QUERY ---")
    print(query)
    
    print("\n--- FINAL ANSWER ---")
    print(answer)

Processing documents and uploading figures to S3...

Retriever setup complete.

--- QUERY ---
Show me the graph comparing my energy usage to my neighbors and describe it.

--- FINAL ANSWER ---
The first graph is a bar graph comparing energy usage in kilowatt-hours (kWh) among three categories:

1. **You**: Represented by a large blue bar indicating a usage of 125 kWh.
2. **Similar nearby homes**: Displayed as an orange bar, showing a usage of 103 kWh.
3. **Efficient nearby homes**: Illustrated with a green bar, demonstrating the lowest usage at 49 kWh.

The graph emphasizes that your energy consumption is 18% higher than that of similar nearby homes.

The second graph is a line graph showing electricity usage over several months (April to March) among the same three categories:

- **You**: Represented by a blue line, showing varying usage levels throughout the months.
- **Similar Homes**: Indicated by an orange line, generally staying above your usage.
- **Efficient Homes**: Shown with

In [88]:
# --- Example Usage ---
if __name__ == "__main__":
    query = "Show me the graph comparing my energy usage to my neighbors and describe it."
    answer = chain.invoke(query)
    
    print("\n--- QUERY ---")
    print(query)
    
    print("\n--- FINAL ANSWER ---")
    print(answer)

BadRequestError: Error code: 400 - {'error': {'message': 'Error while downloading https://capstone-bucket-alister.s3.amazonaws.com/figures/figure_10.png?AWSAccessKeyId=AKIAWQUOZ4236TFSWCUI&Signature=sdsSBvZoovrStBlZSdTw61Oonmg%3D&Expires=1752744629.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_image_url'}}

In [91]:
import os
import uuid
import pandas as pd
from dotenv import load_dotenv
import boto3

# LangChain Imports
from langchain.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.storage import InMemoryStore
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage

load_dotenv()

# --- S3 Upload Function ---
def upload_image_to_s3(local_path, bucket_name, s3_client):
    """
    Uploads a local image file to S3 and returns its permanent S3 key.
    """
    if not os.path.exists(local_path):
        print(f"Warning: Image file not found at '{local_path}'.")
        return ""
    
    s3_key = f"figures/{os.path.basename(local_path)}"
    
    try:
        s3_client.upload_file(local_path, bucket_name, s3_key)
        print(f"Successfully uploaded {local_path} to S3 with key: {s3_key}")
        return s3_key # Return the key, not the URL
    except Exception as e:
        print(f"Error during S3 upload: {e}")
        return ""

# --- 1. Load CSVs and Initialize S3 ---
df_full = pd.read_csv("output/test_info_extract/layout_with_figures.csv")
df_sum = pd.read_csv("output/test_info_extract/layout_with_summaries_2.csv")
assert len(df_full) == len(df_sum), "Row counts must match"

s3_client = boto3.client('s3')
bucket_name = os.getenv("AWS_BUCKET_NAME")
if not bucket_name:
    raise ValueError("AWS_BUCKET_NAME not set in .env file.")

# --- 2. Build Parent & Child Docs with S3 Keys ---
parent_docs, child_docs, ids = [], [], []
base_dir = "output/test_info_extract"

print("Processing documents and uploading figures to S3...")
for full, summ in zip(df_full.itertuples(), df_sum.itertuples()):
    doc_id = str(uuid.uuid4())
    s3_key_str = ""

    if hasattr(full, 'Layout') and isinstance(full.Layout, str) and full.Layout.startswith("Figure"):
        if hasattr(full, 'Text') and pd.notna(full.Text):
            relative_image_path = full.Text.replace("\\", "/")
            figure_path = os.path.join(base_dir, relative_image_path)
            
            # Upload image and get the permanent S3 key
            s3_key = upload_image_to_s3(figure_path, bucket_name, s3_client)
            
            if s3_key:
                # Store the key in the document with a clear marker
                s3_key_str = f"\n[S3_KEY_START]\n{s3_key}\n[S3_KEY_END]"

    parent_text_content = summ.Text if "Figure" in full.Layout else full.Text
    summary_text_content = summ.Text

    parent_text = f"{full.Layout}: {parent_text_content}{s3_key_str}"
    summary_text = f"{summ.Layout}: {summary_text_content}"

    parent_docs.append(Document(page_content=parent_text, metadata={"doc_id": doc_id}))
    child_docs.append(Document(page_content=summary_text, metadata={"doc_id": doc_id}))
    ids.append(doc_id)

# --- 3. Setup Retrievers (Unchanged) ---
emb = OpenAIEmbeddings()
vectorstore = Chroma(collection_name="multimodal_rag_s3_on_the_fly", embedding_function=emb)
vectorstore.add_documents(child_docs)

bm25 = BM25Retriever.from_documents(child_docs)
bm25.k = 5
semantic = vectorstore.as_retriever(search_kwargs={"k": 5})
hybrid = EnsembleRetriever(retrievers=[bm25, semantic], weights=[0.5, 0.5])

store = InMemoryStore()
store.mset(list(zip(ids, parent_docs)))
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key="doc_id")
print("\nRetriever setup complete.")

# --- 4. Define the RAG Chain with On-the-Fly URL Generation ---

def parse_docs_and_generate_urls(docs):
    """
    Parses retrieved docs for S3 keys and generates fresh pre-signed URLs.
    """
    image_urls = []
    text_content = []
    
    for doc in docs:
        parts = doc.page_content.split("[S3_KEY_START]")
        text_content.append(parts[0])
        
        if len(parts) > 1:
            s3_key = parts[1].split("[S3_KEY_END]")[0].strip()
            # Generate a fresh URL right now
            url = s3_client.generate_presigned_url(
                'get_object',
                Params={'Bucket': bucket_name, 'Key': s3_key},
                ExpiresIn=3600 # Valid for 1 hour
            )
            image_urls.append(url)
            
    return {"images": image_urls, "texts": text_content}

def build_prompt_with_urls(inputs):
    """
    Dynamically builds the prompt, adding image_url dicts if URLs are present.
    """
    context = inputs["context"]
    question = inputs["question"]
    
    context_text = "\n".join(context['texts']).strip()
    
    prompt_content = [{
        "type": "text",
        "text": f"""Answer the question based only on the following context.

Context:
---
{context_text}
---

Question: {question}
"""
    }]
    
    for url in context["images"]:
        prompt_content.append({
            "type": "image_url",
            "image_url": {"url": url},
        })
        
    return [HumanMessage(content=prompt_content)]

# --- Main RAG Chain ---
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

chain = (
    {
        "context": retriever | RunnableLambda(parse_docs_and_generate_urls),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(build_prompt_with_urls)
    | llm
    | StrOutputParser()
)

Processing documents and uploading figures to S3...
Successfully uploaded output/test_info_extract\figures/figure_1.png to S3 with key: figures/figure_1.png
Successfully uploaded output/test_info_extract\figures/figure_2.png to S3 with key: figures/figure_2.png
Successfully uploaded output/test_info_extract\figures/figure_3.png to S3 with key: figures/figure_3.png
Successfully uploaded output/test_info_extract\figures/figure_4.png to S3 with key: figures/figure_4.png
Successfully uploaded output/test_info_extract\figures/figure_5.png to S3 with key: figures/figure_5.png
Successfully uploaded output/test_info_extract\figures/figure_6.png to S3 with key: figures/figure_6.png
Successfully uploaded output/test_info_extract\figures/figure_7.png to S3 with key: figures/figure_7.png
Successfully uploaded output/test_info_extract\figures/figure_8.png to S3 with key: figures/figure_8.png
Successfully uploaded output/test_info_extract\figures/figure_9.png to S3 with key: figures/figure_9.png
Suc

In [92]:
# --- Example Usage ---
if __name__ == "__main__":
    query = "Show me the graph comparing my energy usage to my neighbors and describe it."
    answer = chain.invoke(query)
    
    print("\n--- QUERY ---")
    print(query)
    
    print("\n--- FINAL ANSWER ---")
    print(answer)



--- QUERY ---
Show me the graph comparing my energy usage to my neighbors and describe it.

--- FINAL ANSWER ---
The first graph is a bar graph comparing energy usage in kilowatt-hours (kWh) among three categories:

1. **You**: Represented by a large blue bar indicating a usage of **125 kWh**.
2. **Similar nearby homes**: Displayed as an orange bar, showing a usage of **103 kWh**.
3. **Efficient nearby homes**: Illustrated with a green bar, demonstrating the lowest usage at **49 kWh**.

This graph clearly shows that your energy consumption is **18% higher** than that of similar nearby homes.

The second graph is a line graph that tracks electricity usage over several months from April to March among the same three categories:

- **You**: Represented by a blue line, showing fluctuating usage levels throughout the months.
- **Similar Homes**: Indicated by an orange line, which generally stays above your usage.
- **Efficient Homes**: Shown with a green line, consistently indicating the l

In [93]:
# --- Example Usage ---
if __name__ == "__main__":
    query = "Give me tips on how to reduce my energy consumption based on the data."
    answer = chain.invoke(query)
    
    print("\n--- QUERY ---")
    print(query)
    
    print("\n--- FINAL ANSWER ---")
    print(answer)



--- QUERY ---
Give me tips on how to reduce my energy consumption based on the data.

--- FINAL ANSWER ---
Based on the context provided, here are some tips to reduce energy consumption:

1. Implement energy-saving practices in the kitchen, such as using energy-efficient appliances and cooking in batches to minimize oven use.
2. Optimize laundry practices by washing clothes in cold water and only running full loads to save on electricity.
3. Consider other areas of your home where energy can be saved, such as using LED lighting and unplugging devices when not in use.


In [96]:
# --- Example Usage ---
if __name__ == "__main__":
    query = "For what month is this data?"
    answer = chain.invoke(query)
    
    print("\n--- QUERY ---")
    print(query)
    
    print("\n--- FINAL ANSWER ---")
    print(answer)



--- QUERY ---
For what month is this data?

--- FINAL ANSWER ---
The data is for the month of March.
