### Create index

In [3]:
import os
import time
# Import the Pinecone library
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv("PINECONE_API_KEY")

# Initialize a Pinecone client with your API key
pc = Pinecone(api_key=api_key)

# Name of the serverless index used by this notebook
index_name = "my-test-index"

# Create the index only when it is missing, so this cell can be re-run
# without failing on an index that already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=3072,  # must equal the length of the vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

### Parsing PDFs and Extracting Visual Information

In [3]:
import base64
import requests
import os
import pandas as pd
from PyPDF2 import PdfReader, PdfWriter
from pdf2image import convert_from_bytes
from io import BytesIO
from openai import AzureOpenAI
from tqdm import tqdm

# PDF used as the worked example throughout the notebook
document_to_parse = (
    "https://documents1.worldbank.org/curated/en/099101824180532047/pdf/"
    "BOSIB13bdde89d07f1b3711dd8e86adb477.pdf"
)

# Azure OpenAI client; endpoint and key are read from environment variables
_azure_client_settings = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
    "api_version": "2024-08-01-preview",
}
oai_client = AzureOpenAI(**_azure_client_settings)

# Chunk the PDF document into single page chunks
def chunk_document(document_url, timeout=60):
    """Download a PDF and split it into standalone single-page PDFs.

    Args:
        document_url: URL of the PDF to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts in page order, each with 'pageNumber' (1-based)
        and 'pdfBytes' (the bytes of a PDF containing only that page).

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.RequestException: for other download failures (e.g. timeout).
    """
    response = requests.get(document_url, timeout=timeout)
    # Stop here on a 4xx/5xx answer; otherwise the error page body would be
    # passed to PdfReader and fail with an unrelated parsing error.
    response.raise_for_status()

    pdf_reader = PdfReader(BytesIO(response.content))
    page_chunks = []

    for page_number, page in enumerate(pdf_reader.pages, start=1):
        # Write the single page into its own in-memory PDF
        pdf_writer = PdfWriter()
        pdf_writer.add_page(page)
        page_buffer = BytesIO()
        pdf_writer.write(page_buffer)

        page_chunks.append({
            'pageNumber': page_number,
            'pdfBytes': page_buffer.getvalue()
        })

    return page_chunks


# Base64-encode a file from disk
def encode_image(local_image_path):
    """Read the file at `local_image_path` and return its bytes as a base64 string."""
    with open(local_image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')


# Render a single-page PDF to a PNG file
def convert_page_to_image(pdf_bytes, page_number):
    """Rasterise `pdf_bytes` and save the first page as images/page_<n>.png.

    Returns the relative path of the PNG that was written.
    """
    # Only the first rendered page is kept
    first_page_image = convert_from_bytes(pdf_bytes)[0]

    # Relative output folder, created on demand
    output_dir = 'images'
    os.makedirs(output_dir, exist_ok=True)

    target_path = os.path.join(output_dir, f"page_{page_number}.png")
    first_page_image.save(target_path, 'PNG')

    return target_path


# Pass the image to the LLM for interpretation
def get_vision_response(prompt, image_path, model="gpt-4o"):
    """Send a text prompt plus one image to the chat completions endpoint.

    Args:
        prompt: Instruction text placed in the user message.
        image_path: Path of the image file to attach.
        model: Model (or deployment) name passed to the API.

    Returns:
        The raw chat completion response object.
    """
    import mimetypes  # local import keeps this cell self-contained

    # Getting the base64 string
    base64_image = encode_image(image_path)

    # Label the payload with the file's actual media type. The pages in this
    # notebook are saved as PNG, so a fixed image/jpeg label would be wrong.
    # Fall back to the previous label when the type cannot be guessed.
    guessed_type = mimetypes.guess_type(image_path)[0]
    if not guessed_type or not guessed_type.startswith("image/"):
        guessed_type = "image/jpeg"

    response = oai_client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{guessed_type};base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
    )
    return response


# Process document function that brings it all together
def process_document(document_url):
    """Download a PDF, transcribe every page with the vision model and tabulate the results.

    Args:
        document_url: URL of the PDF to process.

    Returns:
        A pandas DataFrame with one row per page and the columns
        'PageNumber', 'ImagePath' and 'PageText'.

    Raises:
        Exception: any failure is printed and then re-raised, so the caller
            never receives None in place of the DataFrame.
    """
    # The instructions are the same for every page, so build them once.
    system_prompt = (
        "The user will provide you an image of a document file. Perform the following actions: "
        "1. Transcribe the text on the page. **TRANSCRIPTION OF THE TEXT:**"
        "2. If there is a chart, describe the image and include the text **DESCRIPTION OF THE IMAGE OR CHART**"
        "3. If there is a table, transcribe the table and include the text **TRANSCRIPTION OF THE TABLE**"
    )

    try:
        print("Document processing started")

        # Get per-page chunks
        page_chunks = chunk_document(document_url)

        # One dict per page, turned into a DataFrame at the end
        page_data_list = []

        for page_chunk in tqdm(page_chunks, total=len(page_chunks), desc='Processing Pages'):
            page_number = page_chunk['pageNumber']

            # Convert page to image
            image_path = convert_page_to_image(page_chunk['pdfBytes'], page_number)

            # Ask the vision model to transcribe/describe the page
            vision_response = get_vision_response(system_prompt, image_path)

            page_data_list.append({
                'PageNumber': page_number,
                'ImagePath': image_path,
                'PageText': vision_response.choices[0].message.content
            })

        # Create DataFrame from page data
        pdf_df = pd.DataFrame(page_data_list)
        print("Document processing completed.")
        print("DataFrame created with page data.")

        return pdf_df

    except Exception as err:
        print(f"Error processing document: {err}")
        # Re-raise: swallowing the error would make this function return None
        # and move the failure to whichever cell touches the result next.
        raise


# Run the whole pipeline on the example document; this makes one vision-model
# call per page, so it is the slow step of the notebook.
df = process_document(document_to_parse)

Document processing started


Processing Pages: 100%|██████████| 49/49 [14:58<00:00, 18.33s/it]

Document processing completed.
DataFrame created with page data.





In [4]:
from IPython.display import display, HTML

# Render the first five rows of the DataFrame as an HTML table
preview_html = df.head().to_html()
display(HTML(preview_html))

Unnamed: 0,PageNumber,ImagePath,PageText
0,1,images/page_1.png,**TRANSCRIPTION OF THE TEXT:**\n\nA BETTER BANK FOR A BETTER WORLD\n\nANNUAL REPORT 2024\n\nWORLD BANK GROUP\nIBRD - IDA\n\nPublic Disclosure Authorized\nPublic Disclosure Authorized
1,2,images/page_2.png,"**TRANSCRIPTION OF THE TEXT:**\n\nCONTENTS\nMessage from the President \t6\nMessage from the Executive Directors \t8\nBecoming a Better Bank \t10\nFiscal 2024 Financial Summary \t12\nResults by Region \t14\nResults by Theme \t44\nHow We Work \t68\n\nKEY TABLES\nIBRD Key Financial Indicators, Fiscal 2020–24 \t84\nIDA Key Financial Indicators, Fiscal 2020–24 \t88\n\nThis annual report, which covers the period from July 1, 2023, to June 30, 2024, has been prepared by the Executive Directors of both the International Bank for Reconstruction and Development (IBRD) and the International Development Association (IDA)—collectively known as the World Bank—in accordance with the respective bylaws of the two institutions. Ajay Banga, President of the World Bank Group and Chairman of the Board of Executive Directors, has submitted this report, together with the accompanying administrative budgets and audited financial statements, to the Board of Governors.\n\nAnnual reports for the other World Bank Group institutions—the International Finance Corporation (IFC), the Multilateral Investment Guarantee Agency (MIGA), and the International Centre for Settlement of Investment Disputes (ICSID)—are published separately. Key highlights from each institution's annual report are available in the World Bank Group Annual Report Summary.\n\nThroughout the report, the term World Bank and the abbreviated Bank refer only to IBRD and IDA; the term World Bank Group and the abbreviated Bank Group refer to the five institutions. All dollar amounts used in this report are current U.S. dollars unless otherwise specified. Funds allocated to multiregional projects are accounted for by recipient country where possible in tables and text when referring to regional breakdowns. For sector and theme breakdowns, funds are accounted for by operation. 
Fiscal year commitments and disbursements data are in accordance with the audited figures reported in the IBRD and IDA Financial Statements and Management's Discussion and Analysis documents for fiscal 2024. As a result of rounding, numbers in tables may not add to totals, and percentages in figures may not add to 100.\n\n---\n\n**DESCRIPTION OF THE IMAGE OR CHART**\n\nThe image shows a close-up of a person's hand holding a bunch of rice stalks. The hand appears weathered, suggesting the person might be engaged in agricultural work. The background is blurred but appears to show a field or a similar outdoor setting with some greenery. The World Bank Annual Report 2024 is printed in the bottom left corner of the image."
2,3,images/page_3.png,"**TRANSCRIPTION OF THE TEXT:**\n\n**ABOUT US**\n\nThe World Bank Group is one of the world’s largest sources of funding and knowledge for developing countries. Our five institutions share a commitment to reducing poverty, increasing shared prosperity, and promoting sustainable development.\n\n**OUR VISION**\nOur vision is to create a world free of poverty on a livable planet.\n\n**OUR MISSION**\nOur mission is to end extreme poverty and boost shared prosperity on a livable planet. This is threatened by multiple, intertwined crises. Time is of the essence. We are building a better Bank to drive impactful development that is:\n- Inclusive of everyone, including women and young people;\n- Resilient to shocks, including against climate and biodiversity crises, pandemics and fragility;\n- Sustainable, through growth and job creation, human development, fiscal and debt management, food security and access to clean air, water, and affordable energy.\n\nTo achieve this, we will work with all clients as one World Bank Group, in close partnership with other multilateral institutions, the private sector, and civil society.\n\n**OUR CORE VALUES**\nOur work is guided by our core values: impact, integrity, respect, teamwork, and innovation. These inform everything we do, everywhere we work.\n\n**DESCRIPTION OF THE IMAGE OR CHART**\nThe image shows two people wearing white clothes, embracing each other. Behind them, there is an out-of-focus background that includes some greenery."
3,4,images/page_4.png,"**TRANSCRIPTION OF THE TEXT:**\n\nDRIVING ACTION, MEASURING RESULTS\n\nThe World Bank Group contributes to impactful, meaningful development results around the world. In the first half of fiscal 2024*, we:\n\nHelped feed 156 million people\n\nImproved schooling for 280 million students\n\nReached 287 million people living in poverty with effective social protection support†\n\nProvided healthy water, sanitation, and/or hygiene to 59 million people\n\nEnabled access to sustainable transportation for 77 million people\n\nProvided 17 gigawatts of renewable energy capacity\n\nCommitted to devote 45 percent of annual financing to climate action by 2025, deployed equally between mitigation and adaptation\n\n*The development of the new Scorecard is ongoing at the time of printing; therefore, this report can only account for results up to December 31, 2023. \n†As of the 2024 IMF-World Bank Group Annual Meetings, the final fiscal 2024 Scorecard data will be available at: https://scorecard.worldbankgroup.org\n\nIn fiscal 2024, the Bank Group announced the development of a new Scorecard that will track results across 22 indicators—a fraction of the previous 150—to provide a streamlined, clear picture of progress on all aspects of the Bank Group’s mission, from improving access to healthcare to making food systems sustainable to boosting private investment.\n\nFor the first time, the work of all Bank Group financing institutions will be tracked through the same set of indicators. The new Scorecard will track the Bank Group’s overarching vision of ending poverty on a livable planet.\n\n**DESCRIPTION OF THE IMAGE OR CHART:**\nThere are images associated with each of the bulleted points. These images depict the following:\n\n1. Food plates, representing helping feed 156 million people.\n2. Children in uniform, representing improved schooling for 280 million students.\n3. 
An individual with a child, representing reaching 287 million people living in poverty with effective social protection support.\n4. A person drinking water, representing providing healthy water, sanitation, and/or hygiene to 59 million people.\n5. A train, representing enabling access to sustainable transportation for 77 million people.\n6. A person working on power lines, representing providing 17 gigawatts of renewable energy capacity.\n7. A landscape of plants/trees, representing the commitment to devote 45 percent of annual financing to climate action by 2025.\n\nThese images supplement the statistics provided, visualizing the contributions and impact of the World Bank Group."
4,5,images/page_5.png,"**TRANSCRIPTION OF THE TEXT:**\n\n**MESSAGE FROM THE PRESIDENT**\n\n**DELIVERING ON OUR COMMITMENTS REQUIRES US TO DEVELOP NEW AND BETTER WAYS OF WORKING. IN FISCAL 2024, WE DID JUST THAT.**\n**AJAY BANGA**\n\nIn fiscal 2024, the World Bank Group adopted a bold new vision of a world free of poverty on a livable planet. To achieve this, the Bank Group is enacting reforms to become a better partner to governments, the private sector, and, ultimately, the people we serve. Rarely in our 80-year history has our work been more urgent. We face declining progress in our fight against poverty, an existential climate crisis, mounting public debt, food insecurity, an unequal pandemic recovery, and the effects of geopolitical conflict.\n\nResponding to these intertwined challenges requires a faster, simpler, and more efficient World Bank Group. We are refocusing to confront these challenges not just through funding, but with knowledge. Our Knowledge Compact for Action, published in fiscal 2024, details how we will empower all our Bank Group clients, public and private, by making our wealth of development knowledge more accessible. And we have reorganized the World Bank’s global practices into five Vice Presidencies—People, Prosperity, Planet, Infrastructure, and Digital—for more flexible and faster engagements with clients. Each of these units reached important milestones in fiscal 2024.\n\nWe are supporting countries in delivering quality, affordable health services to 1.5 billion people by 2030 so our children and grandchildren will lead healthier, better lives. This is part of our larger global effort to provide a basic standard of care through every stage of a person’s life—infancy, childhood, adolescence, and adulthood. 
To help people withstand food-affected shocks and crises, we are strengthening social protection services to support half a billion people by the end of 2030—aiming for half of these beneficiaries to be women.\n\nWe are helping developing countries create jobs and employment, the surest enablers of prosperity. In the next 10 years, 1.2 billion young people across the Global South will become working-age adults. Yet, in the same period and the same countries, only 424 million jobs are expected to be created. The cost of hundreds of millions of young people with no hope for a decent job or future is unimaginable, and we are working urgently to create opportunity for all.\n\nIn response to climate change—arguably the greatest challenge of our generation—we’re channeling 45 percent of annual financing to climate action by 2025, deployed equally between mitigation and adaptation. Among other efforts, we intend to launch at least 15 country-led methane-reduction programs by fiscal 2026, and our Forest Carbon Partnership Facility has helped strengthen high-integrity carbon markets.\n\nRecognizing that digitalization is the transformational opportunity of our time, we are collaborating with governments in more than 100 developing countries to enable digital economies. Our digital lending portfolio totaled $5.6 billion in commitments as of June 2024; and our new Digital Vice Presidency unit will lead our efforts to establish the foundations of a digital economy. Key initiatives include building and enhancing digital and data infrastructure, ensuring cybersecurity and data privacy for institutions, businesses, and citizens, and advancing digital government services.\n\nDelivering on our commitments requires us to develop new and better ways of working. In fiscal 2024, we did just that. We are squeezing up our balance sheet and finding new opportunities to take more risk and boost our lending. 
Our new crisis preparedness and response tools, Global Challenge Programs, and Livable Planet Fund demonstrate how we are modernizing our approach to better drive impact and outcomes. Our new Scorecard radically changes how we track results.\n\nBut we cannot achieve development on our own. We need partners from both the public and private sectors to join our efforts. That's why we are working closely with other multilateral development banks to improve the lives of people in developing countries in tangible, measurable ways. Our deepening relationship with the private sector is evidenced by our Private Sector Investment Lab, which is working to address the barriers preventing private sector investment in emerging markets. The Lab’s core group of 15 Chief Executive Officers and Chairs meets regularly, and their advice has informed our work—most notably with the development of the World Bank Group Guarantee Platform.\n\nThe impact and innovations we delivered this year will allow us to move forward with a raised ambition and a great sense of urgency to improve people’s lives. I would like to recognize the remarkable efforts of our staff and Executive Directors, as well as the unwavering support of all our clients and partners. Together, we head into fiscal 2025 with a great sense of optimism—and determination to create better Bank for a better world.\n\n**AJAY BANGA**\n**President of the World Bank Group and Chairman of the Board of Executive Directors**\n\n**DESCRIPTION OF THE IMAGE OR CHART**\nThe image portrays two individuals interacting with a young child, who appears to be engaged with fresh produce, specifically tomatoes. The background suggests a natural or rural setting, with elements of vegetation visible. The scene reflects themes of agriculture, nurturing, and community interaction."


In [5]:
# Print the transcription stored for page 21
filtered_rows = df.query("PageNumber == 21")
for text in filtered_rows['PageText']:
    print(text)

**TRANSCRIPTION OF THE TEXT:**

We also committed $35 million in grants to support emergency relief in Gaza. Working with the World Food Programme, the World Health Organization, and the UN Children’s Fund, the grants support the delivery of emergency food, water, and medical supplies. In the West Bank, we approved a $30 million grant for the continuation of education for children, $22 million to support municipal services, and $45 million to strengthen healthcare and hospital services.

Enabling green and inclusive growth
To help policymakers in the region advance their climate change and development goals, we published Country Climate and Development Reports for the West Bank and Gaza, Lebanon, and Tunisia. In Libya, the catastrophic flooding in September 2023 devastated eastern localities, particularly the city of Derna. The World Bank, together with the UN and the European Union, produced a Rapid Damage and Needs Assessment to inform recovery and reconstruction efforts.

We signed 

### Generating Embeddings

In [6]:
# Flag pages whose transcription contains a chart/image description or a table
def _visual_flag(page_text):
    """Return 'Y' if the text contains either visual-content marker, else 'N'."""
    markers = ('DESCRIPTION OF THE IMAGE OR CHART', 'TRANSCRIPTION OF THE TABLE')
    return 'Y' if any(marker in page_text for marker in markers) else 'N'


df['Visual_Input_Processed'] = df['PageText'].map(_visual_flag)


# Embed a piece of text
def get_embedding(text_input):
    """Request a text-embedding-3-large embedding and return the first result's vector."""
    embedding_response = oai_client.embeddings.create(
        model="text-embedding-3-large",
        input=text_input,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding


# Embed every page's text (with a progress bar) and store the vectors on the DataFrame
embeddings = [
    get_embedding(page_text)
    for page_text in tqdm(df['PageText'], desc='Generating Embeddings')
]
df['Embeddings'] = embeddings

Generating Embeddings: 100%|██████████| 49/49 [00:09<00:00,  4.95it/s]


In [7]:
# Show the visual-content flag stored for page 21
filtered_rows = df.query("PageNumber == 21")
print(filtered_rows['Visual_Input_Processed'])

20    Y
Name: Visual_Input_Processed, dtype: object


### Uploading embeddings to Pinecone

In [10]:
import os
from pinecone.grpc import PineconeGRPC as Pinecone

from dotenv import load_dotenv

load_dotenv()

# Connection settings
index_name = "my-test-index"
api_key = os.environ.get("PINECONE_API_KEY")

# Pinecone client and a handle to the named index
pc = Pinecone(api_key=api_key)
index = pc.Index(index_name)

# Prefix for the per-page vector IDs
document_id = 'WB_Report'


# Upsert a single vector (plain synchronous call)
def upsert_vector(identifier, embedding, metadata):
    """Write one record to the module-level `index`.

    On failure the offending ID is printed and the exception is re-raised.
    """
    record = {
        'id': identifier,
        'values': embedding,
        'metadata': metadata
    }
    try:
        index.upsert([record])
    except Exception as upsert_error:
        print(f"Error upserting vector with ID {identifier}: {upsert_error}")
        raise


for _, page_row in tqdm(df.iterrows(), total=len(df), desc='Uploading to Pinecone'):
    page_number = page_row['PageNumber']
    page_id = f"{document_id}-{page_number}"

    # Metadata stored alongside the vector
    page_metadata = {
        'pageId': page_id,
        'pageNumber': page_number,
        'text': page_row['PageText'],
        'ImagePath': page_row['ImagePath'],
        'GraphicIncluded': page_row['Visual_Input_Processed'],
    }

    upsert_vector(page_id, page_row['Embeddings'], page_metadata)

Uploading to Pinecone: 100%|██████████| 49/49 [00:09<00:00,  5.10it/s]


### Performing Semantic Search for Relevant Pages

In [11]:
import json

# Function to get response to a user's question
def get_response_to_question(user_question, pc_index, top_k=2, model="gpt-4o"):
    """Answer a question from the text metadata of the closest indexed pages.

    Args:
        user_question: The question to answer.
        pc_index: Index object whose `query` method is used for the search.
        top_k: Number of nearest matches to use as context.
        model: Model (or deployment) name passed to the chat endpoint.

    Returns:
        The text content of the model's first choice.
    """
    # Get embedding of the question to find the relevant page with the information
    question_embedding = get_embedding(user_question)

    # Only the metadata is read below, so the stored vectors are not requested.
    response = pc_index.query(
        vector=question_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Collect the metadata from the matches
    context_metadata = [match['metadata'] for match in response['matches']]

    # Serialise the metadata dictionaries into a JSON string for the prompt
    context_json = json.dumps(context_metadata, indent=3)

    prompt = f"""You are a helpful assistant. Use the following context and images to answer the question. In the answer, include the reference to the document, and page number you found the information on between <source></source> tags. If you don't find the information, you can say "I couldn't find the information"

    question: {user_question}
    
    <SOURCES>
    {context_json}
    </SOURCES>
    """

    # Call completions end point with the prompt
    completion = oai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt}
        ]
    )

    return completion.choices[0].message.content

In [12]:
# Query the index with a question and print the text-only answer
question = "What percentage was allocated to social protections in Western and Central Africa?"
answer = get_response_to_question(question, index)

print(answer)

The percentage allocated to social protections in Western and Central Africa is 8% as shown in the pie chart on page 13.

<source>WB_Report-13</source>


In [13]:
# Second question, answered through the same text-only retrieval function
question = "What was the increase in access to electricity between 2000 and 2012 in Western and Central Africa?"
answer = get_response_to_question(question, index)

print(answer)

The increase in access to electricity in Western and Central Africa between 2000 and 2012 was 10%, from 34.1% in 2000 to 44.1% in 2012.

<source>WB_Report-13, page 13</source>


### Handling Pages with Visual Content

In [14]:
import base64
import json


def get_response_to_question_with_images(user_question, pc_index, top_k=3, model="gpt-4o"):
    """Answer a question using page images (or page text) of the closest indexed pages.

    Pages whose 'GraphicIncluded' metadata is 'Y' are sent as images, each
    preceded by a short text part naming the page so the model can cite it.
    All other pages, and flagged pages whose image cannot be read, are sent
    as text.

    Args:
        user_question: The question to answer.
        pc_index: Index object whose `query` method is used for the search.
        top_k: Number of nearest matches to use as context.
        model: Model (or deployment) name passed to the chat endpoint.

    Returns:
        The text content of the model's first choice.
    """
    import mimetypes  # local import keeps this cell self-contained

    # Get embedding of the question to find the relevant page with the information
    question_embedding = get_embedding(user_question)

    # Only the metadata is read below, so the stored vectors are not requested.
    response = pc_index.query(
        vector=question_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Collect the metadata from the matches
    context_metadata = [match['metadata'] for match in response['matches']]

    initial_prompt = f"""You are a helpful assistant. Use the text and images provided by the user to answer the question. You must include the reference to the page number or title of the section you the answer where you found the information. If you don't find the information, you can say "I couldn't find the information"

    question: {user_question}
    """

    # Content parts (text and image) of the single user message
    context_parts = []

    for metadata in context_metadata:
        visual_flag = metadata.get('GraphicIncluded')
        page_number = metadata.get('pageNumber')
        page_text = metadata.get('text')

        # The recorded runs show page numbers coming back as floats (13.0);
        # present whole numbers as integers in logs and in the prompt.
        if isinstance(page_number, float) and page_number.is_integer():
            page_number = int(page_number)

        image_part = None
        if visual_flag == 'Y':
            image_path = metadata.get('ImagePath', None)
            try:
                base64_image = encode_image(image_path)
                # Use the file's real media type; fall back to the old label
                image_type = mimetypes.guess_type(image_path)[0]
                if not image_type or not image_type.startswith("image/"):
                    image_type = "image/jpeg"
                image_part = {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{image_type};base64,{base64_image}"
                    },
                }
            except Exception as e:
                # Fall through to the text branch so the page is still in context
                print(f"Error encoding image at {image_path}: {e}")

        if image_part is not None:
            print(f"Adding page number {page_number} as an image to context")
            # An image carries no page number of its own; label it so the
            # model can reference the right page in its answer.
            context_parts.append({
                "type": "text",
                "text": f"Page {page_number} (image follows)",
            })
            context_parts.append(image_part)
        else:
            print(f"Adding page number {page_number} as text to context")
            context_parts.append({
                "type": "text",
                "text": f"Page {page_number} - {page_text}",
            })

    message_content = [{"role": "system", "content": initial_prompt}]
    if context_parts:
        message_content.append({"role": "user", "content": context_parts})
    else:
        # No matches came back: still send a user turn instead of failing on
        # an undefined message, so the model can report it found nothing.
        message_content.append({"role": "user", "content": user_question})

    completion = oai_client.chat.completions.create(
        model=model,
        messages=message_content
    )

    return completion.choices[0].message.content

In [15]:
# Ask the same social-protection question through the image-aware function
question = "What percentage was allocated to social protections in Western and Central Africa?"
answer = get_response_to_question_with_images(question, index)

print(answer)

Adding page number 13.0 as an image to context
Adding page number 12.0 as an image to context
Adding page number 11.0 as an image to context
The percentage allocated to social protections in Western and Central Africa is 8%. This information can be found in Figure 2 on page 22 of the document.


In [16]:
# Ask a question about an image through the image-aware function
question = "Can you find the image associated with digital improvements and describe what you see in the images?"
answer = get_response_to_question_with_images(question, index)

print(answer)

Adding page number 32.0 as text to context
Adding page number 30.0 as an image to context
Adding page number 34.0 as an image to context
In the image associated with digital improvements on page 32 of the document, there is a person seated at a desk using a laptop and holding a smartphone. The desk has various electronic devices and cables. The setting appears to be in a workspace with other individuals visible in the background, indicating a collaborative work environment centered around digital engagement.
