In [21]:
import os
import pandas as pd
from typing import List
from pydantic import BaseModel, Field
from dotenv import load_dotenv, find_dotenv

from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract
from PIL import Image

import gradio as gr

from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain.prompts import ChatPromptTemplate

# === Load .env key ===
# Load variables from a .env file located by find_dotenv(); the return value
# is bound to `_` because it is not used afterwards.
_ = load_dotenv(find_dotenv())
# Fail fast: indexing os.environ raises KeyError here if OPENAI_API_KEY is not
# set, before any model object is built.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# === Schema ===
# One extracted record per country delegation. The Field descriptions below are
# runtime text: this model is nested in DelegationInfo, which is converted with
# convert_to_openai_function and bound to the chat model.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — a docstring may be picked up as the schema description sent to the
# model; confirm before adding one.
class Delegation(BaseModel):
    country: str = Field(description="Country name, like 'Ethiopia'")
    year: int = Field(description="Year of the UNGA session")
    officials: int = Field(description="Number of officials listed before 'Representatives'")
    # Encoded as an int flag (1/0), not a bool, per the description text.
    leader_present: int = Field(description="1 if President or Prime Minister is listed among officials, otherwise 0")
    representatives: int = Field(description="Number of representatives")
    alternate_representatives: int = Field(description="Number of alternate representatives")
    advisers: int = Field(description="Number of advisers")
    # The total is requested from the model; nothing in this class checks that
    # it equals the sum of the other counts.
    attendees: int = Field(description="Total number of people across all above categories")

# Top-level container used as the function schema: the chain's output parser is
# configured with key_name="entries", i.e. it reads this field from the call.
class DelegationInfo(BaseModel):
    # All delegation records extracted from one document.
    entries: List[Delegation]

# === Prompt ===
# Two-message chat prompt: a fixed system instruction plus a human message
# listing the fields to extract. `{policy}` is the only template variable and
# is filled with the PDF text when the chain is invoked with {"policy": ...}.
# The bullet list mirrors the fields of the Delegation schema.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are an assistant that extracts delegation data from UN General Assembly reports."),
    ("human", """
From the following text, extract the following for each country:
- Country
- Year
- Number of officials (before 'Representatives')
- Leader present (1 if President/Prime Minister is listed, 0 otherwise)
- Representatives
- Alternate representatives
- Advisers
- Total attendees

Text: {policy}
""")
])

# === LangChain Model ===
# Chat model configured with temperature 0 and the key read from the environment.
model = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

# Expose DelegationInfo as a callable function and force the model to call it,
# so every reply arrives as arguments of that function.
functions = [convert_to_openai_function(DelegationInfo)]
extraction_model = model.bind(
    functions=functions,
    function_call={"name": "DelegationInfo"},
)

# prompt -> forced function call -> value stored under the "entries" key.
extraction_chain = (
    prompt_template
    | extraction_model
    | JsonKeyOutputFunctionsParser(key_name="entries")
)

# === Text Extraction with OCR Fallback ===
def extract_text_from_pdf(file_path):
    """Return the text of a PDF, falling back to OCR when no text is found.

    The text of every page is first read with ``PdfReader``. If the combined
    result is empty or whitespace-only, each page is rendered to an image with
    ``convert_from_path`` and recognised with ``pytesseract.image_to_string``.

    Page texts are joined with a newline so that the last word of one page
    cannot fuse with the first word of the next one.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: The extracted text (possibly empty). This function never raises:
        on any failure it returns a message starting with
        ``"Error reading PDF: "``, so callers must check for that prefix
        before treating the result as document text.
    """
    try:
        reader = PdfReader(file_path)
        # `or ""` guards against a falsy result for a page without text.
        raw_text = "\n".join(page.extract_text() or "" for page in reader.pages)

        # A string made only of separators still counts as "no text".
        if not raw_text.strip():
            print("❗ No text from PdfReader. Trying OCR...")
            images = convert_from_path(file_path)
            raw_text = "\n".join(pytesseract.image_to_string(img) for img in images)

        return raw_text
    except Exception as e:
        # Best-effort contract: report the failure as text instead of raising.
        return f"Error reading PDF: {e}"

# === Main Gradio Function ===
def process_delegation_pdf(pdf_file):
    """Gradio callback: extract delegation rows from an uploaded PDF.

    Args:
        pdf_file: The uploaded file as passed by the ``gr.File`` input, or
            ``None`` when nothing was uploaded. Either an object exposing the
            path as ``.name`` or a plain path string is accepted.

    Returns:
        tuple: ``(summary, excel_path, table)`` matching the three output
        components — a status message, the path of the written Excel file
        (``None`` on failure) and the extracted rows as a DataFrame (empty on
        failure).
    """
    if pdf_file is None:
        return "❌ Please upload a file.", None, pd.DataFrame()

    # Use .name when present, otherwise treat the value itself as the path.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    policy_text = extract_text_from_pdf(pdf_path)
    if not policy_text.strip():
        return "❌ No text extracted from PDF.", None, pd.DataFrame()
    # extract_text_from_pdf reports failures as an "Error reading PDF: ..."
    # string rather than raising. Surface that to the user instead of sending
    # the error message to the model as if it were the document.
    if policy_text.startswith("Error reading PDF:"):
        return f"❌ {policy_text}", None, pd.DataFrame()

    entries = extraction_chain.invoke({"policy": policy_text})
    df = pd.DataFrame(entries)

    # Each run overwrites the same workbook under ./outputs.
    os.makedirs("outputs", exist_ok=True)
    output_path = os.path.join("outputs", "delegation_data.xlsx")
    df.to_excel(output_path, index=False)

    return f"✅ Extracted {len(df)} entries", output_path, df

# === Gradio UI ===
# One file input; the three outputs are listed in the same order as the tuple
# returned by process_delegation_pdf (summary, Excel path, table).
iface = gr.Interface(
    title="UN Delegation PDF Extractor",
    description="Upload a UNGA report PDF to extract delegation information and download an Excel file.",
    fn=process_delegation_pdf,
    inputs=gr.File(label="📄 Upload UNGA Report PDF"),
    outputs=[
        gr.Textbox(label="Summary"),
        gr.File(label="⬇️ Excel File Output"),
        gr.DataFrame(label="📊 Extracted Table"),
    ],
)

# Launch the app only when this code runs as the main module.
if __name__ == "__main__":
    iface.launch()


* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


❗ No text from PdfReader. Trying OCR...


Traceback (most recent call last):
  File "/Users/CS/Documents/DataExtraction/.venv/lib/python3.12/site-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/CS/Documents/DataExtraction/.venv/lib/python3.12/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/CS/Documents/DataExtraction/.venv/lib/python3.12/site-packages/gradio/blocks.py", line 2220, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/CS/Documents/DataExtraction/.venv/lib/python3.12/site-packages/gradio/blocks.py", line 1731, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/CS/Documents/DataExtraction/.venv/l

In [18]:
uvicorn delegation_api:app --reload


SyntaxError: invalid syntax (564329780.py, line 1)