# Document Chat Bot

This notebook demonstrates a multi-document chatbot for querying uploaded documents (PDF, JSON, JSONL, and plain-text files).


## 1. Install Required Packages


In [None]:
%pip install goldmansachs.awm_genai -U
%pip install python-dotenv pandas ipywidgets pdfplumber


## 2. Import Libraries and Configuration


In [None]:
from goldmansachs.awm_genai import LLM, LLMConfig
import os
from typing import List, Dict
import pandas as pd
from datetime import datetime
import tempfile
from IPython.display import display, HTML
import ipywidgets as widgets
import pdfplumber
import json
import re


In [None]:
# Deployment target passed to the LLM client
app_id = "trai"
env = "uat"

# Generation and logging settings consumed when the LLM is initialized
temperature = 0
log_level = "DEBUG"

# Models offered in the dropdown
available_models = ["gemini-2.5-pro", "gemini-2.5-flash-lite"]

# Dropdown used to pick the model; the initialization cell reads its value
model_selector = widgets.Dropdown(
    options=available_models,
    value="gemini-2.5-flash-lite",
    description='Model:',
    style={'description_width': 'initial'},
)

# Render the heading, the model guidance, and the selector, in that order
display(
    HTML("<h3>Select Model</h3>"),
    HTML("<p><b>gemini-2.5-pro:</b> More capable, better for complex questions<br><b>gemini-2.5-flash-lite:</b> Faster responses, good for simple queries</p>"),
    model_selector,
)

# Echo the active deployment settings
print(f"\nApp ID: {app_id}")
print(f"Environment: {env}")


## 3. Initialize LLM


In [None]:
# Initialize the LLM client with the model currently chosen in the dropdown.
# The selection is read once, here: changing the dropdown afterwards has no
# effect until this cell is run again.
model_name = model_selector.value

# Bundle the deployment target and generation settings from the configuration cell
llm_config = LLMConfig(
    app_id=app_id,
    env=env,
    model_name=model_name,
    temperature=temperature,
    log_level=log_level,
)

# `llm` is the client object the question-answering cell calls via llm.invoke(...)
llm = LLM.init(config=llm_config)
print(f"[SUCCESS] LLM initialized successfully with {model_name}")


## 4. Upload Documents

Use the file upload widget below to select your documents (PDF, JSON, JSONL, or TXT), then click **Extract Text** to load their content.


In [None]:
# Helper function to extract PDF content with tables and JSON
def extract_pdf_content(file_path: str, filename: str) -> str:
    """Render a PDF as LLM-ready text, page by page.

    Each page contributes a ``[Page N]`` marker, then its text with any
    objects lying fully inside a detected table region filtered out, then
    any JSON objects found in that text (formatted individually), and
    finally each extracted table in formatted form.

    Args:
        file_path: Path of the PDF on disk.
        filename: Display name used in the document banner and in the
            table / JSON section headings.

    Returns:
        The document banner and all page sections joined with newlines.
    """
    rule = '=' * 80
    sections = [f"\n\n{rule}\nDocument: {filename}\n{rule}\n"]

    def _within(obj, box):
        # True when the page object lies entirely inside the bounding box
        return (
            obj.get('x0', 0) >= box[0] and obj.get('x1', 0) <= box[2] and
            obj.get('top', 0) >= box[1] and obj.get('bottom', 0) <= box[3]
        )

    with pdfplumber.open(file_path) as pdf:
        for page_no, page in enumerate(pdf.pages, 1):
            sections.append(f"\n[Page {page_no}]\n")

            page_tables = page.extract_tables()

            # Table regions are only looked up when the page produced tables
            boxes = [found.bbox for found in page.find_tables()] if page_tables else []

            if boxes:
                # Drop objects inside table regions so table cells are not
                # repeated in the running text
                body_text = page.filter(
                    lambda obj: not any(_within(obj, box) for box in boxes)
                ).extract_text()
            else:
                body_text = page.extract_text()

            if body_text and body_text.strip():
                embedded = extract_json_content(body_text)

                if not embedded:
                    # Plain page: keep the text untouched
                    sections.append(f"{body_text}\n")
                else:
                    # Surrounding prose first, then each JSON object
                    prose = remove_json_from_text(body_text)
                    if prose.strip():
                        sections.append(f"{prose}\n")

                    for position, obj in enumerate(embedded, 1):
                        rendered = format_json_object(obj, page_no, position, filename)
                        sections.append(f"\n{rendered}\n")

            # Tables follow the page text
            for position, rows in enumerate(page_tables or [], 1):
                if rows:
                    rendered = format_table(rows, page_no, position, filename)
                    sections.append(f"\n{rendered}\n")

    return "\n".join(sections)

def extract_json_content(text: str) -> list:
    """Extract every JSON object embedded in free text.

    The text is scanned left to right. At each ``{`` a JSON decode is
    attempted from that position; on success the decoded object is
    collected and scanning resumes after it, otherwise scanning moves on
    to the next ``{``. Because a real JSON decoder is used instead of a
    regular expression, objects of any nesting depth, objects spanning
    several lines, and braces inside string values are all handled, and a
    nested object is never reported separately from its parent.

    Args:
        text: Arbitrary text that may contain JSON or JSONL content.

    Returns:
        The decoded objects (dicts) in order of appearance; an empty list
        when the text contains no valid JSON object.
    """
    decoder = json.JSONDecoder()
    json_objects = []

    cursor = text.find('{')
    while cursor != -1:
        try:
            json_obj, end = decoder.raw_decode(text, cursor)
        except (json.JSONDecodeError, RecursionError):
            # Not a JSON object at this brace; try the next candidate
            cursor = text.find('{', cursor + 1)
            continue

        json_objects.append(json_obj)
        # Skip past the decoded span so inner objects are not re-reported
        cursor = text.find('{', end)

    return json_objects

def remove_json_from_text(text: str) -> str:
    """Strip embedded JSON objects from text, keeping everything else.

    Only spans that actually decode as a JSON object are removed. Brace
    expressions that are not JSON (for example ``{placeholder}``) are
    ordinary document text and are preserved, and JSON objects of any
    nesting depth are removed as a whole rather than leaving fragments.

    Args:
        text: Text that may contain JSON objects.

    Returns:
        The text with every decodable JSON object cut out; surrounding
        whitespace is left untouched.
    """
    decoder = json.JSONDecoder()
    kept = []
    keep_from = 0

    brace = text.find('{')
    while brace != -1:
        try:
            _, end = decoder.raw_decode(text, brace)
        except (json.JSONDecodeError, RecursionError):
            # Not JSON: leave this brace in the output and keep scanning
            brace = text.find('{', brace + 1)
            continue

        kept.append(text[keep_from:brace])
        keep_from = end
        brace = text.find('{', end)

    kept.append(text[keep_from:])
    return ''.join(kept)

def format_json_object(json_obj: dict, page_num: int, json_idx: int, filename: str) -> str:
    """Render one decoded JSON value as a labelled text section.

    The section has a heading naming the document, page and index, a
    "Structured Data Fields" list, and the pretty-printed JSON. For a dict
    the field list has one ``key: value`` line per top-level key. Any
    other decoded value (list, string, number, ...) is listed as a single
    ``value: ...`` line instead of raising, so a non-object entry in a
    JSON array or JSONL file does not abort the caller.

    Args:
        json_obj: The decoded JSON value, normally a dict.
        page_num: Page number shown in the heading.
        json_idx: 1-based index shown in the heading and footer.
        filename: Document name shown in the heading.

    Returns:
        The formatted section as a single string.
    """
    def _tidy(value):
        # Collapse runs of whitespace in strings; other types are unchanged
        return ' '.join(value.split()) if isinstance(value, str) else value

    json_parts = [f"--- JSON OBJECT {json_idx} (Document: {filename}, Page {page_num}) ---"]

    # Human-readable field list
    json_parts.append("\nStructured Data Fields:")
    if isinstance(json_obj, dict):
        for key, value in json_obj.items():
            json_parts.append(f"  {key}: {_tidy(value)}")
    else:
        # Non-object JSON value: there are no keys to enumerate
        json_parts.append(f"  value: {_tidy(json_obj)}")

    # Exact JSON representation
    json_parts.append("\nJSON Format:")
    json_parts.append(json.dumps(json_obj, indent=2))

    json_parts.append(f"--- END JSON OBJECT {json_idx} ---\n")

    return "\n".join(json_parts)

def format_table(table: list, page_num: int, table_idx: int, filename: str) -> str:
    """Render an extracted table as labelled text plus a Markdown table.

    Cells are converted to strings with ``None`` becoming an empty string.
    Whitespace inside each cell, including embedded line breaks, is
    collapsed to single spaces so every table row stays on one output
    line. Rows whose cells are all empty are dropped. The first remaining
    row is treated as the header.

    Args:
        table: Rows of cells (a list of lists); cells may be ``None``.
        page_num: Page number shown in the heading.
        table_idx: 1-based table index shown in the heading and footer.
        filename: Document name shown in the heading.

    Returns:
        The formatted table section, or an empty string when the table is
        empty or contains only empty rows.
    """
    if not table:
        return ""

    # Normalize cells and drop rows that carry no content
    cleaned_table = []
    for row in table:
        cleaned_row = ["" if cell is None else " ".join(str(cell).split()) for cell in row]
        if any(cleaned_row):
            cleaned_table.append(cleaned_row)

    if not cleaned_table:
        return ""

    # Assume first row is header
    headers = cleaned_table[0]
    data_rows = cleaned_table[1:]

    table_parts = [f"--- TABLE {table_idx} (Document: {filename}, Page {page_num}) ---"]

    # Pipe-separated view
    table_parts.append("\nColumn Headers:")
    table_parts.append(" | ".join(headers))
    table_parts.append("-" * 80)

    table_parts.append("\nTable Data:")
    table_parts.extend(" | ".join(row) for row in data_rows)

    # Markdown view of the same data for better LLM understanding
    table_parts.append("\nMarkdown Format:")
    table_parts.append("| " + " | ".join(headers) + " |")
    table_parts.append("|" + "|".join("---" for _ in headers) + "|")
    table_parts.extend("| " + " | ".join(row) + " |" for row in data_rows)

    table_parts.append(f"--- END TABLE {table_idx} ---\n")

    return "\n".join(table_parts)

# Helper functions for JSON extraction
def extract_from_json_file(file_path: str, filename: str) -> str:
    """Load a JSON file and render its content as LLM-ready text.

    A top-level list is rendered element by element, a top-level dict as a
    single object, and any other value as a one-line ``JSON Value`` entry.
    Any failure while reading or rendering is reported inside the returned
    text instead of being raised.

    Args:
        file_path: Path of the JSON file on disk (read as UTF-8).
        filename: Display name used in the banner and object headings.

    Returns:
        The document banner followed by the rendered content or an
        ``[ERROR]`` line, joined with newlines.
    """
    rule = '=' * 80
    sections = [f"\n\n{rule}\nDocument: {filename}\n{rule}\n"]

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        if isinstance(payload, dict):
            rendered = format_json_object(payload, 0, 1, filename)
            sections.append(f"\n{rendered}\n")
        elif isinstance(payload, list):
            sections.append("\nThis file contains a list of JSON objects:\n")
            for position, item in enumerate(payload, 1):
                rendered = format_json_object(item, 0, position, filename)
                sections.append(f"\n{rendered}\n")
        else:
            # Bare scalar at the top level
            sections.append(f"\nJSON Value: {payload}\n")
    except Exception as exc:
        sections.append(f"\n[ERROR] Failed to parse JSON: {str(exc)}\n")

    return "\n".join(sections)

def extract_from_jsonl_file(file_path: str, filename: str) -> str:
    """Load a JSONL file and render each line as LLM-ready text.

    Blank lines are skipped. Each remaining line is decoded and rendered
    under its 1-based line number; a line that is not valid JSON is
    reported with its first 100 characters. A failure while reading the
    file is reported inside the returned text instead of being raised.

    Args:
        file_path: Path of the JSONL file on disk (read as UTF-8).
        filename: Display name used in the banner and object headings.

    Returns:
        The document banner, an introductory line, and one entry per
        non-blank line, joined with newlines.
    """
    rule = '=' * 80
    sections = [
        f"\n\n{rule}\nDocument: {filename}\n{rule}\n",
        "\nThis file contains multiple JSON objects (one per line):\n",
    ]

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            for line_no, raw_line in enumerate(handle, 1):
                record_text = raw_line.strip()
                if not record_text:
                    continue

                try:
                    record = json.loads(record_text)
                    rendered = format_json_object(record, 0, line_no, filename)
                    sections.append(f"\n{rendered}\n")
                except json.JSONDecodeError:
                    sections.append(f"\n[Line {line_no}] Invalid JSON: {record_text[:100]}...\n")
    except Exception as exc:
        sections.append(f"\n[ERROR] Failed to parse JSONL: {str(exc)}\n")

    return "\n".join(sections)

# Create file upload widget
upload_widget = widgets.FileUpload(
    accept='.pdf,.json,.jsonl,.txt',
    multiple=True,
    description='Select Files'
)

# Create process button
process_button = widgets.Button(
    description='Extract Text',
    button_style='primary',
    icon='check'
)

# Create output widget for status messages
output = widgets.Output()

# Store extracted text globally
extracted_text = None
document_names = []

def on_process_button_clicked(b):
    """Extract content from every uploaded file and publish it for the chat cells.

    Each upload is written to its own temporary file, extracted according
    to its extension (pdf, json, jsonl, txt), and the temporary file is
    removed before the next upload is handled, whether or not extraction
    succeeded. A file that fails or has an unsupported extension is
    reported and skipped without aborting the rest of the batch.

    The globals ``extracted_text`` and ``document_names`` are replaced
    together, and only when at least one file was extracted, so they never
    describe different batches.

    Args:
        b: The clicked button (unused; required by the ``on_click`` callback
            signature).
    """
    global extracted_text, document_names

    with output:
        output.clear_output()

        if not upload_widget.value:
            print("[WARNING] Please select PDF files first")
            return

        try:
            # NOTE(review): assumes each entry of upload_widget.value is a
            # mapping with 'name' and 'content' keys - confirm against the
            # installed ipywidgets version.
            files = upload_widget.value
            print(f"Processing {len(files)} files...\n")

            all_text = []
            processed_names = []

            for file_info in files:
                filename = file_info['name']
                content = file_info['content']
                file_ext = filename.split('.')[-1].lower()

                if file_ext not in ('pdf', 'json', 'jsonl', 'txt'):
                    print(f"  [SKIPPED] {filename} - unsupported file type: {file_ext}")
                    continue

                tmp_path = None
                try:
                    # Persist the upload so the path-based extractors can read it
                    with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_ext}') as tmp_file:
                        tmp_path = tmp_file.name
                        tmp_file.write(content)

                    if file_ext == 'pdf':
                        doc_content = extract_pdf_content(tmp_path, filename)
                    elif file_ext == 'json':
                        doc_content = extract_from_json_file(tmp_path, filename)
                    elif file_ext == 'jsonl':
                        doc_content = extract_from_jsonl_file(tmp_path, filename)
                    else:
                        # Plain text: wrap the raw content in the document banner
                        with open(tmp_path, 'r', encoding='utf-8') as f:
                            text_content = f.read()
                        doc_content = f"\n\n{'='*80}\nDocument: {filename}\n{'='*80}\n\n{text_content}\n"
                except Exception as file_error:
                    # Keep going: one unreadable file should not discard the others
                    print(f"  [ERROR] {filename} - {file_error}")
                else:
                    all_text.append(doc_content)
                    processed_names.append(filename)
                    print(f"  [OK] {filename} - extracted successfully")
                finally:
                    # Always remove this upload's temporary file
                    if tmp_path is not None and os.path.exists(tmp_path):
                        os.unlink(tmp_path)

            if not all_text:
                print("\n[ERROR] No content could be extracted from the selected files")
                return

            # Publish both values together so they always match
            extracted_text = "\n\n".join(all_text)
            document_names = processed_names

            print(f"\n[SUCCESS] Successfully extracted text from {len(all_text)} documents")
            print(f"Total characters: {len(extracted_text):,}")

        except Exception as e:
            print(f"[ERROR] Error: {str(e)}")
            import traceback
            traceback.print_exc()

# Run the extraction callback whenever the button is clicked
process_button.on_click(on_process_button_clicked)

# Render the upload UI: heading, supported formats, picker, button, status area
display(
    HTML("<h3>Upload and Extract Documents</h3>"),
    HTML("<p>Supported formats: PDF, JSON, JSONL, TXT</p>"),
    upload_widget,
    process_button,
    output,
)

print("Use the widget above to select files (PDF, JSON, JSONL, or TXT) and extract their content")


## 5. Ask Questions


In [None]:
# ask_question is only defined once section 4 has produced document text
if extracted_text is not None:
    def ask_question(question: str) -> str:
        """Answer a question using the extracted document text as context.

        Builds a single prompt containing the full extracted text, the
        question and the answering instructions, sends it through
        ``llm.invoke``, and returns the reply content with escaped
        newlines turned into real line breaks and outer whitespace
        trimmed.

        Args:
            question: The question to ask about the loaded documents.

        Returns:
            The cleaned reply content; an empty or missing content value
            is returned as-is.
        """
        def _unwrap(reply):
            # Pull the content out of whichever reply shape came back
            if hasattr(reply, 'content'):
                return reply.content
            if not isinstance(reply, dict):
                return str(reply)
            if 'Response' in reply and 'content' in reply['Response']:
                return reply['Response']['content']
            if 'content' in reply:
                return reply['content']
            return str(reply)

        full_prompt = f"""You are a helpful assistant that answers questions based on provided documents.

The documents may contain:
- Regular text content
- Tables with headers and data rows
- Structured JSON objects with field-value pairs
- JSONL data (multiple JSON objects)

Document Content:
{extracted_text}

Question: {question}

Instructions:
1. Provide a clear, concise answer based ONLY on the information in the documents
2. Format your response for readability with bullet points and clear paragraphs
3. When referencing data, cite the source (e.g., "from Table 1 on Page 3" or "JSON Object 2")
4. If information comes from JSON fields, mention the field names
5. If the information is not in the documents, clearly state that
6. Do not include raw JSON dumps - summarize the information

Answer:"""

        answer = _unwrap(llm.invoke(full_prompt))
        if not answer:
            return answer

        # Turn literal backslash-n sequences into line breaks, then trim
        return answer.replace('\\n\\n', '\n\n').replace('\\n', '\n').strip()

    print("[SUCCESS] Ready to answer questions!")
    print(f"Documents loaded: {', '.join(document_names)}")
    print("\nUse ask_question('your question') to ask questions.")
else:
    print("[WARNING] Please extract text from PDFs first (see section 4)")


## 6. Example Question


In [None]:
# Run the sample question only when the text and the helper both exist
if extracted_text is not None and 'ask_question' in globals():
    question = "What are the key findings in the documents?"

    print(f"Question: {question}\n")
    print("Generating response...\n")

    response = ask_question(question)

    # Show the answer framed by two 80-character rules
    print("Response:", "-" * 80, response, "-" * 80, sep="\n")
elif extracted_text is None:
    print("[WARNING] Please extract text from PDFs first (see section 4)")
else:
    print("[WARNING] Please run section 5 first")


## 7. Interactive Chat Interface

Chat with your documents interactively.


In [None]:
# Check if everything is ready
if extracted_text is None:
    print("[WARNING] Please extract text from PDFs first (see section 4)")
elif 'ask_question' not in globals():
    # chat_with_documents delegates to ask_question; without this guard the
    # cell would report success and the first call would raise NameError.
    print("[WARNING] Please run section 5 first")
else:
    # Log of question/response records. Re-running this cell starts a new log.
    chat_history = []

    def chat_with_documents(question: str) -> str:
        """Ask a question about the documents and record the exchange.

        The question is answered by ``ask_question``. Earlier exchanges
        are not sent to the model, so every question is answered
        independently. The question, the response and a timestamp are
        appended to ``chat_history``.

        Args:
            question: The question to ask about the loaded documents.

        Returns:
            The response returned by ``ask_question``.
        """
        response = ask_question(question)

        # Store in chat history
        chat_history.append({
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "question": question,
            "response": response
        })

        return response

    print("[SUCCESS] Chat interface ready!")
    print("\nUse chat_with_documents('your question') to ask questions.")
    print("\nExample:")
    print("  chat_with_documents('What are the key findings?')")


## 8. Example: Ask Questions


In [None]:
# Example question; needs the chat helper from the interactive chat cell
if 'chat_with_documents' not in globals():
    print("[WARNING] Please complete the setup first")
else:
    response = chat_with_documents("What are the main topics in the documents?")
    print(response)


## 9. View Chat History


In [None]:
# Show the recorded exchanges as a table, if any exist
if not globals().get('chat_history'):
    print("No chat history yet. Start asking questions!")
else:
    df_history = pd.DataFrame(chat_history)
    display(df_history)
