# Data Extraction 

### imports

In [33]:
import pdfplumber
import os
from bs4 import BeautifulSoup
from pathlib import Path
import google.generativeai as genai
import json

### functions to extract text from PDFs and HTML

In [42]:
# pdf
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of each page, each followed by a newline, concatenated in
        page order. If the file cannot be opened or parsed, the error is
        printed and an empty string is returned instead of raising.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # `or ""` keeps a page whose extraction yields None (e.g. a page
            # without a text layer) from raising TypeError on concatenation,
            # which would otherwise discard the text of the whole document.
            page_texts = [
                page.extract_text(x_tolerance=2, y_tolerance=2) or ""
                for page in pdf.pages
            ]
    except Exception as e:
        print(e)
        return ""
    # Join once instead of growing a string page by page.
    return "".join(page_text + "\n" for page_text in page_texts)
# html
def extract_text_from_html(html_path):
    """Return the visible text of an HTML file, one text node per line.

    The file is read as UTF-8 and parsed with the 'lxml' parser. Any error
    while opening or parsing is printed and an empty string is returned.
    """
    try:
        with open(html_path, 'r', encoding='utf-8') as html_file:
            document = BeautifulSoup(html_file, 'lxml')
            return document.get_text(separator='\n', strip=True)
    except Exception as err:
        print(err)
        return ""

### function to iterate through a folder and grab text from files

In [43]:
def get_all_text_from_folder(folder_path):
    """Concatenate the text of every PDF and HTML file directly inside a folder.

    Each document's text is wrapped in 'START OF DOCUMENT: <name>' and
    'END OF DOCUMENT: <name>' banner lines. All PDFs come first, followed by
    all HTML files; within each group files are ordered by path so the result
    is the same on every run and every filesystem.

    Args:
        folder_path: Directory to scan (str or Path). Only files matching
            '*.pdf' and '*.html' at the top level are considered.

    Returns:
        The combined text, or an empty string (after printing a message) when
        the folder does not exist or contains no matching files.
    """
    folder_path = Path(folder_path)

    if not folder_path.is_dir():
        print("folder not found")
        return ""

    # glob() yields entries in an arbitrary, filesystem-dependent order;
    # sorting keeps the combined text reproducible.
    pdf_files = sorted(folder_path.glob('*.pdf'))
    html_files = sorted(folder_path.glob('*.html'))

    if not pdf_files and not html_files:
        print("no PDF or HTML files found")
        return ""

    banner = '=' * 15
    pieces = []
    extraction_plan = (
        (pdf_files, extract_text_from_pdf),
        (html_files, extract_text_from_html),
    )
    for files, extract in extraction_plan:
        for document in files:
            pieces.append(f"\n\n{banner} START OF DOCUMENT: {document.name} {banner}\n\n")
            pieces.append(extract(document))
            pieces.append(f"\n\n{banner} END OF DOCUMENT: {document.name} {banner}\n\n")

    return "".join(pieces)

### configuring LLM api and creating a field list

In [29]:
# Configure the Gemini client. The key is taken from the GOOGLE_API_KEY
# environment variable when it is set, so the real secret never has to be
# written into this file; the original placeholder remains the fallback.
try:
    api_key = os.environ.get('GOOGLE_API_KEY', 'your-api-key')
    genai.configure(api_key=api_key)
    print('Gemini Key configured')
except Exception as e:
    print(e)

# Field names the LLM is asked to return as keys of the JSON object.
fields_to_extract = [
    "Bid Number", "Title", "Due Date", "Bid Submission Type", "Term of Bid",
    "Pre Bid Meeting", "Installation", "Bid Bond Requirement", "Delivery Date",
    "Payment Terms", "Any Additional Documentation Required", "MFG for Registration",
    "Contract or Cooperative to use", "Model_no", "Part_no", "Product",
    "contact_info", "company_name", "Bid Summary", "Product Specification"
]

# Bulleted, newline-separated form of the field names for embedding in the prompt.
field_list_str = "\n".join(f"- {field}" for field in fields_to_extract)

Gemini Key configured


### function to build the prompt and grab the response

In [37]:
def get_json_from_llm(context_text, field_list):
    """Ask Gemini to extract the requested fields from the combined document text.

    Args:
        context_text: Combined text of all documents, with each document
            delimited by 'START OF DOCUMENT' / 'END OF DOCUMENT' markers.
        field_list: Newline-separated, bulleted list of field names to extract.

    Returns:
        The model's response text with surrounding whitespace and any
        ```json ... ``` fence stripped. The text is NOT parsed or validated
        here. Returns None if the request fails for any reason; the error is
        printed rather than raised.
    """
    prompt = f"""
    Analyze the following combined text from multiple procurement-related documents (like RFPs, amendments, Q&As, spec sheets, web pages). Your task is to extract specific information into a strict JSON format, synthesizing details accurately and handling potential inconsistencies.

    **Document Context:** The text includes content from different files, marked by 'START OF DOCUMENT: [Filename]' and 'END OF DOCUMENT: [Filename]'. Filenames or content might indicate updates (e.g., 'Addendum', 'Amendment', 'Clarification', 'Q&A', dates).

    **CRITICAL RULES FOR ACCURATE & ROBUST EXTRACTION:**
    1.  **Identify Updates:** Carefully examine all documents. Identify any that serve as updates, corrections, or clarifications to earlier documents. Look for explicit keywords ('Addendum', 'Amendment', 'Update', 'Correction', 'Response to Questions') or infer based on context and dates.
    2.  **Prioritize Latest Information:** If conflicting information exists for *any* field (e.g., different due dates, modified specs, updated contact info), you **MUST** use the information from the document identified as the most recent or superseding update as the final, correct value. Discard the older, conflicting information.
    3.  **Synthesize Comprehensively:** Combine all relevant, non-conflicting details from *all* provided documents to create a complete answer for each field. For example:
        * `Installation`: Gather requirements like 'white glove service' from one document and specific tasks like 'etching' or 'Autopilot setup' from another.
        * `Product Specification`: Consolidate base requirements from the main RFP with detailed specs from separate sheets and any modifications mentioned in updates.
    4.  **Strict JSON Output:** Return **ONLY** a single, valid JSON object. Adhere *exactly* to the requested field names. Do not include ```json markdown, introductory/closing text, apologies, or any explanations outside the JSON structure. Use double quotes for all keys and string values.
    5.  **Handle Missing Data:** If, after reviewing *all* documents, definitive information for a field cannot be found, use the JSON value `null`. **Do not guess, infer, or fabricate information.** If a field seems partially mentioned but isn't clear, prefer `null`.
    6.  **Field-Specific Guidance:**
        * `Bid Number`: Find the primary identifier (e.g., RFP No., PORFP #, Solicitation ID). Look for patterns like `JA-XXXXXX` or `E20PXXXXXXX`.
        * `Due Date`: Extract the *final, effective* deadline considering all updates. Include time and timezone if specified.
        * `Model_no` / `Part_no`: Extract only if the documents explicitly state a *required* Model or Part Number/SKU as the *subject* of the procurement (e.g., "This RFP is for the purchase of Dell Latitude 5550"). If the documents only list *minimum specs* or examples, use `null`.
        * `Product`: Briefly list the main items/services requested. Mention specific required models if identified per the `Model_no` rule.
        * `Product Specification`: Summarize key requirements and specs. Structure clearly, perhaps using nested objects for different product types/tiers. Include details from dedicated spec sheets if present.
        * `Any Additional Documentation Required`: List *specific* forms, affidavits, certifications, or attachments vendors must submit.

    **FIELDS TO EXTRACT:**
    {field_list}

    **COMBINED DOCUMENT TEXT:**
    ```text
    {context_text}
    ```

    **JSON OUTPUT (Strictly JSON, starting with {{ and ending with }}):**
    """

    try:
        model = genai.GenerativeModel('models/gemini-2.5-pro')
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                response_mime_type="application/json"
            ),
        )
    except Exception as e:
        print(e)
        return None

    try:
        # Strip a markdown code fence in case the model wrapped its answer in
        # one despite the prompt's instructions.
        return response.text.strip().removeprefix('```json').removesuffix('```').strip()
    except Exception as e:
        # Reading or cleaning the response text failed; report it the same way.
        print(e)
        return None

### creating json for Bid1 folder

In [38]:
# Extract the fields for the Bid1 documents and write them to a JSON file.
folder_to_process = './data/Bid1'
output_filename = 'Bid1_output.json'

all_text = get_all_text_from_folder(folder_to_process)

extracted_json_string = None
if all_text:
    extracted_json_string = get_json_from_llm(all_text, field_list_str)
else:
    # get_all_text_from_folder has already printed why nothing was gathered;
    # skip the API call rather than asking the model about empty input.
    print("No document text to process")

if extracted_json_string is not None:
    try:
        json_data = json.loads(extracted_json_string)
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)
        # Report success only after the file has been written and closed.
        print("File Saved")
    except Exception as e:
        print(e)
elif all_text:
    # get_json_from_llm returned None (its error was already printed).
    print("No response from the LLM; nothing saved")

File Saved


### creating json for Bid2 folder

In [41]:
# Extract the fields for the Bid2 documents and write them to a JSON file.
folder_to_process = './data/Bid2'
output_filename = 'Bid2_output.json'

all_text = get_all_text_from_folder(folder_to_process)

extracted_json_string = None
if all_text:
    extracted_json_string = get_json_from_llm(all_text, field_list_str)
else:
    # get_all_text_from_folder has already printed why nothing was gathered;
    # skip the API call rather than asking the model about empty input.
    print("No document text to process")

if extracted_json_string is not None:
    try:
        json_data = json.loads(extracted_json_string)
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)
        # Report success only after the file has been written and closed.
        print("File Saved")
    except Exception as e:
        print(e)
elif all_text:
    # get_json_from_llm returned None (its error was already printed).
    print("No response from the LLM; nothing saved")

File Saved
