In [17]:
from dotenv import load_dotenv
import os
import json
from PyPDF2 import PdfReader
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load environment variables
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Initialize the ChatOpenAI model
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.5,
    openai_api_key=OPENAI_API_KEY
)


RESPONSE_JSON = [
  {
      "bid_number": "",
      "title": "",
      "due_date": "",
      "bid_submission_type": "",
      "term_of_bid": "",
      "pre_bid_meeting": "",
      "installation": "",
      "bid_bond_requirement": "",
      "delivery_date": "",
      "payment_terms": "",
      "additional_documentation": "",
      "mfg_registration": "",
      "contract_cooperative": "",
      "model_no": "",
      "part_no": "",
      "product": "",
      "contact_info": "",
      "company_name": "",
      "bid_summary": "",
      "product_specification": ""
  }
]


TEMPLATE1 = """
Text: {text}

You are an expert answer extractor. Analyze the text provided and extract information for the following fields based on the instructions given:

- bid_number: Extract if present. It may appear as 'Bid Number:', 'RFP #:', or 'Reference Number:'.
- title: The document's title or heading. It typically appears at the top of the document or as a heading like 'Request for Proposal', 'Addendum', or 'Contract Affidavit'.
- due_date: Look for dates mentioned as deadlines. It might be labeled as 'Due Date', 'Submission Deadline', or 'Proposal Submission Date'. Dates may appear in formats like 'July 9, 2024', '09/07/2024', or '2:00 PM CST'.
- bid_submission_type: Specify if it's online or offline. Look for phrases like 'Submit your proposal to', 'Submission Method', or 'Bid Submission Type'.
- term_of_bid: Duration or term of the bid, labeled as 'Contract Term', 'Bid Term', or 'Agreement Duration'. It may be expressed in months or years.
- pre_bid_meeting: Date and details of any pre-bid meeting, labeled as 'Pre-Bid Conference' or 'Pre-Proposal Meeting'.
- installation: Specify if installation is required. Look for phrases like 'Installation Required', 'On-Site Setup', or 'Product Delivery and Installation'.
- bid_bond_requirement: Mention if a bid bond is required. It may appear as 'Bid Security', 'Performance Bond', or 'Surety Bond'.
- delivery_date: Date of expected delivery, labeled as 'Delivery Date', 'Expected Delivery', or 'Product Arrival Date'.
- payment_terms: Mention the payment terms specified. Look for phrases like 'Payment Terms:', 'Net 30 Days', or 'Payment Due'.
- additional_documentation: Mention if additional documentation is required. Look for phrases like 'Supporting Documents', 'Attachments', or 'Required Certificates'.
- mfg_registration: Manufacturer registration details, labeled as 'MFG Registration', 'Vendor Registration', or 'Supplier Registration'.
- contract_cooperative: Details of any contract cooperative mentioned. Look for phrases like 'Cooperative Agreement', 'Joint Contract', or 'Partnered Bid'.
- model_no: Mention model numbers if specified. Look for 'Model No.', 'Part No.', or 'SKU'.
- part_no: Mention part numbers if specified. Similar to model numbers.
- product: Describe the product or service in the RFP. It may be labeled as 'Product:', 'Service:', or 'Item Description:'.
- contact_info: Extract contact information, including phone numbers, email addresses, or physical addresses. Look for sections labeled 'Contact Information', 'Company Address', or 'Email'.
- company_name: Mention the company name mentioned in the RFP. It may be labeled as 'Company Name:', 'Vendor:', or 'Business Entity:'.
- bid_summary: Provide a brief summary of the bid's purpose or objectives. Look for introductory sections or summaries.
- product_specification: Mention product specifications if listed, including details like dimensions, features, or compatibility requirements.

Populate the `response_json` accurately, ensuring all extracted information is contextually correct. Return the `response_json` in the following format:
Output should be JSON in {response_json} format. Enclose every key and value in double quotes.

"""

TEMPLATE2 = """
Previous_json: {previous_text}

Current_Text: {text}

You are an expert contextual answer extractor. Use both the `Previous_json` and `Current_Text` to extract and consolidate information for the following fields based on the instructions given:

- bid_number: Extract if present. It may appear as 'Bid Number:', 'RFP #:', or 'Reference Number:'.
- title: The document's title or heading. It typically appears at the top of the document or as a heading like 'Request for Proposal', 'Addendum', or 'Contract Affidavit'.
- due_date: Look for dates mentioned as deadlines. It might be labeled as 'Due Date', 'Submission Deadline', or 'Proposal Submission Date'. Dates may appear in formats like 'July 9, 2024', '09/07/2024', or '2:00 PM CST'.
- bid_submission_type: Specify if it's online or offline. Look for phrases like 'Submit your proposal to', 'Submission Method', or 'Bid Submission Type'.
- term_of_bid: Duration or term of the bid, labeled as 'Contract Term', 'Bid Term', or 'Agreement Duration'. It may be expressed in months or years.
- pre_bid_meeting: Date and details of any pre-bid meeting, labeled as 'Pre-Bid Conference' or 'Pre-Proposal Meeting'.
- installation: Specify if installation is required. Look for phrases like 'Installation Required', 'On-Site Setup', or 'Product Delivery and Installation'.
- bid_bond_requirement: Mention if a bid bond is required. It may appear as 'Bid Security', 'Performance Bond', or 'Surety Bond'.
- delivery_date: Date of expected delivery, labeled as 'Delivery Date', 'Expected Delivery', or 'Product Arrival Date'.
- payment_terms: Mention the payment terms specified. Look for phrases like 'Payment Terms:', 'Net 30 Days', or 'Payment Due'.
- additional_documentation: Mention if additional documentation is required. Look for phrases like 'Supporting Documents', 'Attachments', or 'Required Certificates'.
- mfg_registration: Manufacturer registration details, labeled as 'MFG Registration', 'Vendor Registration', or 'Supplier Registration'.
- contract_cooperative: Details of any contract cooperative mentioned. Look for phrases like 'Cooperative Agreement', 'Joint Contract', or 'Partnered Bid'.
- model_no: Mention model numbers if specified. Look for 'Model No.', 'Part No.', or 'SKU'.
- part_no: Mention part numbers if specified. Similar to model numbers.
- product: Describe the product or service in the RFP. It may be labeled as 'Product:', 'Service:', or 'Item Description:'.
- contact_info: Extract contact information, including phone numbers, email addresses, or physical addresses. Look for sections labeled 'Contact Information', 'Company Address', or 'Email'.
- company_name: Mention the company name mentioned in the RFP. It may be labeled as 'Company Name:', 'Vendor:', or 'Business Entity:'.
- bid_summary: Provide a brief summary of the bid's purpose or objectives. Look for introductory sections or summaries.
- product_specification: Mention product specifications if listed, including details like dimensions, features, or compatibility requirements.

Analyze and update the `response_json` by merging relevant details from both inputs. Avoid duplication and maintain consistency. Return the updated `response_json` in the following format:
Output should be JSON in {response_json} format. Enclose every key and value in double quotes.
"""


# Define text extraction functions
def get_pdf_text(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

def get_html_text(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=25000, chunk_overlap=1000)
    return text_splitter.split_text(text)

# Define the LLM chains
prompt1 = PromptTemplate(input_variables=["text", "response_json"], template=TEMPLATE1)
prompt2 = PromptTemplate(input_variables=["text", "previous_text", "response_json"], template=TEMPLATE2)
chain1 = LLMChain(llm=llm, prompt=prompt1, output_key="previous_text", verbose=False)
chain2 = LLMChain(llm=llm, prompt=prompt2, output_key="response_json", verbose=False)

# Process files from both folders
folders = ["Bid1", "Bid2"]
output_folder = "ProcessedBids"
os.makedirs(output_folder, exist_ok=True)

for folder in folders:
    for file_name in os.listdir(folder):
        file_path = os.path.join(folder, file_name)
        if file_name.endswith(".pdf"):
            text = get_pdf_text(file_path)
        elif file_name.endswith(".html"):
            text = get_html_text(file_path)
        else:
            continue
        
        text_chunks = get_text_chunks(text)
        
        # Run the first chain
        res = chain1.run({
            "text": text_chunks[0],
            "response_json": RESPONSE_JSON
        })
        updated_response_json = res
        
        # Run the second chain for subsequent chunks
        for chunk in text_chunks[1:]:
            response = chain2.run({
                "text": chunk,
                "previous_text": updated_response_json,
                "response_json": RESPONSE_JSON
            })
            updated_response_json = response

        # Process the response JSON
        if updated_response_json.startswith("```json"):
            updated_response_json = updated_response_json.split('```json\n')[1][:-4]

        # Save to a JSON file
        response_json = json.loads(updated_response_json)
        output_file_path = os.path.join(output_folder, f"{file_name}.json")
        with open(output_file_path, "w", encoding="utf-8") as file:
            json.dump(response_json, file, indent=2)

        print(f"Processed and saved: {output_file_path}")


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Processed and saved: ProcessedBids\Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf.json


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


JSONDecodeError: Extra data: line 25 column 1 (char 966)