In [14]:
pip install ipywidgets==7.7.2


Collecting ipywidgets==7.7.2
  Downloading ipywidgets-7.7.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting widgetsnbextension~=3.6.0 (from ipywidgets==7.7.2)
  Downloading widgetsnbextension-3.6.10-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting jupyterlab-widgets<3,>=1.0.0 (from ipywidgets==7.7.2)
  Downloading jupyterlab_widgets-1.1.11-py3-none-any.whl.metadata (3.7 kB)
Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.6.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets==7.7.2)
  Downloading fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.6.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets==7.7.2)
  Downloading isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting uri-template (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.6.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextens

In [17]:
pip install --upgrade transformers



Collecting transformers
  Using cached transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Using cached transformers-4.47.0-py3-none-any.whl (10.1 MB)
Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl (2.4 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.31.0
    Uninstalling transformers-4.31.0:
      Successfully uninstalled transformers-4.31.0
Successfully installed tokenizers-0.21.0 transformers-4.47.0


In [6]:
import os
import json
import logging
from typing import List, Dict, Any

# Lightweight parsing libraries
import fitz  # PyMuPDF for PDF
from bs4 import BeautifulSoup

# Low-memory ML libraries
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class LowMemoryRFPExtractor:
    def __init__(self, model_name="google/flan-t5-small"):
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name, 
                trust_remote_code=True
            )
            
            # Load the model without quantization
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                device_map='auto',  # Use CPU or available GPU
                torch_dtype=torch.float16,  # Use float16 for better efficiency
                low_cpu_mem_usage=True  # Optimize memory usage for CPU
            )
        except Exception as e:
            self.logger.error(f"Model loading error: {e}")
            raise

    
    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF with minimal memory usage"""
        try:
            doc = fitz.open(pdf_path)
            text = " ".join([page.get_text() for page in doc])
            doc.close()
            return text[:4000]  # Limit text to first 4000 chars
        except Exception as e:
            self.logger.error(f"PDF extraction error: {e}")
            return ""
    
    def extract_text_from_html(self, html_path: str) -> str:
        """Extract text from HTML with minimal memory usage"""
        try:
            with open(html_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file, 'html.parser')
                return soup.get_text()[:4000]  # Limit text
        except Exception as e:
            self.logger.error(f"HTML extraction error: {e}")
            return ""
    
    def create_extraction_prompt(self, document_text: str) -> str:
        """Create a concise extraction prompt"""
        return f"""Extract key information from this document:

Document Text:
{document_text[:3000]}  # Further limit input

Extract these fields in JSON:
- Bid_Number
- Title
- Due_Date
- Product
- Company_Name
- Contact_Info
- Key_Specifications

Provide a compact, accurate JSON response."""
    
    def extract_structured_data(self, document_text: str) -> Dict[str, Any]:
        """Extract structured data with memory-efficient generation"""
        try:
            prompt = self.create_extraction_prompt(document_text)
            
            # Tokenize with max length
            inputs = self.tokenizer(
                prompt, 
                return_tensors="pt", 
                max_length=1024, 
                truncation=True
            )
            
            # Generate with controlled parameters
            outputs = self.model.generate(
                **inputs, 
                max_new_tokens=512,
                do_sample=False,  # Greedy decoding
                temperature=0.1
            )
            
            # Decode and extract JSON
            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return self._parse_json_from_text(generated_text)
        
        except Exception as e:
            self.logger.error(f"Extraction error: {e}")
            return {}
    
    def _parse_json_from_text(self, text: str) -> Dict[str, Any]:
        """Parse JSON with robust error handling"""
        import re
        import json
        
        try:
            # Extract JSON-like content
            json_match = re.search(r'\{.*\}', text, re.DOTALL)
            if json_match:
                return json.loads(json_match.group(0))
            return {}
        except Exception as e:
            self.logger.warning(f"JSON parsing error: {e}")
            return {}
    
    def process_documents(self, document_paths: List[str]) -> List[Dict]:
        """Process documents with minimal memory footprint"""
        results = []
        
        for doc_path in document_paths:
            # Determine and extract text
            if doc_path.lower().endswith('.pdf'):
                text = self.extract_text_from_pdf(doc_path)
            elif doc_path.lower().endswith('.html'):
                text = self.extract_text_from_html(doc_path)
            else:
                self.logger.warning(f"Unsupported file: {doc_path}")
                continue
            
            # Extract data
            extracted_data = self.extract_structured_data(text)
            
            results.append({
                'source': doc_path,
                'extracted_data': extracted_data
            })
        
        return results

def main():
    """Main execution for low-memory RFP extraction"""
    # Specify document paths
    document_paths = [
        'Dell_Laptop_Specs.pdf',  # Your sample document
        # Add more document paths
    ]
    
    # Create output directory
    os.makedirs('extracted_data', exist_ok=True)
    
    try:
        # Initialize low-memory extractor
        extractor = LowMemoryRFPExtractor()
        
        # Process documents
        results = extractor.process_documents(document_paths)
        
        # Save results
        for idx, result in enumerate(results, 1):
            output_path = f'extracted_data/extraction_{idx}.json'
            with open(output_path, 'w') as f:
                json.dump(result, f, indent=2)
            print(f"Saved extraction to {output_path}")
    
    except Exception as e:
        print(f"Extraction failed: {e}")

if __name__ == "__main__":
    main()


Saved extraction to extracted_data/extraction_1.json


In [21]:
pip install bitsandbytes


Note: you may need to restart the kernel to use updated packages.


In [28]:
pip install tf-keras

Collecting tf-kerasNote: you may need to restart the kernel to use updated packages.

  Using cached tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Using cached tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
Installing collected packages: tf-keras
Successfully installed tf-keras-2.18.0


In [30]:
pip install tensorflow==2.11


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement tensorflow==2.11 (from versions: 2.12.0rc0, 2.12.0rc1, 2.12.0, 2.12.1, 2.13.0rc0, 2.13.0rc1, 2.13.0rc2, 2.13.0, 2.13.1, 2.14.0rc0, 2.14.0rc1, 2.14.0, 2.14.1, 2.15.0rc0, 2.15.0rc1, 2.15.0, 2.15.1, 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0)
ERROR: No matching distribution found for tensorflow==2.11


In [31]:
pip install tensorflow==2.12.0


Collecting tensorflow==2.12.0
  Downloading tensorflow-2.12.0-cp311-cp311-win_amd64.whl.metadata (2.5 kB)
Collecting tensorflow-intel==2.12.0 (from tensorflow==2.12.0)
  Downloading tensorflow_intel-2.12.0-cp311-cp311-win_amd64.whl.metadata (4.1 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting jax>=0.3.15 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloading jax-0.4.37-py3-none-any.whl.metadata (22 kB)
Collecting numpy<1.24,>=1.22 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloading numpy-1.23.5-cp311-cp311-win_amd64.whl.metadata (2.3 kB)
Collecting tensorboard<2.13,>=2.12 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloading tensorboard-2.12.3-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-estimator<2.13,>=2.12.0 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloading tensorflow_estimator-2.12.0-py2.py3-none-any.whl.metad

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
tf-keras 2.18.0 requires tensorflow<2.19,>=2.18, but you have tensorflow 2.12.0 which is incompatible.


In [32]:
pip install keras==2.11


Collecting keras==2.11
  Using cached keras-2.11.0-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.12.0
    Uninstalling keras-2.12.0:
      Successfully uninstalled keras-2.12.0
Successfully installed keras-2.11.0
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.11.0 which is incompatible.


In [33]:
pip install keras==2.12.0


Collecting keras==2.12.0
  Using cached keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached keras-2.12.0-py2.py3-none-any.whl (1.7 MB)
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.11.0
    Uninstalling keras-2.11.0:
      Successfully uninstalled keras-2.11.0
Successfully installed keras-2.12.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import json
from bs4 import BeautifulSoup
import pdfplumber
from transformers import pipeline

# Define structured data fields
FIELDS = [
    "Bid Number", "Title", "Due Date", "Bid Submission Type", "Term of Bid",
    "Pre Bid Meeting", "Installation", "Bid Bond Requirement", "Delivery Date",
    "Payment Terms", "Any Additional Documentation Required", "MFG for Registration",
    "Contract or Cooperative to use", "Model_no", "Part_no", "Product", 
    "contact_info", "company_name", "Bid Summary", "Product Specification", "Value"
]

# Initialize a Language Model pipeline (e.g., OpenAI or Hugging Face models)
nlp = pipeline("question-answering")

# Function to extract text from HTML files
def extract_text_from_html(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    return soup.get_text()

# Function to extract text from PDF files
def extract_text_from_pdf(file_path):
    extracted_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            extracted_text += page.extract_text()
    return extracted_text

# Function to structure extracted information
def structure_information(text):
    structured_data = {}
    for field in FIELDS:
        try:
            # Use NLP to extract relevant information for each field
            response = nlp({
                "context": text,
                "question": f"What is the {field}?"
            })
            structured_data[field] = response.get("answer", "").strip()
        except Exception as e:
            structured_data[field] = f"Error extracting {field}: {str(e)}"
    return structured_data

# Main function to process all documents and save outputs
def process_documents(folder_path, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Parse based on file type
        if file_name.endswith(".html"):
            text = extract_text_from_html(file_path)
        elif file_name.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        else:
            print(f"Unsupported file type: {file_name}")
            continue

        # Structure information
        structured_data = structure_information(text)

        # Save the output as a JSON file
        output_file = os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}.json")
        with open(output_file, "w", encoding="utf-8") as json_file:
            json.dump(structured_data, json_file, indent=4, ensure_ascii=False)
        print(f"Processed {file_name}: Output saved to {output_file}")

# Entry point
if __name__ == "__main__":
    input_folder = "C:/Users/deept/OneDrive/Desktop/Campus hiring-2024-2025 assignment/Bid1" # Replace with your folder path
    output_folder = "./output_jsons"  # Folder to save output JSON files

    # Ensure input folder exists
    if not os.path.exists(input_folder):
        print(f"Input folder not found: {input_folder}")
    else:
        process_documents(input_folder, output_folder)
        print(f"Processing complete. All outputs saved in {output_folder}")

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Processed Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf: Output saved to ./output_jsons\Addendum 1 RFP JA-207652 Student and Staff Computing Devices.json
Processed Addendum 2 RFP JA-207652 Student and Staff Computing Devices.pdf: Output saved to ./output_jsons\Addendum 2 RFP JA-207652 Student and Staff Computing Devices.json
Processed JA-207652 Student and Staff Computing Devices FINAL.pdf: Output saved to ./output_jsons\JA-207652 Student and Staff Computing Devices FINAL.json
Processed Student and Staff Computing Devices __SOURCING #168884__ - Bid Information - {3} _ BidNet Direct.html: Output saved to ./output_jsons\Student and Staff Computing Devices __SOURCING #168884__ - Bid Information - {3} _ BidNet Direct.json
Processing complete. All outputs saved in ./output_jsons


In [4]:
import os
import json
from bs4 import BeautifulSoup
import pdfplumber
from transformers import pipeline

# Define structured data fields
FIELDS = [
    "Bid Number", "Title", "Due Date", "Bid Submission Type", "Term of Bid",
    "Pre Bid Meeting", "Installation", "Bid Bond Requirement", "Delivery Date",
    "Payment Terms", "Any Additional Documentation Required", "MFG for Registration",
    "Contract or Cooperative to use", "Model_no", "Part_no", "Product", 
    "contact_info", "company_name", "Bid Summary", "Product Specification", "Value"
]

# Initialize a Language Model pipeline (e.g., OpenAI or Hugging Face models)
nlp = pipeline("question-answering")

# Function to extract text from HTML files
def extract_text_from_html(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    return soup.get_text()

# Function to extract text from PDF files
def extract_text_from_pdf(file_path):
    extracted_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            extracted_text += page.extract_text()
    return extracted_text

# Function to structure extracted information
def structure_information(text):
    structured_data = {}
    for field in FIELDS:
        try:
            # Use NLP to extract relevant information for each field
            response = nlp({
                "context": text,
                "question": f"What is the {field}?"
            })
            structured_data[field] = response.get("answer", "").strip()
        except Exception as e:
            structured_data[field] = f"Error extracting {field}: {str(e)}"
    return structured_data

# Main function to process all documents and save outputs
def process_documents(folder_path, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Parse based on file type
        if file_name.endswith(".html"):
            text = extract_text_from_html(file_path)
        elif file_name.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        else:
            print(f"Unsupported file type: {file_name}")
            continue

        # Structure information
        structured_data = structure_information(text)

        # Save the output as a JSON file
        output_file = os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}.json")
        with open(output_file, "w", encoding="utf-8") as json_file:
            json.dump(structured_data, json_file, indent=4, ensure_ascii=False)
        print(f"Processed {file_name}: Output saved to {output_file}")

# Entry point
if __name__ == "__main__":
    input_folder = "C:/Users/deept/OneDrive/Desktop/Campus hiring-2024-2025 assignment/Bid2" # Replace with your folder path
    output_folder = "./output_jsons"  # Folder to save output JSON files

    # Ensure input folder exists
    if not os.path.exists(input_folder):
        print(f"Input folder not found: {input_folder}")
    else:
        process_documents(input_folder, output_folder)
        print(f"Processing complete. All outputs saved in {output_folder}")

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Processed Contract_Affidavit.pdf: Output saved to ./output_jsons\Contract_Affidavit.json
Processed Dell Laptops w_Extended Warranty - Bid Information - {3} _ BidNet Direct.html: Output saved to ./output_jsons\Dell Laptops w_Extended Warranty - Bid Information - {3} _ BidNet Direct.json
Processed Dell_Laptop_Specs.pdf: Output saved to ./output_jsons\Dell_Laptop_Specs.json
Processed Mercury_Affidavit.pdf: Output saved to ./output_jsons\Mercury_Affidavit.json
Processed PORFP_-_Dell_Laptop_Final.pdf: Output saved to ./output_jsons\PORFP_-_Dell_Laptop_Final.json
Processing complete. All outputs saved in ./output_jsons
