In [1]:
# Install necessary libraries
!pip install transformers langchain pypdf beautifulsoup4 tqdm
!pip install -U langchain-community
!pip install pypdf2



Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0
Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.8 (from langchain-community)
  Downloading langchain-0.3.8-py3-none-any.whl.metadata (7.1 kB)
Colle

In [2]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import os

In [3]:
HUGGING_FACE_API_KEY = "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"

# Model and tokenizer loading with GPU support
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=HUGGING_FACE_API_KEY)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Automatically maps model layers to GPU
    torch_dtype="auto",  # Automatically selects appropriate precision
    use_auth_token=HUGGING_FACE_API_KEY
)




tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [4]:
# Function to read and extract text from a PDF
def extract_pdf_text(pdf_path):
    text = ""
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            text += page.extract_text()
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
    return text

# Function to read and extract text from an HTML file
def extract_html_text(html_path):
    text = ""
    try:
        with open(html_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
            text = soup.get_text()
    except Exception as e:
        print(f"Error reading HTML {html_path}: {e}")
    return text

# Function to chunk text into 4096 tokens or less
def chunk_text(text, max_tokens=4096):
    tokens = tokenizer.encode(text)
    chunks = [tokens[i:i + max_tokens] for i in range(0, len(tokens), max_tokens)]
    text_chunks = [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]
    return text_chunks

# Function to extract information using the Llama model
def extract_information(text_chunks, fields):
    combined_response = ""
    for chunk in text_chunks:
        prompt = f"Extract the following fields from the document:\n{fields}\n\nText:\n{chunk}\n\nStructured Information:"
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=500)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        combined_response += response + "\n"
    return combined_response

# Function to extract specific fields from the model's response
def extract_fields_from_response(response):
    structured_data = {
        "Bid Number": None,
        "Title": None,
        "Due Date": None,
        "Bid Submission Type": None,
        "Term of Bid": None,
        "Pre Bid Meeting": None,
        "Installation": None,
        "Bid Bond Requirement": None,
        "Delivery Date": None,
        "Payment Terms": None,
        "Any Additional Documentation Required": None,
        "MFG for Registration": None,
        "Contract or Cooperative to use": None,
        "Model_no": None,
        "Part_no": None,
        "Product": None,
        "contact_info": None,
        "company_name": None,
        "Bid Summary": None,
        "Product Specification": None
    }
    for field in structured_data.keys():
        search_term = f"{field}:"
        start_index = response.find(search_term)
        if start_index != -1:
            start_index += len(search_term)
            end_index = response.find("\n", start_index)
            value = response[start_index:end_index].strip()
            structured_data[field] = value
    return structured_data

# Function to structure and save the extracted information to a JSON file
def save_to_json(extracted_data, output_file):
    try:
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(extracted_data, json_file, indent=4)
        print(f"Structured data saved to {output_file}")
    except Exception as e:
        print(f"Error saving to JSON: {e}")

# Function to process and extract structured information from documents
def process_document(doc_path, document_type, all_data):
    if document_type == 'pdf':
        text = extract_pdf_text(doc_path)
    elif document_type == 'html':
        text = extract_html_text(doc_path)
    else:
        print("Unsupported document type")
        return None
    text_chunks = chunk_text(text, max_tokens=4096)
    fields = """
    Bid Number
    Title
    Due Date
    Bid Submission Type
    Term of Bid
    Pre Bid Meeting
    Installation
    Bid Bond Requirement
    Delivery Date
    Payment Terms
    Any Additional Documentation Required
    MFG for Registration
    Contract or Cooperative to use
    Model_no
    Part_no
    Product
    contact_info
    company_name
    Bid Summary
    Product Specification
    """
    extracted_info = extract_information(text_chunks, fields)
    structured_data = extract_fields_from_response(extracted_info)
    all_data.append({
        "document": os.path.basename(doc_path),
        "data": structured_data
    })

# Function to process all documents in a folder and combine into one JSON
def process_folder(folder_path, output_file):
    all_data = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith(".pdf"):
            print(f"Processing PDF file: {filename}")
            process_document(file_path, 'pdf', all_data)
        elif filename.endswith(".html"):
            print(f"Processing HTML file: {filename}")
            process_document(file_path, 'html', all_data)
    save_to_json(all_data, output_file)

# Main function to run the extraction for all files in a folder
def main():
    folder_path = "/content/emplay"  # Replace with the actual folder path
    output_file = "output.json"  # File to save the combined structured data
    process_folder(folder_path, output_file)

if __name__ == "__main__":
    main()


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing HTML file: Student and Staff Computing Devices __SOURCING #168884__ - Bid Information - {3} _ BidNet Direct.html


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing PDF file: PORFP_-_Dell_Laptop_Final.pdf
Processing PDF file: Dell_Laptop_Specs.pdf


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing HTML file: Dell Laptops w_Extended Warranty - Bid Information - {3} _ BidNet Direct.html


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing PDF file: Contract_Affidavit.pdf


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing PDF file: Mercury_Affidavit.pdf


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing PDF file: Addendum 2 RFP JA-207652 Student and Staff Computing Devices.pdf
Processing PDF file: Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Processing PDF file: JA-207652 Student and Staff Computing Devices FINAL.pdf


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Structured data saved to output.json
