In [1]:
!pip install ezdxf pdfplumber pytesseract groq pandas pdf2image


Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip uninstall pymupdf fitz -y




In [3]:
!pip install pymupdf


Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Using cached pymupdf-1.25.5-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Using cached pymupdf-1.25.5-cp39-abi3-win_amd64.whl (16.6 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [4]:
import os
import re
import time
import csv
import pandas as pd
import fitz                   # PyMuPDF for text & vector extraction
import pdfplumber             # For table extraction
import pytesseract            # For OCR extraction from images
from pdf2image import convert_from_path
import ezdxf                 # For DXF generation
from groq import Groq        # For Groq query language

In [5]:
# Location of the Tesseract OCR executable that pytesseract shells out to.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# The API key is read from the environment so no secret is stored in the notebook.
# Set GROQ_API_KEY before starting Jupyter.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it in the environment before running this notebook."
    )
client = Groq(api_key=GROQ_API_KEY)

PDF_FILE = r"D:\B23-0075_250105 (1) (1).pdf"  # Change this to your PDF file
EXTRACTED_FOLDER = "extracted_data"  # Output directory for every file this notebook writes

# Create the output directory up front; exist_ok keeps re-runs from failing.
os.makedirs(EXTRACTED_FOLDER, exist_ok=True)

In [None]:

# ✅ Extract Text from PDF
def extract_text_from_pdf(pdf_path):
    """Collect the plain text of every page of a PDF via PyMuPDF.

    For each page that yields non-empty text, every occurrence of one of the
    keywords (contractor, permit, code compliance, construction notes,
    schedule; case-insensitive) is removed together with the remainder of the
    line it appears on, and the result is stripped of surrounding whitespace.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The cleaned page texts joined with newlines. Pages for which
        ``get_text`` returns an empty string contribute nothing.
    """
    # `.` does not match newlines here, so `.*` stops at the end of the line.
    keyword_to_eol = re.compile(
        r'(?i)(contractor|permit|code compliance|construction notes|schedule).*'
    )
    cleaned_pages = []
    with fitz.open(pdf_path) as document:
        for page in document:
            raw_text = page.get_text("text")
            if not raw_text:
                continue
            cleaned_pages.append(keyword_to_eol.sub('', raw_text).strip())
    return "\n".join(cleaned_pages)

# ✅ Extract OCR Text from Scanned PDFs
def extract_ocr_from_pdf(pdf_path):
    """Run Tesseract OCR over every page of a PDF.

    The PDF is first converted to images with ``convert_from_path``; each
    image is then passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to convert and recognise.

    Returns:
        The stripped OCR text of each page image, joined with newlines in
        page-image order.
    """
    page_images = convert_from_path(pdf_path)
    return "\n".join(
        pytesseract.image_to_string(page_image).strip() for page_image in page_images
    )

# ✅ Extract Tables from PDF and Convert to Text
def extract_tables_from_pdf(pdf_path):
    """Extract every table pdfplumber finds and render it as CSV-style text.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One CSV rendering per non-empty table (no index, no header row),
        joined with newlines. Rows whose cells are all missing are dropped
        before rendering, and tables left empty by that step are skipped.
    """
    rendered_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for raw_rows in page.extract_tables():
                frame = pd.DataFrame(raw_rows).dropna(how="all")  # discard all-empty rows
                if frame.empty:
                    continue
                rendered_tables.append(frame.to_csv(index=False, header=False))
    return "\n".join(rendered_tables)

# ✅ Load Extracted Data
text_data = extract_text_from_pdf(PDF_FILE)      # embedded text layer
ocr_data = extract_ocr_from_pdf(PDF_FILE)        # OCR of the rendered pages
table_data = extract_tables_from_pdf(PDF_FILE)   # tables rendered as CSV-style text

# ✅ Merge the three sources into one labelled document.
# NOTE(review): this merged text is only written to disk; the chunking step
# later in this notebook splits `text_data` alone. Confirm that is intended.
combined_text = (
    "\n"
    "EXTRACTED TEXT:\n"
    f"{text_data}\n"
    "\n"
    "OCR TEXT:\n"
    f"{ocr_data}\n"
    "\n"
    "TABLE DATA (Converted to Text):\n"
    f"{table_data}\n"
)

# ✅ Persist the merged document for inspection
with open(
    os.path.join(EXTRACTED_FOLDER, "merged_text.txt"), "w", encoding="utf-8"
) as merged_file:
    merged_file.write(combined_text)

print("Extracted Text, OCR, and Tables Merged & Saved.")

In [None]:
def split_text(text, max_tokens=2000):
    """Split ``text`` into space-joined chunks of whole words.

    The text is split on whitespace and words are accumulated until the sum
    of their lengths reaches ``max_tokens``; the word that reaches the limit
    closes the chunk. Despite the parameter name, the budget is measured in
    word characters (separating spaces are not counted), not in model tokens.

    Args:
        text: The text to split.
        max_tokens: Character budget per chunk, counted over word lengths.

    Returns:
        A list of chunk strings. Empty or whitespace-only input yields ``[]``.
    """
    chunks = []
    pending_words = []
    pending_chars = 0

    for word in text.split():
        pending_words.append(word)
        pending_chars += len(word)
        if pending_chars < max_tokens:
            continue
        # Budget reached: flush the chunk and start a new one.
        chunks.append(" ".join(pending_words))
        pending_words, pending_chars = [], 0

    # Whatever is left over forms the final, shorter chunk.
    if pending_words:
        chunks.append(" ".join(pending_words))

    return chunks


def classify_rooms_with_groq(text_chunk):
    """Ask the Groq chat-completions API to turn a text chunk into CSV rows.

    Sends a fixed system prompt (which requests strict CSV with the columns
    Floor Level, Room Name, Area (sq ft), Ceiling Height (ft), Material,
    Material Type, Count, Estimated Quantity, Unit of Measure) plus the chunk
    as the user message, using the module-level ``client`` and the
    "Llama3-8b-8192" model.

    Args:
        text_chunk: The piece of extracted text to classify.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the reply carries no content. The
        text is whatever the model produced; it is not validated as CSV.
    """
    # The trailing space on the first literal keeps the two instructions from
    # fusing into "...from text.Act like..." when the literals are concatenated.
    system_prompt = (
        "Extract structured room-wise details from text. "
        "Act like a senior architectural data extraction and construction estimation expert with over 15 years of experience analyzing PDF and CAD-based floor plans for contractors, civil engineers, and architects. Your task is to extract all construction-related materials from a provided PDF architectural floor plan with complete coverage and accuracy. Materials include, but are not limited to, MST48, HDU2, HDU3, HDU4, HDU5, HDU6, LVL (Laminated Veneer Lumber), IDF (Intermediate Distribution Frame), and Floor Joists. The extracted data must be organized by Floor, then by Room, then by Material, and formatted in strict CSV with the following columns: Floor Level, Room Name, Area (sq ft), Ceiling Height (ft), Material, Material Type, Count, Estimated Quantity, and Unit of Measure. Detect all materials symbolically or visually indicated, including structural, mechanical, hardware, and surface finishes. Classify materials into appropriate types such as Connector, Lumber, Framing, Hardware, or Fixture. For discrete components (e.g., MST, HDU, connectors, joists), count the number of visible instances per room and floor. For surface-based materials (e.g., flooring, drywall, insulation), calculate estimated quantities using area. Use 'each' for unit-based items, 'sq ft' or 'linear ft' for surface and framing elements. If any information is missing, assume the floor is 'Ground Floor', apply a ceiling height of 9 ft for residential or 12 ft for commercial, and estimate area based on standard room sizes. Structure the data clearly without duplication, ensuring each material appears once per location with accurate counts and estimates. Return only the final CSV-formatted output, with no comments or explanations."
    )
    response = client.chat.completions.create(
        model="Llama3-8b-8192",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Extract structured room-wise details:\n{text_chunk}"},
        ],
    )
    # Guard against a reply without text content so .strip() cannot fail on None.
    reply_text = response.choices[0].message.content
    return (reply_text or "").strip()


# 🔹 Split large text into smaller chunks
# The 2500 budget is counted in word characters by split_text, not in model tokens.
# NOTE(review): only `text_data` is chunked here; the OCR and table text merged
# into `combined_text` are not sent to the model. Confirm that is intended.
text_chunks = split_text(text_data, max_tokens=2500)

structured_data_list = []

# 🔹 Process each chunk separately
for idx, chunk in enumerate(text_chunks):
    print(f"Processing chunk {idx + 1}/{len(text_chunks)} with Groq...")
    structured_data = classify_rooms_with_groq(chunk)
    structured_data_list.append(structured_data)

# 🔹 Combine all extracted results
structured_data_csv = "\n".join(structured_data_list)

# 🔹 Save extracted data properly as CSV
csv_path = os.path.join(EXTRACTED_FOLDER, "classified_rooms.csv")

# The header must list the same nine columns, in the same order, that the system
# prompt in classify_rooms_with_groq asks the model to emit; otherwise every data
# row is mislabeled. No spaces after the commas, so column names stay clean.
# NOTE(review): each model reply may carry its own header row; this cell does not remove them.
CSV_HEADER = (
    "Floor Level,Room Name,Area (sq ft),Ceiling Height (ft),"
    "Material,Material Type,Count,Estimated Quantity,Unit of Measure"
)

with open(csv_path, "w", encoding="utf-8", newline="") as f:
    f.write(CSV_HEADER + "\n")  # Add header
    f.write(structured_data_csv)

print(f"Classified room data saved successfully at {csv_path}")
