## Extracting relevant information from PDF files using OCR and NLP techniques

In [1]:
import os
import spacy
import re
from dateutil.parser import *
import textract
import json

In [2]:
# Load the spaCy `en_core_web_md` pipeline once at module level. The model
# package must already be installed in the environment.
# NOTE(review): `nlp` is not referenced by the cells visible here - confirm it
# is still needed further down the notebook.
nlp = spacy.load('en_core_web_md')

In [5]:
import pytesseract

# Point pytesseract at the Tesseract executable. The path is a hard-coded
# Windows install location; change it on machines where Tesseract is
# installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Sanity check: print the version reported by the configured Tesseract binary.
print(pytesseract.get_tesseract_version())


5.5.0.20241111


In [6]:
import pytesseract

# Set the path to the Tesseract executable. This assigns the same Windows
# path as the previous cell; the commented-out line below is the
# alternative location for macOS/Linux installs.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Windows
# pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'  # macOS/Linux

# For property papers

In [10]:
import fitz  # PyMuPDF
import re
import json

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string containing ``page.get_text("text")`` for each page,
        joined without any separator (empty string for a PDF with no pages).
    """
    # Open the document as a context manager so the underlying file handle is
    # released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the accumulated string on every page.
        return "".join(page.get_text("text") for page in doc)

# Matches any single character that is not a word character (letters, digits,
# underscore), not whitespace, and not a period.
_PUNCTUATION_RE = re.compile(r"[^\w\s.]")


def remove_punctuation(text):
    """Strip punctuation from *text*, keeping periods.

    Word characters, whitespace and every period are preserved, so
    abbreviations such as 'CTS No.' survive; all other characters
    (commas, brackets, quotes, hyphens, slashes, ...) are deleted.
    """
    return _PUNCTUATION_RE.sub("", text)

def _first_group(pattern, text, flags=0):
    """Return the stripped first capture group of *pattern* in *text*, or "Not Found"."""
    match = re.search(pattern, text, flags)
    return match.group(1).strip() if match else "Not Found"


def extract_property_details(text):
    """Extract flat and land details from a property-description paragraph.

    The text is first flattened onto one line and stripped of punctuation
    (periods are kept), then a series of regular expressions pulls out the
    individual fields.

    Args:
        text: Raw text of the property description, e.g. as extracted from a PDF.

    Returns:
        A dict with the keys "Flat Address", "Unit Type", "Unit No.", "Floor",
        "Building Name", "Wing", "City", "Pincode", "Suburb/Locality",
        "Land Address", "Land Descriptor 1", "Plot/Survey/CTS No.",
        "Land Descriptor 2", "Hissa/Sector No.", "Village", "Taluka",
        "District" and "Land Area". Any field that cannot be located holds
        the string "Not Found".
    """
    details = {}

    # Merge broken sentences by replacing newlines with spaces
    text = re.sub(r"\s*\n\s*", " ", text)

    # Drop punctuation (periods survive) so the patterns below see clean text
    text = remove_punctuation(text)

    # =================== FLAT DETAILS EXTRACTION ===================

    # Flat Address: everything between "All that" and the next "hereinafter"
    details["Flat Address"] = _first_group(
        r"All that\s*(.*?)(?=\bhereinafter\b)", text, re.IGNORECASE | re.DOTALL
    )

    # Unit Type (Flat/Shop/Bungalow): first occurrence, in the text's own casing
    details["Unit Type"] = _first_group(r"\b(Flat|Shop|Bungalow)\b", text, re.IGNORECASE)

    # Unit No.: digits following the first "No." / "No". The period is matched
    # literally and optionally; an unescaped "." would swallow the first digit
    # of "No605" and would also match inside words such as "Not 5".
    details["Unit No."] = _first_group(r"\bNo\.?\s*(\d+)", text, re.IGNORECASE)

    # Floor: the word immediately before "floor"
    details["Floor"] = _first_group(r"(\w+)\s+floor", text, re.IGNORECASE)

    # Building Name: text after "called", up to "Ltd" or "situated"
    details["Building Name"] = _first_group(
        r"called\s+([\w\s-]+?)(?:\s+Ltd|\s+situated)", text, re.IGNORECASE
    )

    # Wing: text between "Wing" and the next "admeasuring"
    details["Wing"] = _first_group(
        r"Wing\s*(.*?)(?=\badmeasuring\b)", text, re.IGNORECASE | re.DOTALL
    )

    # City: first word after "situated at"
    details["City"] = _first_group(r"situated at\s+(\w+)", text, re.IGNORECASE)

    # Pincode: a standalone 6-digit number inside the flat address. The leading
    # \b stops the pattern from returning the tail of a longer digit run.
    details["Pincode"] = _first_group(r"\b(\d{6})\b", details["Flat Address"])

    # Suburb/Locality: whatever sits between the city and the pincode
    if details["City"] != "Not Found" and details["Pincode"] != "Not Found":
        suburb_pattern = rf"{re.escape(details['City'])}\s+(.*?)\s+{details['Pincode']}"
        details["Suburb/Locality"] = _first_group(
            suburb_pattern, details["Flat Address"], re.IGNORECASE
        )
    else:
        details["Suburb/Locality"] = "Not Found"

    # =================== LAND DETAILS EXTRACTION ===================

    # Land Address: everything between "land bearing" and "or thereabouts"
    land_address = _first_group(
        r"land bearing\s*(.*?)(?=\bor thereabouts\b)", text, re.IGNORECASE | re.DOTALL
    )
    details["Land Address"] = land_address

    # (descriptor label, number) pairs such as ("CTS No.", "144"); this search
    # is case-sensitive, unlike the others.
    descriptor_pattern = r"\b(CTS No\.|Survey No\.|F\.P\. No\.|Plot No\.|Gat No\.)\s*([\w\d\-/]+)"
    matches = re.findall(descriptor_pattern, land_address)

    # Defaults for the descriptor-derived fields
    details["Land Descriptor 1"] = "Not Found"
    details["Plot/Survey/CTS No."] = "Not Found"
    details["Land Descriptor 2"] = "Not Found"
    details["Hissa/Sector No."] = "Not Found"

    if matches:
        details["Land Descriptor 1"], details["Plot/Survey/CTS No."] = matches[0]

        # Only the label of a second descriptor is kept; its number is picked
        # up by the Hissa/Sector search below.
        if len(matches) > 1:
            details["Land Descriptor 2"] = matches[1][0]

        # Hissa/Sector No.: text between the second descriptor's label and
        # "of Village".
        # NOTE(review): when both descriptors share a label (e.g. two
        # "CTS No." entries) the search starts at the first occurrence of
        # that label - confirm this is intended.
        if details["Land Descriptor 2"] != "Not Found":
            hissa_sector_match = re.search(
                rf"{re.escape(details['Land Descriptor 2'])}\s*(.*?)\s*of\s*Village",
                land_address,
                re.IGNORECASE,
            )
            if hissa_sector_match:
                details["Hissa/Sector No."] = hissa_sector_match.group(1).strip()

    # Village / Taluka: the single word following each keyword
    details["Village"] = _first_group(r"\bVillage\s+(\w+)", land_address, re.IGNORECASE)
    details["Taluka"] = _first_group(r"\bTaluka\s+(\w+)", land_address, re.IGNORECASE)

    # District: text between "District" and "admeasuring"
    details["District"] = _first_group(
        r"\bDistrict\s+(.*?)\s*admeasuring", land_address, re.IGNORECASE
    )

    # Land Area: everything after "admeasuring" in the land address
    details["Land Area"] = _first_group(r"\badmeasuring\s+(.+)", land_address, re.IGNORECASE)

    return details

def save_details_to_json(details, filename="property_details.json"):
    """Write *details* to *filename* as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than escaped. Any existing file is overwritten.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        out.write(json.dumps(details, indent=4, ensure_ascii=False))

# Example text from PDF
# pdf_path = "sampleprop.pdf"
# text = extract_text_from_pdf(pdf_path)

# # Extract property details
# property_details = extract_property_details(text)

# # Save to JSON
# save_details_to_json(property_details, "property_details.json")

# # Print confirmation message
# print("Property details saved to property_details.json")


# Driver: run the extraction pipeline on a sample PDF, looked up relative to
# the current working directory.
sample_pdf = "sampleprop.pdf"
text = extract_text_from_pdf(sample_pdf)

# Print the raw extracted text so the regex results below can be checked
# against what was actually read from the PDF (debugging step).
print("Extracted Text:\n", text)

# Run the regex-based field extraction on the raw text
property_details = extract_property_details(text)

# Print each extracted field as "key: value", one per line
print("\nExtracted Property Details:")
for key, value in property_details.items():
    print(f"{key}: {value}")



Extracted Text:
  
 
All that Flat No. 605 on the Fifth floor of building known/'to be known' as Bhim Nagar, Wing -, Rehab 7B 
(Pocket 5), admeasuring approx. 225 Sq. ft. carpet  in Co-op Society called Bhimnagar Co-op Hsg Soc Ltd. 
situated at Andheri, Mumbai Suburban -  400093, hereinafter for brevity sake referred to as the 'Said Property' ; 
situated, lying and being on  land bearing  CTS No. 144 (Pt) , CTS No. 145, 146 F.P. No. 5 of Village Mulgaon, 
Taluka Andheri, District Mumbai Suburban  admeasuring 23248 Sq. mtrs  or thereabouts, hereinafter for brevity 
sake referred to as the 'Said Land'.  
 


Extracted Property Details:
Flat Address: Flat No. 605 on the Fifth floor of building knownto be known as Bhim Nagar Wing  Rehab 7B Pocket 5 admeasuring approx. 225 Sq. ft. carpet  in Coop Society called Bhimnagar Coop Hsg Soc Ltd. situated at Andheri Mumbai Suburban   400093
Unit Type: Flat
Unit No.: 605
Floor: Fifth
Building Name: Bhimnagar Coop Hsg Soc
Wing: Rehab 7B Pocket 5
City