In [1]:
import PyPDF2
from tqdm import tqdm
import re


In [10]:
import re

def preprocess_text(text):
    """Return ``text`` with the running page headers deleted.

    A page header is the literal phrase "ABCB Housing Provisions Standard",
    a number, the word "Page" and another number, for example
    "ABCB Housing Provisions Standard 2022 Page 22". Only the header itself
    is removed; any whitespace or newlines around it stay where they were.
    """
    page_header = re.compile(r'ABCB Housing Provisions Standard \d+ Page \d+')
    return page_header.sub('', text)

def extract_sections(text):
    """Split ``text`` into numbered sections.

    Page headers are stripped with ``preprocess_text`` first. A section
    header is then a line that starts (after optional whitespace) with a
    dotted number such as ``2`` or ``2.1.3``, followed by whitespace and a
    title whose first character is not a digit.

    The title runs to the end of its line, so titles that contain digits
    (for example "Application of Part 2.2") are kept whole instead of being
    cut at the first digit. A header on the very first line of ``text`` is
    recognised as well as one that follows a newline.

    Note that any line beginning with a number followed by text is treated
    as a header; wrapped body lines that happen to start with a number are
    not told apart from real headings.

    Parameters
    ----------
    text : str
        Raw text, typically the extracted text of one page.

    Returns
    -------
    list of dict
        One dict per header, in document order, with the keys
        ``"section_number"`` (the dotted number as a string),
        ``"section_title"`` (the stripped rest of the header line) and
        ``"section_text"`` (the stripped text up to the next header, or to
        the end of ``text`` for the last section). Empty when no header is
        found.
    """
    text = preprocess_text(text)

    # The title must start with a non-digit so that rows of bare numbers are
    # not mistaken for headers, but after that it may contain anything up to
    # the end of the line.
    header = re.compile(
        r'(?:\A|\n)\s*(?P<number>\d+(?:\.\d+)*)\s+(?P<title>[^0-9\n][^\n]*)'
    )
    matches = list(header.finditer(text))

    # Each body ends where the next header starts; the last one runs to the
    # end of the text.
    body_ends = [m.start() for m in matches[1:]] + [len(text)]

    return [
        {
            "section_number": m.group("number"),
            "section_title": m.group("title").strip(),
            "section_text": text[m.end():body_end].strip(),
        }
        for m, body_end in zip(matches, body_ends)
    ]

# Small hand-written document for trying out extract_sections: five numbered
# headers at different indentation levels, plus two page-header lines of the
# form that preprocess_text removes.
text = """
1 Introduction
Some introduction text here.
ABCB Housing Provisions Standard 2022 Page 21

   2.1 Section Title
Content for section 2.1.

   2.1.1   Sub-section Title
Content for sub-section 2.1.1.

   2.1.2   Another Sub-section Title
Content for sub-section 2.1.2.

 2.2 New Section
Content for section 2.2.
ABCB Housing Provisions Standard 2022 Page 22
"""

sections = extract_sections(text)

# Show every parsed section followed by a 50-dash rule so that consecutive
# dicts are easy to tell apart in the cell output.
for section in sections:
    print(section, "-" * 50, sep="\n")


{'section_number': '1', 'section_title': 'Introduction', 'section_text': 'Some introduction text here.'}
--------------------------------------------------
{'section_number': '2.1', 'section_title': 'Section Title', 'section_text': 'Content for section 2.1.'}
--------------------------------------------------
{'section_number': '2.1.1', 'section_title': 'Sub-section Title', 'section_text': 'Content for sub-section 2.1.1.'}
--------------------------------------------------
{'section_number': '2.1.2', 'section_title': 'Another Sub-section Title', 'section_text': 'Content for sub-section 2.1.2.'}
--------------------------------------------------
{'section_number': '2.2', 'section_title': 'New Section', 'section_text': 'Content for section 2.2.'}
--------------------------------------------------


In [3]:
# Address of the PDF on ncc.abcb.gov.au. No visible cell downloads from it,
# so the file is presumably fetched by hand — TODO confirm.
url = "https://ncc.abcb.gov.au/system/files/ncc/abcb-housing-provisions-2022-20230501b.pdf"
# Location of the PDF on disk, relative to the notebook's working directory.
path = "./data/abcb-housing-provisions-2022-20230501b.pdf"


In [5]:
# Read the PDF page by page; optionally split the pages into numbered sections.
def read_pdf(path, as_sections=False):
    """Extract the text of every page of the PDF at ``path``.

    Parameters
    ----------
    path : str
        Location of the PDF file, passed straight to ``PyPDF2.PdfReader``.
    as_sections : bool, optional
        When False (the default) the result has exactly one entry per page,
        so ``result[i]`` is the text of page ``i + 1``. When True each page
        is additionally split with ``extract_sections`` and the sections are
        returned instead.

    Returns
    -------
    list
        ``as_sections=False``: list of str, the extracted text of each page
        in page order.
        ``as_sections=True``: list of dict, the dicts produced by
        ``extract_sections`` for every page, each extended with
        ``"page_number"`` (1-based) and ``"book_title"`` (the PDF's
        ``/Title`` metadata entry, or None when the PDF has none).

    Side effects
    ------------
    Shows a tqdm progress bar and prints the number of pages read.
    """
    reader = PyPDF2.PdfReader(path)

    # Per the PyPDF2 docs, metadata is None when the PDF carries no
    # document-information dictionary, and /Title is optional inside it.
    metadata = reader.metadata
    book_title = metadata.get("/Title") if metadata else None

    page_texts = []
    tagged_sections = []
    for page_number, page in enumerate(tqdm(reader.pages), start=1):
        # Extract once per page and reuse the result; extraction is the
        # expensive step.
        page_text = page.extract_text()
        page_texts.append(page_text)

        if as_sections:
            for section in extract_sections(page_text):
                section["page_number"] = page_number
                section["book_title"] = book_title
                tagged_sections.append(section)

    print(f"{len(page_texts)} pages has been read.")

    return tagged_sections if as_sections else page_texts

In [9]:
%pip install PyCryptodome 
#for p in tqdm.tqdm(page_content):
#    print(extract_page_number(p))

#print(page_content[19])

Collecting PyCryptodome
  Obtaining dependency information for PyCryptodome from https://files.pythonhosted.org/packages/3f/a1/72ad8fbeb2630e74fa89622ab24ec0ce46dc93dc172156f1a112eb76c014/pycryptodome-3.18.0-cp35-abi3-win_amd64.whl.metadata
  Downloading pycryptodome-3.18.0-cp35-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pycryptodome-3.18.0-cp35-abi3-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.7 MB 2.3 MB/s eta 0:00:01
   ------ --------------------------------- 0.3/1.7 MB 3.5 MB/s eta 0:00:01
   ------------------ --------------------- 0.8/1.7 MB 6.4 MB/s eta 0:00:01
   -------------------------------- ------- 1.4/1.7 MB 8.2 MB/s eta 0:00:01
   ---------------------------------------  1.7/1.7 MB 8.5 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 7.9 MB/s eta 0:00:00
Installing collected packages: PyCryptodome
Successfully installed PyCryptodome-3.18.0
Note: 

In [21]:
# Inspect the PDF's document-information dictionary. The reader is built
# here because no visible cell defines a notebook-level `reader`; the one
# inside read_pdf is local to that function.
PyPDF2.PdfReader(path).metadata

{'/Author': 'Australian Building Codes Board',
 '/CreationDate': 'D:20230424000135Z',
 '/Creator': 'QuarkXPress(R) 16.42',
 '/ModDate': "D:20230713151638+10'00'",
 '/Producer': 'QuarkXPress(R) 16.42',
 '/Title': 'ABCB Housing Provisions',
 '/XPressPrivate': '%%DocumentProcessColors: Cyan Magenta Yellow Black\n%%EndComments'}

2.2.1   Application of Part 2.2 
[New for 2022]  
Part 2.2  need not be complied with if, for the purposes of H1D2(b) only, the Deemed-to-Satisfy Provisions  of H1D3 to  
H1D11 relating to structural elements are complied with.
--------------------------------------------------
2.2.2   Resistance to actions  
[2019: 3.0.2]  
The resistance of a building or structure must be greater than the most critical action effect resulting from different  
combinations of actions, where—  
the most critical action effect on a building or structure must be determined in accordance with 2.2.3  and the  (a)
general design procedures contained in AS/NZS 1170.0; and  
the resistance of a building or structure is determined in accordance with 2.2.4 . (b)
 
Explanatory Information  
A building or structure must be designed to resist the most critical effect resulting from different combinations of actions,  
taking into consideration—  
the probability of simultaneous occurrence of two or more actions; a

In [50]:
# Dump the entry at index 21 of page_content (the recorded output below is
# the text of "Page 22").
# NOTE(review): page_content is not assigned in any visible cell — presumably
# it holds the list returned by read_pdf(path); confirm and define it before
# this cell so the notebook survives Restart & Run All.
print(page_content[21])

Structure
ABCB Housing Provisions Standard 2022 Page 22  
 
 2.2.1   Application of Part 2.2 
[New for 2022]  
Part 2.2  need not be complied with if, for the purposes of H1D2(b) only, the Deemed-to-Satisfy Provisions  of H1D3 to  
H1D11 relating to structural elements are complied with.  
 2.2.2   Resistance to actions  
[2019: 3.0.2]  
The resistance of a building or structure must be greater than the most critical action effect resulting from different  
combinations of actions, where—  
the most critical action effect on a building or structure must be determined in accordance with 2.2.3  and the  (a)
general design procedures contained in AS/NZS 1170.0; and  
the resistance of a building or structure is determined in accordance with 2.2.4 . (b)
 
Explanatory Information  
A building or structure must be designed to resist the most critical effect resulting from different combinations of actions,  
taking into consideration—  
the probability of simultaneous occurrence of two or mo

In [9]:
import re

# NOTE(review): preprocess_text is also defined in an earlier cell of this
# notebook; whichever cell was executed last is the one in effect. Keep a
# single definition.
def preprocess_text(text):
    """Return ``text`` with the running page headers deleted.

    A page header is the literal phrase "ABCB Housing Provisions Standard",
    a number, the word "Page" and another number, for example
    "ABCB Housing Provisions Standard 2022 Page 22". Only the header itself
    is removed; any whitespace or newlines around it stay where they were.
    """
    page_header = re.compile(r'ABCB Housing Provisions Standard \d+ Page \d+')
    return page_header.sub('', text)

# NOTE(review): extract_sections is also defined in an earlier cell of this
# notebook; whichever cell was executed last is the one in effect. Keep a
# single definition.
def extract_sections(text):
    """Split ``text`` into numbered sections.

    Page headers are stripped with ``preprocess_text`` first. A section
    header is then a line that starts (after optional whitespace) with a
    dotted number such as ``2`` or ``2.1.3``, followed by whitespace and a
    title whose first character is not a digit.

    The title runs to the end of its line, so titles that contain digits
    (for example "Application of Part 2.2") are kept whole instead of being
    cut at the first digit. A header on the very first line of ``text`` is
    recognised as well as one that follows a newline.

    Note that any line beginning with a number followed by text is treated
    as a header; wrapped body lines that happen to start with a number are
    not told apart from real headings.

    Parameters
    ----------
    text : str
        Raw text, typically the extracted text of one page.

    Returns
    -------
    list of dict
        One dict per header, in document order, with the keys
        ``"section_number"`` (the dotted number as a string),
        ``"section_title"`` (the stripped rest of the header line) and
        ``"section_text"`` (the stripped text up to the next header, or to
        the end of ``text`` for the last section). Empty when no header is
        found.
    """
    text = preprocess_text(text)

    # The title must start with a non-digit so that rows of bare numbers are
    # not mistaken for headers, but after that it may contain anything up to
    # the end of the line.
    header = re.compile(
        r'(?:\A|\n)\s*(?P<number>\d+(?:\.\d+)*)\s+(?P<title>[^0-9\n][^\n]*)'
    )
    matches = list(header.finditer(text))

    # Each body ends where the next header starts; the last one runs to the
    # end of the text.
    body_ends = [m.start() for m in matches[1:]] + [len(text)]

    return [
        {
            "section_number": m.group("number"),
            "section_title": m.group("title").strip(),
            "section_text": text[m.end():body_end].strip(),
        }
        for m, body_end in zip(matches, body_ends)
    ]

# Small hand-written document for trying out extract_sections: five numbered
# headers at different indentation levels, plus two page-header lines of the
# form that preprocess_text removes.
text = """
1 Introduction
Some introduction text here.
ABCB Housing Provisions Standard 2022 Page 21

   2.1 Section Title
Content for section 2.1.

   2.1.1   Sub-section Title
Content for sub-section 2.1.1.

   2.1.2   Another Sub-section Title
Content for sub-section 2.1.2.

 2.2 New Section
Content for section 2.2.
ABCB Housing Provisions Standard 2022 Page 22
"""

sections = extract_sections(text)

# Show every parsed section followed by a 50-dash rule so that consecutive
# dicts are easy to tell apart in the cell output.
for section in sections:
    print(section, "-" * 50, sep="\n")


{'section_number': '1', 'section_title': 'Introduction', 'section_text': 'Some introduction text here.'}
--------------------------------------------------
{'section_number': '2.1', 'section_title': 'Section Title', 'section_text': 'Content for section 2.1.'}
--------------------------------------------------
{'section_number': '2.1.1', 'section_title': 'Sub-section Title', 'section_text': 'Content for sub-section 2.1.1.'}
--------------------------------------------------
{'section_number': '2.1.2', 'section_title': 'Another Sub-section Title', 'section_text': 'Content for sub-section 2.1.2.'}
--------------------------------------------------
{'section_number': '2.2', 'section_title': 'New Section', 'section_text': 'Content for section 2.2.'}
--------------------------------------------------
