In [1]:
import PyPDF2
import re
from tqdm import tqdm

In [4]:
def preprocess_text(text):
    """Strip running page titles from extracted PDF text.

    Every occurrence of a title such as
    "ABCB Housing Provisions Standard 2022 Page 22" is replaced with an empty
    string. Nothing else is touched, so the line break that followed a title
    stays in place.
    """
    page_title = re.compile(r'ABCB Housing Provisions Standard \d+ Page \d+')
    cleaned = page_title.sub('', text)
    return cleaned

def extract_sections(text):
    """Split document text into chunks, one per numbered section header.

    The text is first run through ``preprocess_text``. A header is recognised
    at the start of the text or right after a line break: optional whitespace,
    a dotted number such as ``2`` or ``2.1.1``, more whitespace, then a title
    whose first character is neither a digit nor a line break. Each chunk runs
    from its header up to the next header (or the end of the text) and has its
    surrounding whitespace stripped. Text before the first header is dropped.

    Args:
        text: Raw document text.

    Returns:
        A list of section strings in document order; an empty list when no
        header is found.
    """
    text = preprocess_text(text)

    # `(?:^|\n)` instead of a bare `\n`: a header sitting on the very first
    # line has no preceding line break and must still be recognised, otherwise
    # the whole first section is silently lost.
    header = re.compile(r'(?:^|\n)\s*(\d+(\.\d+)*)\s+([^0-9\n]+)')

    starts = [found.start() for found in header.finditer(text)]
    # Each section ends where the next one starts; the last runs to the end.
    ends = starts[1:] + [None]
    return [text[start:end].strip() for start, end in zip(starts, ends)]

# Hand-written sample: numbered headers with varying indentation, plus two
# running page-title lines.
text = """
1 Introduction
Some introduction text here.
ABCB Housing Provisions Standard 2022 Page 21

   2.1 Section Title
Content for section 2.1.

   2.1.1   Sub-section Title
Content for sub-section 2.1.1.

   2.1.2   Another Sub-section Title
Content for sub-section 2.1.2.

 2.2 New Section
Content for section 2.2.
ABCB Housing Provisions Standard 2022 Page 22
"""

sections = extract_sections(text)

# Show every section followed by a 50-dash divider line.
for section in sections:
    print(section, "-" * 50, sep="\n")

1 Introduction
Some introduction text here.
--------------------------------------------------
2.1 Section Title
Content for section 2.1.
--------------------------------------------------
2.1.1   Sub-section Title
Content for sub-section 2.1.1.
--------------------------------------------------
2.1.2   Another Sub-section Title
Content for sub-section 2.1.2.
--------------------------------------------------
2.2 New Section
Content for section 2.2.
--------------------------------------------------


In [65]:
def extract_section_details(section_text):
    """Extract the section number and title from the header of a section.

    The header must be at the start of ``section_text`` (leading whitespace is
    allowed): a dotted number such as ``2`` or ``2.1.1``, whitespace, then the
    title. The title's first character may not be a digit or a line break;
    from there it extends to the end of that line, so digits inside a title
    (e.g. "Application of Part 2.2") are kept.

    Args:
        section_text: Text of one section, beginning with its header.

    Returns:
        A dict with keys ``"section_number"`` and ``"section_title"`` (title
        stripped of surrounding whitespace), or ``None`` if the text does not
        start with a header.
    """
    # The title group only constrains its first character; the remainder of
    # the line is taken verbatim so a digit no longer cuts the title short.
    header = re.compile(
        r'^\s*(?P<number>\d+(?:\.\d+)*)\s+(?P<title>[^0-9\n][^\n]*)'
    )

    match = header.match(section_text)
    if match is None:
        return None

    return {
        "section_number": match.group("number"),
        "section_title": match.group("title").strip(),
    }

# Sample extracted section: an indented header line ("2.1.1" plus a title)
# followed by one line of body text.
section = """
   2.1.1   Sub-section Title
Content for sub-section 2.1.1.
"""

# Parse the header and print the result (None would mean no header was found).
details = extract_section_details(section)
print(details)

{'section_number': '2.1.1', 'section_title': 'Sub-section Title'}


In [2]:
# Source document.
# NOTE(review): `url` is not used in any visible cell — presumably it records
# where the file at `path` was downloaded from; confirm.
url = "https://ncc.abcb.gov.au/system/files/ncc/abcb-housing-provisions-2022-20230501b.pdf"
path = "./data/abcb-housing-provisions-2022-20230501b.pdf"

# Open the local PDF with PyPDF2 and read the "/Title" entry of its metadata.
reader = PyPDF2.PdfReader(path)
book_title = reader.metadata["/Title"]

In [5]:
%%time

page_content = []
for page in tqdm(reader.pages):
    page_text = page.extract_text()
    page_content.append(page.extract_text())

print(f"{len(page_content)} pages has been read.")

100%|██████████| 576/576 [00:19<00:00, 30.23it/s]

576 pages has been read.





AttributeError: 'list' object has no attribute 'join'

In [9]:
# Save the extracted text, pages joined by a single newline.
# Plain write mode is enough because nothing is read back through this handle.
# The explicit UTF-8 encoding keeps the file independent of the platform's
# default encoding, which may not be able to represent every character
# extracted from the PDF.
with open("./data/abcb-housing-provisions-2022-20230501b.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(page_content))

In [63]:
# Display the PDF's metadata mapping (author, dates, producer, title, ...).
reader.metadata

{'/Author': 'Australian Building Codes Board',
 '/CreationDate': 'D:20230424000135Z',
 '/Creator': 'QuarkXPress(R) 16.42',
 '/ModDate': "D:20230713151638+10'00'",
 '/Producer': 'QuarkXPress(R) 16.42',
 '/Title': 'ABCB Housing Provisions',
 '/XPressPrivate': '%%DocumentProcessColors: Cyan Magenta Yellow Black\n%%EndComments'}

2.2.1   Application of Part 2.2 
[New for 2022]  
Part 2.2  need not be complied with if, for the purposes of H1D2(b) only, the Deemed-to-Satisfy Provisions  of H1D3 to  
H1D11 relating to structural elements are complied with.
--------------------------------------------------
2.2.2   Resistance to actions  
[2019: 3.0.2]  
The resistance of a building or structure must be greater than the most critical action effect resulting from different  
combinations of actions, where—  
the most critical action effect on a building or structure must be determined in accordance with 2.2.3  and the  (a)
general design procedures contained in AS/NZS 1170.0; and  
the resistance of a building or structure is determined in accordance with 2.2.4 . (b)
 
Explanatory Information  
A building or structure must be designed to resist the most critical effect resulting from different combinations of actions,  
taking into consideration—  
the probability of simultaneous occurrence of two or more actions; a

In [64]:
# Print the text of the 22nd extracted page (list indices are zero-based).
# NOTE(review): the saved output of this cell is an IndexError, so page_content
# presumably held fewer than 22 entries when it last ran — re-run the page
# extraction cell first and confirm.
print(page_content[21])

IndexError: list index out of range