In [None]:
#!pip install requests beautifulsoup4

In [16]:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import os
# Root folder (a relative path) under which the scraped JSON files are written.
output_root_dir = "ifc_raw"
def get_substring_from_string(pattern,  text, default=None):
    """Return capture group 1 of the first match of ``pattern`` in ``text``.

    Args:
        pattern: Regular expression with at least one capture group.
        text: String to search.
        default: Value returned when the pattern does not match at all.

    Returns:
        The content of group 1 when the pattern matches; otherwise ``default``
        (a warning line is printed in that case).
    """
    found = re.search(pattern, text)
    if found is None:
        print("warning Pattern not found, returning default")
        return default
    return found.group(1)

def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely, which would stall
            the whole scraping loop on a single unresponsive page.

    Returns:
        A ``BeautifulSoup`` object built from the response content using the
        'html.parser' backend.

    Raises:
        requests.exceptions.RequestException: on connection problems, on a
            timeout, or (via ``raise_for_status``) on an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def get_url_section_item_list(url,soup):
    """Collect absolute URLs of the 'lexical' pages linked from a parsed page.

    Args:
        url: Address of the page ``soup`` was built from; relative links are
            resolved against it.
        soup: Parsed page exposing ``find_all("a")``.

    Returns:
        A list, in document order, of the absolute URLs for every anchor whose
        ``href`` is non-empty, contains 'lexical' and does not start with '#'.
    """
    candidate_hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    return [
        urljoin(url, target)
        for target in candidate_hrefs
        if target and 'lexical' in target and not target.startswith("#")
    ]

def get_output_file_name(section,title, sub_title_pattern=r'\b(Ifc\w+)\b'):
    """Build the output path (without extension) for one scraped page.

    Args:
        section: Folder name for the glossary section the page belongs to.
        title: Page title; the item name and the IFC version are extracted
            from it.
        sub_title_pattern: Regular expression whose first group captures the
            item name inside ``title``.

    Returns:
        ``'<output_root_dir>/<version>/<section>/<item name>'`` where the
        version has its dots replaced by underscores. When no version is found
        in the title, 'IFC4.3.2.0' is used. When the item name is not found,
        the last path component is the text 'None'.
    """
    file_name = get_substring_from_string(sub_title_pattern, title)
    # Raw string: in a plain literal "\s", "\d" and "\." are invalid escape
    # sequences, which newer Python versions report with a SyntaxWarning.
    version = get_substring_from_string(
        r"\s(IFC\d+(\.\d+)*)\s", title, default='IFC4.3.2.0'
    ).replace(".", "_")
    output_dir = f'{output_root_dir}/{version}/{section}'
    return f'{output_dir}/{file_name}'
def get_file_sections_as_dict(item_soup):
    """Split a parsed page into sections keyed by its ``h2`` headings.

    Each heading text has every digit run (with optional dotted parts and
    trailing whitespace) removed, is stripped, has spaces replaced by
    underscores and is lower-cased to form the key.

    Args:
        item_soup: Parsed page exposing ``find_all('h2')``.

    Returns:
        A dict mapping each normalized heading to the list of texts of the
        siblings that follow it, stopping at the next ``h2`` or at an
        ``aside`` element. A repeated heading overwrites the earlier entry.
    """
    sections = {}
    for heading in item_soup.find_all('h2'):
        raw_title = heading.get_text()
        without_numbers = re.sub(r'\d+(\.\d+)*\s*', '', raw_title)
        key = without_numbers.strip().replace(" ", "_").lower()

        texts = []
        node = heading.find_next_sibling()
        while node:
            # The section ends at the next heading or at an aside element.
            if node.name == 'h2' or node.name == "aside":
                break
            texts.append(node.get_text())
            node = node.find_next_sibling()

        sections[key] = texts
    return sections
def write_to_json(data, filename):
    """Serialize ``data`` as JSON into the file ``'<filename>.json'``.

    Args:
        data: JSON-serializable object to store.
        filename: Target path without the '.json' extension; an existing file
            is overwritten.
    """
    target_path = f'{filename}.json'
    with open(target_path, 'w') as handle:
        json.dump(data, handle)

In [30]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Annex B index pages to scrape. Every entry is currently commented out (each
# is marked "passed"), so the loop below does nothing until at least one line
# is uncommented. The status note sits in a trailing comment so that an
# uncommented line is still valid Python.
root_glossary=[
# 'https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/annex-b1.html',  # passed
# 'https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/annex-b2.html',  # passed
# 'https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/annex-b3.html',  # passed
# 'https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/annex-b4.html',  # passed
# 'https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/annex-b5.html',  # passed
# 'https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/annex-b6.html',  # passed
# 'https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/annex-b7.html',  # passed
  ]

# For every index page: derive the section name from the page title, follow
# each 'lexical' link, and store the linked page's h2 sections as one JSON file.
for url in root_glossary:
    print(url)
    soup=get_soup(url=url)
    title = soup.title.get_text()
    print(title)
    section = get_substring_from_string(r"\s-\s([\w+\s]+)\s-\sIFC", title)
    if section is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"Could not derive a section name from page title: {title!r}")
    section = section.lower().replace(" ", "_")
    url_section_item_list=get_url_section_item_list(url, soup)
    for item_url in url_section_item_list:
        print(f"{item_url} before get soup")
        item_soup=get_soup(item_url)
        # NOTE: `title` is deliberately reused for the item page; after the
        # loop, `section` and `title` describe the last page processed.
        title = item_soup.title.get_text()
        # Entity pages are the default; a later prefix in this tuple wins when
        # several appear in the title (same precedence as separate if-checks).
        sub_title_pattern=r'\b(Ifc\w+)\b'
        for prefix in ("Pset_", "Qto_", "PEnum_"):
            if prefix in title:
                sub_title_pattern = rf'\b({prefix}\w+)\b'
        output_file_name=get_output_file_name(section,title, sub_title_pattern=sub_title_pattern)
        file_dct=get_file_sections_as_dict( item_soup)
        # exist_ok avoids the separate existence check and its race window.
        output_dir = os.path.dirname(output_file_name)
        os.makedirs(output_dir, exist_ok=True)
        write_to_json(file_dct,output_file_name)

https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/annex-b7.html
Annex B
(informative)

Alphabetical listings - Property Enumerations - IFC4.3.2.0 Documentation
https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/lexical/PEnum_AccidentResponse.htm before get soup
https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/lexical/PEnum_AcquisitionMethod.htm before get soup
https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/lexical/PEnum_ActuatorApplication.htm before get soup
https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/lexical/PEnum_AddedMassCoefficientMethod.htm before get soup
https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/lexical/PEnum_AdditionalProcessing.htm before get soup
https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/lexical/PEnum_AddressType.htm before get soup
https://standards.buildingsmart.org/IFC/RELEASE/IFC4_3/HTML/lexical/PEnum_AddressabilityType.htm before get soup
https://standards.buildingsmart.org/IF

In [27]:
# Show the section name and page title left in the namespace by the scraping loop.
section,title

('property_enumerations',
 '7.8.8.48 PEnum_PowerSupplyMode - IFC4.3.2.0 Documentation')

In [29]:
# Spot check: the PEnum_ pattern applied to a sample page title.
get_substring_from_string(r'\b(PEnum_\w+)\b', '7.8.8.48 PEnum_PowerSupplyMode - IFC4.3.2.0 Documentation')  

'PEnum_PowerSupplyMode'

In [None]:
import json

# Read back one scraped entity file to inspect its sections. The path is built
# from output_root_dir (the same root the scraper writes to) instead of an
# absolute path that exists on a single machine; run this from the directory
# the scraper was run in.
with open(f'{output_root_dir}/IFC4_3_2_0/entities/IfcMaterialLayer.json') as json_file:
    data = json.load(json_file)
data.keys()

In [None]:
# Value stored under 'semantic_definition' in the loaded JSON
# (presumably the text scraped beneath that page heading — confirm against the scraper output).
data['semantic_definition']

In [None]:
# Value stored under 'attributes' in the loaded JSON.
data['attributes']

In [None]:
# Value stored under 'formal_propositions' in the loaded JSON.
data['formal_propositions']

In [None]:
# Value stored under 'formal_representation' in the loaded JSON.
data['formal_representation']