#### Imports & Configuration



In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from pathlib import Path

# One seed for both generators so a fresh "Restart & Run All" reproduces the
# same dataset. Both must be seeded: the notebook draws from the stdlib
# `random` module and from NumPy's global RNG.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Number of synthetic elements to generate and the (inclusive) window from
# which installation / update dates are drawn.
NUM_ELEMENTS = 950
START_DATE   = datetime(2020, 1, 1)
END_DATE     = datetime(2025, 12, 31)

# IFC entity names used as the pool for the "ElementType" column.
ELEMENT_TYPES = [
    "IfcWall", "IfcWallStandardCase",
    "IfcDoor", "IfcDoorStandardCase",
    "IfcWindow", "IfcWindowStandardCase",
    "IfcSlab", "IfcBeam", "IfcColumn",
    "IfcRoof", "IfcCovering", "IfcRailing",
    "IfcFurniture", "IfcSpace", "IfcBuildingStorey"
]

# Pool for the "Material" column.
MATERIALS = [
    "Concrete C25/30", "Concrete C30/37", "Reinforced Concrete",
    "Brick Masonry", "AAC Block", "Gypsum Board",
    "Ceramic Tile", "Porcelain Tile", "Marble Slab", "Granite Slab",
    "Wood - Oak", "Wood - Teak", "Steel - Galvanized", "Aluminum",
    "Glass - Clear 6mm", "Glass - Tinted 8mm", "Paint - Emulsion White",
    "Paint - Acrylic Grey", "Epoxy Coating"
]

# Pool for the "UnitQuantity" column (unit of measure labels).
UNITS = ["m²", "m", "pcs", "m³", "kg", "set", "no"]

# Output directory (relative to the working directory); created up front,
# including parents, and left untouched if it already exists.
DATA_DIR = Path("data/synthetic_bim")
DATA_DIR.mkdir(exist_ok=True, parents=True)

#### Helper Functions

In [2]:
def random_date(start, end):
    """Return ``start`` shifted forward by a random whole number of days.

    The offset is drawn with ``random.randint`` from ``0`` to
    ``(end - start).days`` inclusive, so both ``start`` and the last whole
    day before/at ``end`` can be returned.
    """
    span_days = (end - start).days
    offset = random.randint(0, span_days)
    return start + timedelta(days=offset)

def random_cost(mean=8.5, sigma=1.3):
    """Draw one cost from a lognormal distribution → realistic cost spread.

    ``mean`` and ``sigma`` are passed straight to ``np.random.lognormal``
    (NumPy's global RNG); the draw is rounded to 2 decimal places.
    """
    draw = np.random.lognormal(mean=mean, sigma=sigma)
    return round(draw, 2)

def random_quantity(element_type):
    """Return a random integer count whose upper bound depends on the type name.

    Names containing "Door" or "Window" are capped at 80, names containing
    "Furniture" at 200, and everything else at 120. The lower bound is
    always 1 and both bounds are inclusive.
    """
    if any(tag in element_type for tag in ("Door", "Window")):
        upper = 80
    elif "Furniture" in element_type:
        upper = 200
    else:
        upper = 120
    return random.randint(1, upper)

#### Generate Synthetic Data



In [3]:
# Build one dict per synthetic BIM element, then turn the list into a DataFrame.
# The order of the random draws below determines the generated values for a
# given seed, so keep the dict entries in this order when editing.
data = []

for i in range(NUM_ELEMENTS):
    # Type and material are drawn independently of each other.
    element_type = random.choice(ELEMENT_TYPES)
    material = random.choice(MATERIALS)

    row = {
        # Synthetic identifier; random digits only, so uniqueness is not guaranteed.
        "GlobalId": f"2${random.randint(10000000,99999999):08d}${random.randint(1000,9999)}",
        "Name": f"{element_type.split('Ifc')[-1]} {random.randint(1,999):03d}",
        "ElementType": element_type,
        "PredefinedType": random.choice(["STANDARD", "USERDEFINED", None]),
        "Material": material,
        # Geometry fields are only considered for matching element types, and
        # even then have a 50% chance of being left empty (choice of None / value).
        "Length_mm": random.choice([None, round(random.uniform(600, 7000), 1)])
                     if any(t in element_type for t in ["Wall", "Beam", "Column"]) else None,
        "Area_m2": random.choice([None, round(random.uniform(0.5, 60), 2)])
                   if any(t in element_type for t in ["Slab", "Wall", "Roof", "Covering"]) else None,
        "Volume_m3": random.choice([None, round(random.uniform(0.1, 25), 3)])
                     if any(t in element_type for t in ["Column", "Beam", "Wall"]) else None,
        "UnitQuantity": random.choice(UNITS),
        "Count": random_quantity(element_type),
        "UnitCost_ETB": random_cost(),
        "TotalCost_ETB": None,  # filled in below from Count * UnitCost_ETB
        # NOTE(review): the two dates are drawn independently, so LastUpdated
        # can fall before InstallationDate. Confirm whether that is intended.
        "InstallationDate": random_date(START_DATE, END_DATE).strftime("%Y-%m-%d"),
        "LastUpdated": random_date(START_DATE, END_DATE).strftime("%Y-%m-%d"),
        "RoomName": random.choice(["Living Room", "Bedroom", "Kitchen", "Bathroom", "Office", "Corridor", "Lobby", None]),
        "Level": f"Level {random.randint(0, 9)}",
        "ClashStatus": random.choice(["No Clash", "Minor Clash", "Major Clash", None])
    }

    # Calculate total cost. Compare against None explicitly: a truthiness test
    # would also treat a legitimate 0 count or 0.00 unit cost as "missing" and
    # leave the total empty instead of 0.
    if row["Count"] is not None and row["UnitCost_ETB"] is not None:
        row["TotalCost_ETB"] = round(row["Count"] * row["UnitCost_ETB"], 2)

    data.append(row)

df_bim = pd.DataFrame(data)

# Blank out an additional random share of each listed column. The rate is drawn
# per column from 12–18% and comes on top of the None values already injected
# above, so the final missing share of these columns is higher than 18%.
for col in ["Length_mm", "Area_m2", "Volume_m3", "RoomName", "ClashStatus"]:
    mask = np.random.rand(len(df_bim)) < random.uniform(0.12, 0.18)
    df_bim.loc[mask, col] = np.nan

# Save. "utf-8-sig" prefixes the file with a BOM — presumably so spreadsheet
# tools detect UTF-8 for values such as "m²"; confirm with the consumers.
OUTPUT_FILE = DATA_DIR / "synthetic_bim_elements_2025.csv"
df_bim.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")

print(f"Saved {len(df_bim)} synthetic BIM elements")
print(f"File: {OUTPUT_FILE}")

Saved 950 synthetic BIM elements
File: data\synthetic_bim\synthetic_bim_elements_2025.csv


#### Quick Exploration & Validation



In [4]:
# Overall dimensions, then the number of missing cells in every column.
print(f"Data shape: {df_bim.shape}")
print("\nMissing values:", df_bim.isna().sum(), sep="\n")

Data shape: (950, 17)

Missing values:
GlobalId              0
Name                  0
ElementType           0
PredefinedType      340
Material              0
Length_mm           840
Area_m2             812
Volume_m3           836
UnitQuantity          0
Count                 0
UnitCost_ETB          0
TotalCost_ETB         0
InstallationDate      0
LastUpdated           0
RoomName            269
Level                 0
ClashStatus         363
dtype: int64


In [10]:
# Five most frequent element types (value_counts is already sorted descending).
print("\nTop 5 element types:", df_bim["ElementType"].value_counts().iloc[:5], sep="\n")


Top 5 element types:
ElementType
IfcFurniture    79
IfcDoor         78
IfcSlab         70
IfcWindow       67
IfcBeam         67
Name: count, dtype: int64


In [6]:
# Summary statistics of the per-element total cost.
print("\nPrice statistics (TotalCost_ETB):", df_bim["TotalCost_ETB"].describe(), sep="\n")


Price statistics (TotalCost_ETB):
count    9.500000e+02
mean     7.150054e+05
std      2.023596e+06
min      7.419600e+02
25%      7.126540e+04
50%      2.277565e+05
75%      6.334760e+05
max      3.899106e+07
Name: TotalCost_ETB, dtype: float64


In [7]:
# InstallationDate holds "YYYY-MM-DD" strings, so the string min/max is also
# the earliest/latest date.
print("\nInstallation date range:")
print(f'{df_bim["InstallationDate"].min()} → {df_bim["InstallationDate"].max()}')


Installation date range:
2020-01-03 → 2025-12-29


In [9]:
# Rich display of the leading rows; head() returns the first 5 by default.
print("\nSample rows (first 5):")
display(df_bim.head())


Sample rows (first 5):


Unnamed: 0,GlobalId,Name,ElementType,PredefinedType,Material,Length_mm,Area_m2,Volume_m3,UnitQuantity,Count,UnitCost_ETB,TotalCost_ETB,InstallationDate,LastUpdated,RoomName,Level,ClashStatus
0,2$13356886$5506,Covering 251,IfcCovering,STANDARD,Brick Masonry,,,,set,95,9374.31,890559.45,2020-12-22,2024-09-24,Living Room,Level 0,No Clash
1,2$77827638$1434,DoorStandardCase 575,IfcDoorStandardCase,STANDARD,Porcelain Tile,,,,set,70,4106.21,287434.7,2024-09-14,2022-06-21,,Level 9,Major Clash
2,2$31429110$7924,Furniture 349,IfcFurniture,USERDEFINED,Concrete C25/30,,,,m,56,11407.14,638799.84,2023-10-10,2021-02-22,Bedroom,Level 6,No Clash
3,2$91030736$5333,WindowStandardCase 827,IfcWindowStandardCase,STANDARD,Wood - Teak,,,,set,59,35594.23,2100059.57,2021-05-26,2024-03-30,Bedroom,Level 8,Major Clash
4,2$87490893$4150,Space 722,IfcSpace,STANDARD,Wood - Teak,,,,m²,85,3624.97,308122.45,2022-07-22,2023-03-31,Bedroom,Level 3,No Clash
