#### Imports & Load Data



In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Resolve the location of the synthetic BIM element export.
DATA_DIR = Path("data") / "synthetic_bim"
INPUT_FILE = DATA_DIR.joinpath("synthetic_bim_elements_2025.csv")

# Stop immediately with an explicit message when the input CSV is absent,
# rather than letting read_csv fail further down.
if not INPUT_FILE.exists():
    raise FileNotFoundError(f"Phase 1 file not found: {INPUT_FILE}")

df = pd.read_csv(INPUT_FILE)

# First look at the table: row count, leading rows, column names, null counts.
print(f"Loaded {df.shape[0]} synthetic BIM elements")
display(df.head(5))
print("\nColumns:", list(df.columns))
print("\nMissing values:")
print(df.isnull().sum())

Loaded 950 synthetic BIM elements


Unnamed: 0,GlobalId,Name,ElementType,PredefinedType,Material,Length_mm,Area_m2,Volume_m3,UnitQuantity,Count,UnitCost_ETB,TotalCost_ETB,InstallationDate,LastUpdated,RoomName,Level,ClashStatus
0,2$13356886$5506,Covering 251,IfcCovering,STANDARD,Brick Masonry,,,,set,95,9374.31,890559.45,2020-12-22,2024-09-24,Living Room,Level 0,No Clash
1,2$77827638$1434,DoorStandardCase 575,IfcDoorStandardCase,STANDARD,Porcelain Tile,,,,set,70,4106.21,287434.7,2024-09-14,2022-06-21,,Level 9,Major Clash
2,2$31429110$7924,Furniture 349,IfcFurniture,USERDEFINED,Concrete C25/30,,,,m,56,11407.14,638799.84,2023-10-10,2021-02-22,Bedroom,Level 6,No Clash
3,2$91030736$5333,WindowStandardCase 827,IfcWindowStandardCase,STANDARD,Wood - Teak,,,,set,59,35594.23,2100059.57,2021-05-26,2024-03-30,Bedroom,Level 8,Major Clash
4,2$87490893$4150,Space 722,IfcSpace,STANDARD,Wood - Teak,,,,m²,85,3624.97,308122.45,2022-07-22,2023-03-31,Bedroom,Level 3,No Clash



Columns: ['GlobalId', 'Name', 'ElementType', 'PredefinedType', 'Material', 'Length_mm', 'Area_m2', 'Volume_m3', 'UnitQuantity', 'Count', 'UnitCost_ETB', 'TotalCost_ETB', 'InstallationDate', 'LastUpdated', 'RoomName', 'Level', 'ClashStatus']

Missing values:
GlobalId              0
Name                  0
ElementType           0
PredefinedType      340
Material              0
Length_mm           840
Area_m2             812
Volume_m3           836
UnitQuantity          0
Count                 0
UnitCost_ETB          0
TotalCost_ETB         0
InstallationDate      0
LastUpdated           0
RoomName            269
Level                 0
ClashStatus         363
dtype: int64


#### Validation & Quality Report

In [2]:
def bim_quality_report(df):
    """Summarise basic data-quality indicators for a table of BIM elements.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``GlobalId``, ``Name``, ``ElementType``,
        ``Material``, ``TotalCost_ETB``, ``RoomName``, ``Count``,
        ``InstallationDate`` and ``LastUpdated``.

    Returns
    -------
    pandas.Series
        One entry per indicator, in this order: ``Total Elements``,
        ``Unique GlobalIds``, ``Duplicate GlobalIds``,
        ``Missing Key Fields (%)`` (a dict mapping column name to the
        percentage of missing values), ``Invalid Quantities`` (rows with
        ``Count <= 0``), ``Negative Costs`` and ``Date Issues`` (rows whose
        installation date is later than the last-updated date).

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.

    Notes
    -----
    Date strings that cannot be parsed are coerced to ``NaT`` instead of
    raising, so a single malformed value does not abort the whole report.
    Comparisons involving ``NaT`` are False, hence such rows are not counted
    under ``Date Issues``.
    """
    key_fields = ["Name", "ElementType", "Material", "TotalCost_ETB", "RoomName"]

    # Parse both date columns once; errors="coerce" keeps the report usable
    # on the very kind of dirty input it is meant to describe.
    installed = pd.to_datetime(df["InstallationDate"], errors="coerce")
    updated = pd.to_datetime(df["LastUpdated"], errors="coerce")

    report = {}

    report["Total Elements"] = len(df)
    report["Unique GlobalIds"] = df["GlobalId"].nunique()
    report["Duplicate GlobalIds"] = df["GlobalId"].duplicated().sum()

    # Share of missing values per key column, expressed in percent.
    report["Missing Key Fields (%)"] = {
        column: df[column].isna().mean() * 100 for column in key_fields
    }

    report["Invalid Quantities"] = ((df["Count"] <= 0) & df["Count"].notna()).sum()
    report["Negative Costs"] = (df["TotalCost_ETB"] < 0).sum()

    report["Date Issues"] = (installed > updated).sum()

    return pd.Series(report)

# Build the summary report and render it.
quality = bim_quality_report(df)
print("BIM Data Quality Report:")
display(quality)

# Tag every row with a ';'-terminated list of the checks it fails.
# Each check is evaluated and applied in turn; rows passing all checks keep "".
df["ValidationFlags"] = ""

repeated_id = df["GlobalId"].duplicated()
df.loc[repeated_id, "ValidationFlags"] += "DuplicateGlobalId;"

cost_missing = df["TotalCost_ETB"].isnull()
df.loc[cost_missing, "ValidationFlags"] += "MissingCost;"

count_not_positive = df["Count"].le(0)
df.loc[count_not_positive, "ValidationFlags"] += "InvalidCount;"

# An element cannot have been installed after its last recorded update.
installed_on = pd.to_datetime(df["InstallationDate"])
updated_on = pd.to_datetime(df["LastUpdated"])
df.loc[installed_on.gt(updated_on), "ValidationFlags"] += "DateLogicError;"

BIM Data Quality Report:


Total Elements                                                          950
Unique GlobalIds                                                        950
Duplicate GlobalIds                                                       0
Missing Key Fields (%)    {'Name': 0.0, 'ElementType': 0.0, 'Material': ...
Invalid Quantities                                                        0
Negative Costs                                                            0
Date Issues                                                             449
dtype: object

#### Simulated Clash Detection

In [3]:
# Every element starts out as clash-free.
df["SimulatedClash"] = "No Clash"

# A row is marked as a potential clash when all three conditions hold:
#   1. it is assigned to a room,
#   2. at least one other row has the same room, element type and material,
#   3. its Count exceeds 10.
has_room = df["RoomName"].notnull()
shares_room_type_material = df.duplicated(
    subset=["RoomName", "ElementType", "Material"], keep=False
)
count_above_ten = df["Count"].gt(10)
clash_mask = has_room & shares_room_type_material & count_above_ten

df.loc[clash_mask, "SimulatedClash"] = "Potential Clash (same type/material in room)"

print("Clash detection summary:")
print(df["SimulatedClash"].value_counts())

print("\nSample clashes:")
clash_preview_columns = ["Name", "ElementType", "Material", "RoomName", "Count", "SimulatedClash"]
display(df.loc[df["SimulatedClash"] != "No Clash", clash_preview_columns].head(10))

Clash detection summary:
SimulatedClash
No Clash                                        766
Potential Clash (same type/material in room)    184
Name: count, dtype: int64

Sample clashes:


Unnamed: 0,Name,ElementType,Material,RoomName,Count,SimulatedClash
2,Furniture 349,IfcFurniture,Concrete C25/30,Bedroom,56,Potential Clash (same type/material in room)
4,Space 722,IfcSpace,Wood - Teak,Bedroom,85,Potential Clash (same type/material in room)
10,Window 004,IfcWindow,Aluminum,Bedroom,34,Potential Clash (same type/material in room)
12,BuildingStorey 247,IfcBuildingStorey,Granite Slab,Kitchen,11,Potential Clash (same type/material in room)
18,WallStandardCase 498,IfcWallStandardCase,Porcelain Tile,Lobby,119,Potential Clash (same type/material in room)
23,BuildingStorey 292,IfcBuildingStorey,Reinforced Concrete,Living Room,107,Potential Clash (same type/material in room)
28,Furniture 184,IfcFurniture,Gypsum Board,Kitchen,106,Potential Clash (same type/material in room)
29,Space 205,IfcSpace,Concrete C30/37,Living Room,40,Potential Clash (same type/material in room)
39,Beam 785,IfcBeam,Marble Slab,Corridor,37,Potential Clash (same type/material in room)
40,Door 711,IfcDoor,AAC Block,Lobby,28,Potential Clash (same type/material in room)


#### Anomaly Detection (Isolation Forest on cost & quantity)

In [4]:
# Columns the isolation forest is trained on.
features = ["Count", "TotalCost_ETB"]

# Replace missing values with the per-column median before fitting.
feature_medians = df[features].median()
df[features] = df[features].fillna(feature_medians)

# Fixed random_state keeps the flagged rows stable between runs;
# contamination sets the share of rows labelled as anomalies.
iso_forest = IsolationForest(
    random_state=42,
    contamination=0.08,
    n_estimators=100,
)

# The numeric labels 1 / -1 are translated into readable categories.
raw_labels = iso_forest.fit_predict(df[features])
df["CostQuantityAnomaly"] = pd.Series(raw_labels, index=df.index).map(
    {1: "Normal", -1: "Anomaly"}
)

print("Anomaly detection results:")
print(df["CostQuantityAnomaly"].value_counts(normalize=True) * 100)

print("\nTop 10 anomalies (highest cost outliers):")
anomaly_rows = df[df["CostQuantityAnomaly"] == "Anomaly"]
anomaly_preview_columns = ["Name", "ElementType", "Material", "TotalCost_ETB", "Count", "CostQuantityAnomaly"]
display(
    anomaly_rows
    .sort_values("TotalCost_ETB", ascending=False)
    .head(10)[anomaly_preview_columns]
)

Anomaly detection results:
CostQuantityAnomaly
Normal     92.0
Anomaly     8.0
Name: proportion, dtype: float64

Top 10 anomalies (highest cost outliers):


Unnamed: 0,Name,ElementType,Material,TotalCost_ETB,Count,CostQuantityAnomaly
209,Furniture 170,IfcFurniture,Epoxy Coating,38991056.43,53,Anomaly
478,Column 739,IfcColumn,Brick Masonry,26902109.0,100,Anomaly
113,Furniture 245,IfcFurniture,Glass - Clear 6mm,14621525.05,121,Anomaly
847,Roof 034,IfcRoof,Glass - Tinted 8mm,13634154.9,114,Anomaly
614,WallStandardCase 506,IfcWallStandardCase,Granite Slab,13229721.12,112,Anomaly
179,Column 633,IfcColumn,Glass - Clear 6mm,11644288.89,69,Anomaly
762,BuildingStorey 827,IfcBuildingStorey,Paint - Emulsion White,10415891.96,76,Anomaly
880,DoorStandardCase 029,IfcDoorStandardCase,Paint - Acrylic Grey,9188905.6,70,Anomaly
641,Furniture 484,IfcFurniture,Steel - Galvanized,8391479.85,195,Anomaly
374,Column 918,IfcColumn,Glass - Tinted 8mm,7994441.07,99,Anomaly


#### Enrich & Save



In [5]:
# Derived columns.
# A zero Count is turned into NaN first so the division yields NaN, not inf.
nonzero_count = df["Count"].replace(0, np.nan)
df["CostPerUnit"] = df["TotalCost_ETB"].div(nonzero_count)

# Rows whose total cost lies above the 90th percentile are marked high-value.
high_value_cutoff = df["TotalCost_ETB"].quantile(0.90)
df["IsHighValue"] = df["TotalCost_ETB"].gt(high_value_cutoff)

# Write the enriched table next to the input file (UTF-8 with BOM, no index column).
OUTPUT_ENRICHED = DATA_DIR.joinpath("synthetic_bim_enriched_2025.csv")
df.to_csv(OUTPUT_ENRICHED, encoding="utf-8-sig", index=False)

print(f"Enriched dataset saved: {OUTPUT_ENRICHED}")
print(f"Rows: {df.shape[0]}")
display(df.head(5))

Enriched dataset saved: data\synthetic_bim\synthetic_bim_enriched_2025.csv
Rows: 950


Unnamed: 0,GlobalId,Name,ElementType,PredefinedType,Material,Length_mm,Area_m2,Volume_m3,UnitQuantity,Count,...,InstallationDate,LastUpdated,RoomName,Level,ClashStatus,ValidationFlags,SimulatedClash,CostQuantityAnomaly,CostPerUnit,IsHighValue
0,2$13356886$5506,Covering 251,IfcCovering,STANDARD,Brick Masonry,,,,set,95,...,2020-12-22,2024-09-24,Living Room,Level 0,No Clash,,No Clash,Normal,9374.31,False
1,2$77827638$1434,DoorStandardCase 575,IfcDoorStandardCase,STANDARD,Porcelain Tile,,,,set,70,...,2024-09-14,2022-06-21,,Level 9,Major Clash,DateLogicError;,No Clash,Normal,4106.21,False
2,2$31429110$7924,Furniture 349,IfcFurniture,USERDEFINED,Concrete C25/30,,,,m,56,...,2023-10-10,2021-02-22,Bedroom,Level 6,No Clash,DateLogicError;,Potential Clash (same type/material in room),Normal,11407.14,False
3,2$91030736$5333,WindowStandardCase 827,IfcWindowStandardCase,STANDARD,Wood - Teak,,,,set,59,...,2021-05-26,2024-03-30,Bedroom,Level 8,Major Clash,,No Clash,Normal,35594.23,True
4,2$87490893$4150,Space 722,IfcSpace,STANDARD,Wood - Teak,,,,m²,85,...,2022-07-22,2023-03-31,Bedroom,Level 3,No Clash,,Potential Clash (same type/material in room),Normal,3624.97,False
