#### Load enriched data and prepare feature columns


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

DATA_DIR = Path("data/synthetic_bim")
ENRICHED_FILE = DATA_DIR / "synthetic_bim_enriched_2025.csv"

df = pd.read_csv(ENRICHED_FILE)
print(f"Loaded {len(df)} enriched BIM elements")
print("Columns:", list(df.columns))

# Dates become "days elapsed since the reference date" so a tree model can
# consume them as plain numbers. Unparseable dates turn into NaT (coerce).
reference_date = pd.to_datetime("2020-01-01")
parsed_install = pd.to_datetime(df["InstallationDate"], errors='coerce')
parsed_update = pd.to_datetime(df["LastUpdated"], errors='coerce')

install_days = (parsed_install - reference_date).dt.days
update_days = (parsed_update - reference_date).dt.days

# Rows whose date could not be parsed receive the column median.
median_install = install_days.median()
median_update = update_days.median()

# Attach the day counts and discard the raw date columns in one step.
df = df.assign(
    InstallationDays=install_days.fillna(median_install),
    LastUpdatedDays=update_days.fillna(median_update),
).drop(columns=["InstallationDate", "LastUpdated"], errors="ignore")

# Columns that are not used as model inputs (IDs, free text, QA flags).
drop_cols = ["GlobalId", "Name", "ValidationFlags", "SimulatedClash", "CostQuantityAnomaly"]
df = df.drop(columns=drop_cols, errors="ignore")

# Quantities the following cells learn to predict.
targets = ["Area_m2", "Volume_m3", "TotalCost_ETB"]

# Split the remaining columns into numeric inputs (targets excluded) and
# categorical inputs that will be one-hot encoded later.
numeric_cols = [
    col for col in df.select_dtypes(include=["number"]).columns
    if col not in targets
]
categorical_cols = list(df.select_dtypes(include=["object", "category"]).columns)

print(f"Numeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")
print("Sample numeric columns:", numeric_cols[:8])
print("Sample categorical columns:", categorical_cols[:5])

Loaded 950 enriched BIM elements
Columns: ['GlobalId', 'Name', 'ElementType', 'PredefinedType', 'Material', 'Length_mm', 'Area_m2', 'Volume_m3', 'UnitQuantity', 'Count', 'UnitCost_ETB', 'TotalCost_ETB', 'InstallationDate', 'LastUpdated', 'RoomName', 'Level', 'ClashStatus', 'ValidationFlags', 'SimulatedClash', 'CostQuantityAnomaly', 'CostPerUnit', 'IsHighValue']
Numeric features: 6
Categorical features: 7
Sample numeric columns: ['Length_mm', 'Count', 'UnitCost_ETB', 'CostPerUnit', 'InstallationDays', 'LastUpdatedDays']
Sample categorical columns: ['ElementType', 'PredefinedType', 'Material', 'UnitQuantity', 'RoomName']


#### One-hot encode categoricals & impute missing values

In [2]:
# Fill gaps in the numeric inputs with each column's median.
numeric_imputer = SimpleImputer(strategy="median")
df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

# One-hot encode the categorical inputs. sparse_output=False yields a dense
# numpy array that can be stacked next to the numeric block; drop="first"
# removes one level per column to avoid redundant indicator columns.
encoder = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_cols])

# Design matrix: numeric columns first, then the indicator columns.
numeric_block = df[numeric_cols].to_numpy()
X = np.concatenate([numeric_block, encoded_cats], axis=1)

# Column labels of X, in the same order as its columns.
feature_names = [*numeric_cols, *encoder.get_feature_names_out(categorical_cols)]

print(f"Final feature matrix shape: {X.shape}")
print("First 10 feature names:", feature_names[:10])

Final feature matrix shape: (950, 65)
First 10 feature names: ['Length_mm', 'Count', 'UnitCost_ETB', 'CostPerUnit', 'InstallationDays', 'LastUpdatedDays', 'ElementType_IfcBuildingStorey', 'ElementType_IfcColumn', 'ElementType_IfcCovering', 'ElementType_IfcDoor']


#### Predict missing Area_m2 (Random Forest Regressor)

In [3]:
# Impute missing Area_m2 with a random forest trained on the rows where the
# area is known, and report both an in-sample and a hold-out error.

# Rows with known Area_m2 are the training set; the rest are to be imputed.
known_mask = df["Area_m2"].notna()
missing_mask = df["Area_m2"].isna()
X_area = X[known_mask.to_numpy()]
y_area = df.loc[known_mask, "Area_m2"]

print(f"Training on {len(y_area)} known Area_m2 values")

# Honest error estimate: fit on 80% of the known rows, score on the other 20%.
# The in-sample MAE printed further down is optimistic because the forest has
# already seen those rows.
mae_area_holdout = np.nan
if len(y_area) >= 2:
    X_area_train, X_area_test, y_area_train, y_area_test = train_test_split(
        X_area, y_area, test_size=0.2, random_state=42
    )
    rf_area_holdout = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_area_holdout.fit(X_area_train, y_area_train)
    mae_area_holdout = mean_absolute_error(y_area_test, rf_area_holdout.predict(X_area_test))

# Final model uses every known row.
rf_area = RandomForestRegressor(n_estimators=100, random_state=42)
rf_area.fit(X_area, y_area)

# Predict missing values. Guarded because predict() raises on a 0-row array,
# which happens when nothing is missing (e.g. the cell is re-run after the
# gaps were already filled).
if missing_mask.any():
    df.loc[missing_mask, "Area_m2_pred"] = rf_area.predict(X[missing_mask.to_numpy()])
else:
    print("No missing Area_m2 values - nothing to predict")

# MAE on the training rows (kept for continuity) plus the hold-out estimate.
mae_area = mean_absolute_error(y_area, rf_area.predict(X_area))
print(f"Area_m2 prediction MAE (on known): {mae_area:.2f} m²")
print(f"Area_m2 prediction MAE (20% hold-out): {mae_area_holdout:.2f} m²")

Training on 138 known Area_m2 values
Area_m2 prediction MAE (on known): 5.23 m²


#### Predict missing Volume_m3 (Random Forest Regressor)

In [4]:
# Impute missing Volume_m3 with a random forest trained on the rows where the
# volume is known, and report both an in-sample and a hold-out error.

# Rows with known Volume_m3 are the training set; the rest are to be imputed.
known_mask_vol = df["Volume_m3"].notna()
missing_mask_vol = df["Volume_m3"].isna()
X_vol = X[known_mask_vol.to_numpy()]
y_vol = df.loc[known_mask_vol, "Volume_m3"]

print(f"Training on {len(y_vol)} known Volume_m3 values")

# Honest error estimate: fit on 80% of the known rows, score on the other 20%.
# The in-sample MAE printed further down is optimistic because the forest has
# already seen those rows.
mae_vol_holdout = np.nan
if len(y_vol) >= 2:
    X_vol_train, X_vol_test, y_vol_train, y_vol_test = train_test_split(
        X_vol, y_vol, test_size=0.2, random_state=42
    )
    rf_vol_holdout = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_vol_holdout.fit(X_vol_train, y_vol_train)
    mae_vol_holdout = mean_absolute_error(y_vol_test, rf_vol_holdout.predict(X_vol_test))

# Final model uses every known row.
rf_vol = RandomForestRegressor(n_estimators=100, random_state=42)
rf_vol.fit(X_vol, y_vol)

# Predict missing values. Guarded because predict() raises on a 0-row array,
# which happens when nothing is missing (e.g. the cell is re-run after the
# gaps were already filled).
if missing_mask_vol.any():
    df.loc[missing_mask_vol, "Volume_m3_pred"] = rf_vol.predict(X[missing_mask_vol.to_numpy()])
else:
    print("No missing Volume_m3 values - nothing to predict")

# MAE on the training rows (kept for continuity) plus the hold-out estimate.
mae_vol = mean_absolute_error(y_vol, rf_vol.predict(X_vol))
print(f"Volume_m3 prediction MAE (on known): {mae_vol:.2f} m³")
print(f"Volume_m3 prediction MAE (20% hold-out): {mae_vol_holdout:.2f} m³")

Training on 114 known Volume_m3 values
Volume_m3 prediction MAE (on known): 2.24 m³


#### Predict TotalCost_ETB

In [5]:
# Fit a cost model on the numeric features plus the (imputed) quantities,
# fill any missing TotalCost_ETB, preview the result and save the dataset.

# Use predicted Area/Volume where the measured value is missing. If a *_pred
# column was never created there is nothing to fill from.
for quantity_col in ("Area_m2", "Volume_m3"):
    prediction_col = f"{quantity_col}_pred"
    if prediction_col in df.columns:
        df[quantity_col] = df[quantity_col].fillna(df[prediction_col])

# Prepare features for cost prediction
cost_features = [col for col in numeric_cols if col != "TotalCost_ETB"]
cost_features += ["Area_m2", "Volume_m3"]  # add predicted quantities

# Rows with known costs (for training) and rows that need a prediction
known_mask_cost = df["TotalCost_ETB"].notna()
missing_cost_mask = df["TotalCost_ETB"].isna()

# TotalCost_ETB_pred is always a complete "best available cost" column:
# recorded costs where they exist, model predictions elsewhere. Starting from
# the recorded values keeps both branches below consistent (previously known
# rows were left NaN whenever at least one cost was missing).
df["TotalCost_ETB_pred"] = df["TotalCost_ETB"]

if known_mask_cost.sum() == 0:
    # Nothing to learn from; the _pred column stays all-NaN.
    print("No known TotalCost_ETB values → skipping cost prediction.")
else:
    X_cost = df.loc[known_mask_cost, cost_features]
    y_cost = df.loc[known_mask_cost, "TotalCost_ETB"]

    print(f"Training cost model on {len(y_cost)} known values")

    # Honest error estimate on a 20% hold-out; the in-sample MAE printed
    # further down is optimistic because the forest has seen those rows.
    mae_cost_holdout = np.nan
    if len(y_cost) >= 2:
        X_cost_train, X_cost_test, y_cost_train, y_cost_test = train_test_split(
            X_cost, y_cost, test_size=0.2, random_state=42
        )
        rf_cost_holdout = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_cost_holdout.fit(X_cost_train, y_cost_train)
        mae_cost_holdout = mean_absolute_error(y_cost_test, rf_cost_holdout.predict(X_cost_test))

    # Final model uses every known row.
    rf_cost = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_cost.fit(X_cost, y_cost)

    # Predict missing costs (only if any exist; predict() rejects 0 rows)
    if missing_cost_mask.any():
        df.loc[missing_cost_mask, "TotalCost_ETB_pred"] = rf_cost.predict(df.loc[missing_cost_mask, cost_features])
        print(f"Predicted {missing_cost_mask.sum()} missing costs")
    else:
        print("No missing TotalCost_ETB values – copying known costs to _pred column")

    # MAE on the training rows (kept for continuity) plus the hold-out estimate
    mae_cost = mean_absolute_error(y_cost, rf_cost.predict(X_cost))
    print(f"Cost prediction MAE (on known data): {mae_cost:,.0f} ETB")
    print(f"Cost prediction MAE (20% hold-out): {mae_cost_holdout:,.0f} ETB")

# Safe preview (use columns that actually exist)
preview_cols = ["ElementType", "TotalCost_ETB", "TotalCost_ETB_pred", "Area_m2", "Volume_m3"]
preview_cols = [c for c in preview_cols if c in df.columns]  # safe filter

print("\nSample final data (first 5 rows):")
display(df[preview_cols].head(5))

# Save final enriched dataset for dashboard
OUTPUT_PHASE3 = DATA_DIR / "synthetic_bim_ready_2025.csv"
df.to_csv(OUTPUT_PHASE3, index=False, encoding="utf-8-sig")

print(f"Dataset ready for dashboard: {OUTPUT_PHASE3}")
print(f"Rows: {len(df)}")

Training cost model on 950 known values
No missing TotalCost_ETB values – copying known costs to _pred column
Cost prediction MAE (on known data): 43,435 ETB

Sample final data (first 5 rows):


Unnamed: 0,ElementType,TotalCost_ETB,TotalCost_ETB_pred,Area_m2,Volume_m3
0,IfcCovering,890559.45,890559.45,27.5602,11.98683
1,IfcDoorStandardCase,287434.7,287434.7,33.6713,14.0918
2,IfcFurniture,638799.84,638799.84,24.9801,9.27411
3,IfcWindowStandardCase,2100059.57,2100059.57,25.5102,12.98173
4,IfcSpace,308122.45,308122.45,23.4122,11.12447


Dataset ready for dashboard: data\synthetic_bim\synthetic_bim_ready_2025.csv
Rows: 950
