In [7]:
import os, math, joblib, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder   , OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, brier_score_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
try: import xgboost as xgb
except Exception: xgb = None
try: import lightgbm as lgb
except Exception: lgb = None

In [8]:
# Load the mapped building-collapse dataset. The trailing .copy() keeps the
# working frame independent of the object returned by the parser.
df = pd.read_csv('building_collapse_nigeria_mapped.csv').copy()

In [9]:
# Columns that receive missing-value imputation in this cell.
cat_cols = ['material_quality','building_type','state_name','lga_mapped']
num_cols = ['building_age','num_floors','proximity_to_water','risk_score']

# Categorical gaps: building_type falls back to 'Residential', every other
# categorical column to 'Unknown'. Columns absent from the frame are skipped.
for c in cat_cols:
    if c in df.columns:
        df[c] = df[c].fillna('Residential' if c == 'building_type' else 'Unknown')

# Numeric gaps:
#   num_floors          -> 1
#   building_age        -> column median, or 10 if the column is entirely missing
#   proximity_to_water  -> column median, or 200.0 if the column is entirely missing
#   anything else       -> 0.0
# NOTE(review): the median is computed over the whole frame, i.e. before the
# train/validation split performed later in the notebook.
# NOTE(review): a missing risk_score becomes 0.0, which the labelling cell then
# maps to class 0 -- confirm that treating unknown scores as "no risk" is intended.
for c in num_cols:
    if c not in df.columns:
        continue
    if c == 'num_floors':
        fill_value = 1
    elif c == 'building_age':
        fill_value = df[c].median() if df[c].notna().any() else 10
    elif c == 'proximity_to_water':
        fill_value = df[c].median() if df[c].notna().any() else 200.0
    else:
        fill_value = 0.0
    df[c] = df[c].fillna(fill_value)

# derived features used downstream
mq_map = {'Low':0,'Medium':1,'High':2}
if 'material_quality' in df.columns:
    # Labels outside mq_map (including the 'Unknown' filler) map to NaN and are
    # then treated as the middle level, 1.
    df['material_quality_ord'] = df['material_quality'].map(mq_map).fillna(1)
else:
    # Explicit fallback: assigning an empty Series here would align on the index
    # and leave the whole column NaN instead of the intended default of 1.
    df['material_quality_ord'] = 1

# NOTE(review): a proximity of exactly -1 makes prox_inv infinite and values
# below -1 make prox_log NaN; assumes distances are non-negative -- TODO confirm.
df['prox_inv'] = 1.0 / (1.0 + df['proximity_to_water'])
df['prox_log'] = np.log1p(df['proximity_to_water'])

if 'building_type' in df.columns:
    df['industrial_flag'] = (df['building_type'] == 'Industrial').astype(int)
else:
    # Without the column every row counts as the 'Residential' default, so the
    # flag is 0; comparing the scalar default would yield a plain bool that has
    # no .astype().
    df['industrial_flag'] = 0

# Interaction terms built from the columns above.
df['industrial_floors'] = df['industrial_flag'] * df['num_floors']
df['age_exposure'] = df['building_age'] * df['prox_inv']
df['floors_material'] = df['num_floors'] * df['material_quality_ord']

In [10]:
# Binary target: 1 when risk_score is at least 0.5, otherwise 0.
df['collapse_risk'] = df['risk_score'].ge(0.5).astype(int)

# Model inputs: three raw categorical columns followed by the numeric and
# engineered columns.
features = [
    'state_name', 'lga_mapped', 'building_type',
    'material_quality_ord', 'building_age', 'num_floors',
    'prox_log', 'prox_inv',
    'industrial_flag', 'industrial_floors',
    'age_exposure', 'floors_material',
]
X = df[features].copy()
y = df['collapse_risk']


In [11]:
# Hold out 20% of the rows for validation, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [14]:
# Column groups, one per preprocessing branch.
num_cols_pipe = [
    'material_quality_ord', 'building_age', 'num_floors',
    'prox_log', 'prox_inv',
    'industrial_flag', 'industrial_floors',
    'age_exposure', 'floors_material',
]
ord_cols = ['state_name', 'lga_mapped']
ohe_cols = ['building_type']

# Numeric columns are standardised, the two location columns are integer-coded
# (categories unseen at fit time become -1), and building_type is one-hot
# encoded into a dense array (unseen categories are ignored). Any column not
# listed above is dropped.
preproc = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols_pipe),
        (
            'ord',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            ord_cols,
        ),
        (
            'ohe',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            ohe_cols,
        ),
    ],
    remainder='drop',
)

In [None]:
# Candidate classifiers. The two boosted-tree libraries are optional: they are
# only registered when their import succeeded in the imports cell.
candidates = {}
if xgb is not None:
    candidates['xgb'] = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
if lgb is not None:
    candidates['lgb'] = lgb.LGBMClassifier(random_state=42, n_jobs=-1)
candidates.update({
    'rf': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    'gb': GradientBoostingClassifier(random_state=42),
    'lr': LogisticRegression(max_iter=1000)
})

# quick CV to pick best model by f1 (3-fold to save time)
# Selection runs on the training split only, so the hold-out rows used for the
# validation metrics later in the notebook do not influence which model wins.
scores = {}
for name, mdl in candidates.items():
    pipe = Pipeline([('pre', preproc), ('clf', mdl)])
    try:
        cv = cross_val_score(pipe, X_train, y_train, cv=3, scoring='f1', n_jobs=-1)
        mean_f1 = float(np.mean(cv))
    except Exception as exc:
        # Best-effort: a broken candidate must not stop the comparison, but the
        # reason is reported instead of being swallowed.
        print(f'CV failed for {name}: {exc!r}')
        mean_f1 = float('nan')
    # cross_val_score reports failed fits as NaN by default rather than raising;
    # NaN never compares as smaller, so it could win max() below. Map every
    # non-finite score to the -1.0 failure sentinel (valid F1 is never negative).
    scores[name] = mean_f1 if np.isfinite(mean_f1) else -1.0

best_name = max(scores, key=scores.get)
best_model = candidates[best_name]
if scores[best_name] < 0:
    print('Warning: every candidate failed cross-validation; '
          f'falling back to {best_name!r} without a valid score.')


In [None]:
from sklearn.calibration import CalibratedClassifierCV

# Fit the selected model on the training split and score the hold-out rows.
pipeline = Pipeline([('pre', preproc), ('clf', best_model)])
pipeline.fit(X_train, y_train)
probs_val = pipeline.predict_proba(X_val)[:,1] if hasattr(pipeline, 'predict_proba') else pipeline.predict(X_val)

# Expected calibration error is not computed yet; the name is kept as a None
# placeholder.
ece = None

# simple calibration decision: use Platt if brier improves
# `calibrated` ends up as:
#   True  -> the calibrated pipeline replaced `pipeline`
#   False -> calibration was attempted but raised
#   None  -> calibration was skipped (fewer than 200 training rows) or did not
#            improve the Brier score
# NOTE(review): the keep/discard decision uses the same validation rows that
# are reported on afterwards, so the reported metrics are slightly optimistic.
brier_base = brier_score_loss(y_val, probs_val)
calibrated = None
if len(y_train) >= 200:
    try:
        # `estimator` is the current keyword; the former `base_estimator`
        # spelling is rejected by recent scikit-learn releases, which would send
        # every run into the except branch and silently disable calibration.
        calib = CalibratedClassifierCV(estimator=best_model, method='sigmoid', cv=3)
        pipe_calib = Pipeline([('pre', preproc), ('clf', calib)])
        pipe_calib.fit(X_train, y_train)
        probs_val_c = pipe_calib.predict_proba(X_val)[:,1]
        if brier_score_loss(y_val, probs_val_c) <= brier_base:
            pipeline = pipe_calib
            calibrated = True
    except Exception as exc:
        # Best-effort step: keep the uncalibrated pipeline, but say why.
        calibrated = False
        print(f'Calibration skipped: {exc!r}')

In [None]:
# Persist the fitted pipeline together with the name of the winning model.
joblib.dump(
    {'pipeline': pipeline, 'model_name': best_name},
    'collapse_model_baseline.joblib',
)

# Hard labels for the hold-out rows: threshold the positive-class probability
# at 0.5 when probabilities are available, otherwise use predict() directly.
if hasattr(pipeline, 'predict_proba'):
    preds = (pipeline.predict_proba(X_val)[:, 1] >= 0.5).astype(int)
else:
    preds = pipeline.predict(X_val)

print('Model:', best_name, 'Val f1:', f1_score(y_val, preds))
print('Confusion matrix:\n', confusion_matrix(y_val, preds))

In [None]:
# Hold-out rows the model labelled 0 although the true label is 1.
fn_mask = (y_val==1) & (preds==0)
fn_idx = list(X_val.index[fn_mask])
print('False negatives:', len(fn_idx))

# sensitivity sweep for proximity (sample up to 200 rows)
# The first 200 hold-out rows are scored repeatedly with proximity_to_water
# forced to each grid value (51 evenly spaced points from 0 to 500).
sample_idx = X_val.index.tolist()[:200]
prox_grid = np.linspace(0, 500, 51)

# Loop-invariant setup, done once: the feature rows as the model saw them
# (already imputed by the preprocessing cell) and the scoring method to use.
# Only the three proximity-dependent features change inside the loop, so the
# remaining features are taken as stored rather than being recomputed with a
# second, different set of fill defaults.
base_rows = df.loc[sample_idx, features].copy()
use_proba = hasattr(pipeline, 'predict_proba')

sweep_means = []
for p in prox_grid:
    prox_inv = 1.0 / (1.0 + p)
    Xs = base_rows.assign(
        prox_log=np.log1p(p),
        prox_inv=prox_inv,
        age_exposure=base_rows['building_age'] * prox_inv,
    )
    probs = pipeline.predict_proba(Xs)[:,1] if use_proba else pipeline.predict(Xs)
    sweep_means.append(probs.mean())

# One row per grid point: the forced proximity and the mean predicted score.
pd.DataFrame({'proximity_m': prox_grid, 'mean_prob': sweep_means}).to_csv('sensitivity_proximity.csv', index=False)
print('Wrote collapse_model_baseline.joblib and sensitivity_proximity.csv')