# 03 — Model Comparison

Compare Logistic Regression, Random Forest, and XGBoost by ROC curve and AUC on the same test split.

In [None]:
import sys
sys.path.append('..')
from pathlib import Path
from src.model import train_model as train_logit
from src.model_rf import train_model as train_rf
from src.model_xgb import train_model as train_xgb
from src.utils import save_json
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

DATA_PATH = '../data/synthetic_households.csv'

# Train the logistic-regression model and keep the (X_test, y_test) pair it
# returns; auc_curve() below scores every model against this pair.
logit, holdout = train_logit(
    DATA_PATH,
    model_out='../outputs/model.pkl',
)
X_test, y_test = holdout

# The second return value of the RF and XGBoost trainers is discarded.
# NOTE(review): this presumes all three trainers split DATA_PATH identically
# (same seed and test fraction) -- confirm in src.model_rf / src.model_xgb,
# otherwise RF/XGB would be evaluated on rows they may have trained on.
rf, _ = train_rf(
    DATA_PATH,
    model_out='../outputs/model_rf.pkl',
)
xgb, _ = train_xgb(
    DATA_PATH,
    model_out='../outputs/model_xgb.pkl',
)

def auc_curve(pipe):
    """Return the ROC AUC and ROC curve points for a fitted classifier.

    The model is scored on the module-level ``X_test`` / ``y_test`` globals,
    so those must be defined before this function is called.

    Args:
        pipe: Fitted estimator exposing ``predict_proba``.

    Returns:
        Tuple ``(auc, fpr, tpr)`` where ``auc`` is a plain Python ``float``
        and ``fpr`` / ``tpr`` are the first two values returned by
        ``sklearn.metrics.roc_curve``.
    """
    # Column 1 of predict_proba is used as the ranking score
    # (presumably the positive class of a binary classifier).
    y_score = pipe.predict_proba(X_test)[:, 1]
    # roc_auc_score / roc_curve come from the module-level sklearn.metrics
    # import; the former function-local re-import was redundant.
    auc = float(roc_auc_score(y_test, y_score))
    fpr, tpr, _thresholds = roc_curve(y_test, y_score)
    return auc, fpr, tpr

# Score each fitted model; auc_curve() returns (auc, fpr, tpr).
a1, f1, t1 = auc_curve(logit)
a2, f2, t2 = auc_curve(rf)
a3, f3, t3 = auc_curve(xgb)

# One row per model: (legend name, AUC, false-positive rates, true-positive rates).
roc_traces = (
    ('Logit', a1, f1, t1),
    ('RF', a2, f2, t2),
    ('XGB', a3, f3, t3),
)

plt.figure()
for model_name, auc_value, fpr, tpr in roc_traces:
    # Legend entry shows the AUC rounded to three decimals.
    plt.plot(fpr, tpr, label=f'{model_name} (AUC={auc_value:.3f})')
# Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference.
plt.plot([0, 1], [0, 1], '--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.savefig('../outputs/roc_compare.png', bbox_inches='tight')
# Close the figure so it is written to disk only, not kept open.
plt.close()

# Echo the AUC per model, keyed by the same names used in the legend.
print({row[0]: row[1] for row in roc_traces})