# Malaria Risk Stratification with Algorithmic Fairness

This notebook implements the Tai & Dhaliwal (2022) wGRS+GF+POS methodology on synthetic MalariaGEN-like data to compare Ridge, LightGBM, and SVR models.

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.model_selection import train_test_split

# Make the project's local `src/` directory importable. The path is relative to
# the kernel's working directory, so this presumably expects the notebook to be
# run from its own folder, a sibling of `src/` — TODO confirm.
sys.path.append(str(Path('..') / 'src'))

# Project-local modules, presumably resolved through the sys.path entry added
# above; keep these imports after that line.
from synthetic_clinical_data import MalariaDataGenerator
from evolutionary_models import MalariaRiskPredictor

# Notebook-wide plotting defaults: white seaborn theme with serif fonts, and
# figures rendered at 120 DPI.
sns.set_theme(style='white', font='serif')
plt.rcParams['figure.dpi'] = 120


In [None]:
# Generate the synthetic cohort with the generator's default settings.
print('Generating synthetic clinical data...')
generator = MalariaDataGenerator()
df = generator.generate()

# Report the size actually produced instead of a hard-coded count, so the
# message cannot drift from the generator's defaults.
print(f'Generated {len(df):,} records with {df.shape[1]} columns')

# Quick sanity checks: first rows, largest population shares, overall case rate.
display(df.head())
print(df['population'].value_counts(normalize=True).head())
print('Case rate:', df['case'].mean())


In [None]:
# Build the model matrix, target vector and feature-name list from the cohort.
predictor = MalariaRiskPredictor()
X, y, feature_names = predictor.prepare_features(df)

# Population label per row, used to stratify the split.
# NOTE(review): assumes prepare_features keeps df's row order and count — confirm.
groups = df['population'].values

# Hold out 20% of the samples, stratified by population, with a fixed seed.
split_options = dict(test_size=0.2, stratify=groups, random_state=42)
(X_train, X_test,
 y_train, y_test,
 grp_train, grp_test) = train_test_split(X, y, groups, **split_options)

for split_label, split_X in (('Training set:', X_train), ('Test set:', X_test)):
    print(split_label, split_X.shape)


In [None]:
# Fit the three candidate models on the training split. The evaluation code in
# this cell looks the fitted models (and optional scalers) up on `predictor`,
# so each train_* call is presumably expected to register them there — TODO confirm.
# `ridge_results` is later read for its 'cv_results' and 'best_alpha' entries.
ridge_results = predictor.train_ridge(X_train, y_train)
# `lgb_results` is later read for its 'feature_importance' entry.
lgb_results = predictor.train_lightgbm(X_train, y_train)
svr_results = predictor.train_svr(X_train, y_train)

def evaluate(model_name, X, y):
    """Score one fitted model on the given data.

    The model and its optional scaler are looked up by name on the
    notebook-level ``predictor``. If a scaler is registered for the model,
    ``X`` is transformed with it before prediction.

    Parameters
    ----------
    model_name : str
        Key into ``predictor.models`` (and ``predictor.scalers``).
    X : array-like
        Feature matrix to predict on.
    y : array-like
        True targets aligned with ``X``. They are passed to ``roc_auc_score``
        as the labels, with the raw predictions as the scores.

    Returns
    -------
    tuple
        ``(mae, auc, preds)``: mean absolute error, ROC AUC and the raw
        predictions.

    Raises
    ------
    KeyError
        If ``model_name`` is not present in ``predictor.models``.
    """
    model = predictor.models[model_name]
    scaler = predictor.scalers.get(model_name)
    # Check for absence explicitly: a truthiness test would silently skip any
    # scaler object that happens to evaluate as falsy (e.g. one defining __len__).
    X_in = scaler.transform(X) if scaler is not None else X
    preds = model.predict(X_in)
    mae = mean_absolute_error(y, preds)
    auc = roc_auc_score(y, preds)
    return mae, auc, preds

# Evaluate every trained model on the held-out test split. The predictions are
# stored next to the metrics so later cells can reuse them.
results = {}
for name in ['ridge', 'lightgbm', 'svr']:
    mae, auc, preds = evaluate(name, X_test, y_test)
    results[name] = {'mae': mae, 'auc': auc, 'preds': preds}

# Build the summary table from the scalar metrics only. Passing the full
# `results` dict through DataFrame(...).T drags the prediction arrays along and
# leaves 'mae' / 'auc' as object-dtype columns; this keeps them float64.
pd.DataFrame.from_dict(
    {
        model_name: {'mae': scores['mae'], 'auc': scores['auc']}
        for model_name, scores in results.items()
    },
    orient='index',
)[['mae', 'auc']]


In [None]:
# Bar chart of held-out MAE, one bar per model, annotated with the exact value.
fig, ax = plt.subplots(figsize=(8, 5))
models = list(results)
maes = [entry['mae'] for entry in results.values()]
bars = ax.bar(
    models,
    maes,
    alpha=0.85,
    color=['#4C72B0', '#55A868', '#C44E52'],
)
ax.set(
    ylabel='Mean Absolute Error',
    title='Model Performance Comparison (Lower is Better)',
)
# Write each MAE just above the horizontal centre of its bar.
for bar, val in zip(bars, maes):
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, bar.get_height(), f'{val:.4f}', ha='center', va='bottom')
sns.despine()
plt.tight_layout()
plt.show()


In [None]:
# Coerce to an ndarray so the positional fancy-indexing below works even if the
# training helper returned a list or a label-indexed pandas Series.
importance = np.asarray(lgb_results['feature_importance'])
# argsort is ascending, so the last ten positions are the ten largest values;
# barh draws them bottom-to-top, which puts the most important feature on top.
top_idx = np.argsort(importance)[-10:]
top_features = [feature_names[i] for i in top_idx]
top_values = importance[top_idx]

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(top_features, top_values, color='#4C72B0')
# NOTE(review): the axis label assumes the importances are gain-based — confirm
# against how train_lightgbm computes 'feature_importance'.
ax.set_xlabel('Gain Importance')
ax.set_title('Top 10 Features (LightGBM)')
sns.despine()
plt.tight_layout()
plt.show()


In [None]:
# Cross-validation trace recorded while tuning Ridge.
cv_results = ridge_results['cv_results']
# Unwrap the (possibly masked, possibly object-dtype) alpha column into a plain
# float array so it can be sorted and drawn on a log axis.
alpha_values = np.ma.getdata(cv_results['param_alpha']).astype(float)
# Scores are negated for display.
# NOTE(review): assumes the search used a negated-MAE scorer — confirm.
mean_scores = -np.asarray(cv_results['mean_test_score'], dtype=float)

# Sort by alpha so the line is drawn left to right even if the search grid was
# not specified in ascending order (otherwise the curve zigzags).
order = np.argsort(alpha_values, kind='stable')
alpha_values = alpha_values[order]
mean_scores = mean_scores[order]

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(alpha_values, mean_scores, marker='o')
ax.set_xscale('log')
ax.set_xlabel('Ridge Alpha (log scale)')
ax.set_ylabel('CV MAE')
ax.set_title('Ridge Hyperparameter Comparison')
sns.despine()
plt.tight_layout()
plt.show()

# One "primary" hyperparameter value per model. Only the Ridge value comes from
# the tuning results; the other two are literals.
# NOTE(review): 31 and 1.0 are hard-coded here rather than read from the fitted
# models — confirm they match what train_lightgbm / train_svr actually used.
selected = {
    'ridge': ridge_results['best_alpha'],
    'lightgbm': 31,
    'svr': 1.0,
}
hyper_df = pd.DataFrame({
    'model': list(selected),
    'primary_hyperparameter': list(selected.values()),
})

# The bars show different hyperparameters on one shared axis, so compare the
# values individually rather than against each other.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(
    hyper_df['model'],
    hyper_df['primary_hyperparameter'],
    color='#8172B2',
)
ax.set(
    ylabel='Selected Hyperparameter',
    title='Hyperparameter Comparison (Selected Values)',
)
sns.despine()
plt.tight_layout()
plt.show()
