<a href="https://colab.research.google.com/github/acoiman/pdt/blob/main/asthma_mortality/notebooks/Python/08_Asthma_Mortality_RF_RPIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Asthma Mortality in Argentina using Remote Sensing Data and Machine Learning



In this notebook, we will use remote sensing data and Random Forest (RF) to predict asthma mortality in Argentina at departmental level from 2001 to 2022. We will model the Normalized Asthma Mortality Rate (NAMR) in a two-stage RF approach—classification followed by regression—using predictor variables derived from satellite-based observations such as burned areas, and Particulate Matter with 2.5 micrometers in diameter or less (PM2.5), along with Population Density (PD), and lagged and feature engineered variables.

Import required libraries

In [None]:
# dataframe libraries
import pandas as pd
import numpy as np

# geospatial libraries
import geopandas as gpd
import mapclassify
from libpysal.weights import Queen
from esda.moran import Moran
from pysal.explore import esda

# plot libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from matplotlib.patches import Patch
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score,explained_variance_score,median_absolute_error, max_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# other libraries
import os
from joblib import Parallel, delayed
import shap
from itables import init_notebook_mode, show

## Load and reduce data

In [None]:
%cd work/

In [None]:
# Set the PROJ_LIB path
os.environ['PROJ_LIB'] = "/opt/conda/envs/gds/share/proj"

In [None]:
# Load dataset with data per department
gdf = gpd.read_file("pdt/asthma_mortality/data/gpkg/tma_pm25_ba_pd_pdpm25_2001_2022.gpkg")

In [None]:
# Drop geometry
df = gdf.drop(columns="geometry")

In [None]:
# Reshape df to long format
years = range(2001, 2023)
records = []

In [None]:
# Expand each wide-format department row into one record per year.
# A yearly column that is absent from the row falls back to NaN.
value_columns = ("CA", "PM25", "NBA", "PD", "PDPM25")
records.extend(
    {
        "IDDPTO": dept_row["IDDPTO"],
        "YEAR": yr,
        **{name: dept_row.get(f"{name}_{yr}", np.nan) for name in value_columns},
    }
    for _, dept_row in df.iterrows()
    for yr in years
)

In [None]:
# create new df from list and sort
panel_df = pd.DataFrame(records)

In [None]:
# Sort and reset index
panel_df = panel_df.sort_values(by=["IDDPTO", "YEAR"]).reset_index(drop=True)

## Exploratory Data Analysis (EDA)

In [None]:
# Create a copy of the panel_df DataFrame for exploratory data analysis (EDA)
df_eda = panel_df.copy()

In [None]:
# Convert the 'YEAR' column in panel_df to datetime format and assign it to df_eda
df_eda['YEAR'] = pd.to_datetime(panel_df['YEAR'], format='%Y')

In [None]:
# visualize the first rows
init_notebook_mode(all_interactive=True)
show(df_eda)

In [None]:
# get the number of rows
len(df_eda)

In [None]:
# Filter the year 2022 (Test set)
df_eda_2022 = df_eda[df_eda['YEAR'] == pd.to_datetime(2022, format='%Y')]

In [None]:
# visualize the first rows
init_notebook_mode(all_interactive=False)
df_eda_2022.CA.describe()

In [None]:
# create a box plot for CA column
sns.boxplot(x=df_eda_2022['CA'])
plt.show()

In [None]:
# create a histogram showing the distribution of the CA column (NAMR) in 2022
sns.histplot(df_eda_2022['CA'], bins=25)
# CA holds the Normalized Asthma Mortality Rate, so the axis is labelled NAMR
plt.xlabel("NAMR")
plt.ylabel('Frequency')
plt.title('Distribution of NAMR')
plt.show()

In [None]:
# how many zero values are there for CA column
df_eda_2022[df_eda_2022['CA'] == 0].shape[0]

In [None]:
# in percentage
round(((df_eda_2022[df_eda_2022['CA'] == 0].shape[0] / len(df_eda_2022)) * 100), 2)

In [None]:
# Filter out the year 2022
df_eda_0121 = df_eda[df_eda['YEAR'] < pd.to_datetime(2022, format='%Y')]

In [None]:
# Set the 'YEAR' column as the index for the DataFrame
df_eda_0121.set_index('YEAR', inplace=True)

In [None]:
# Plot mortality rate over time
plt.figure(figsize=(6, 4))
sns.lineplot(data=df_eda_0121, x=df_eda_0121.index, y='CA', marker='o', estimator='mean')
plt.title('Mean Mortality Rate Over Time')
plt.ylabel('Mean NAMR')
plt.xlabel('Year')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# ACF and PACF plots of CA for 2001–2021, side by side (PACF left, ACF right)
# NOTE(review): df_eda_0121['CA'] is the stacked panel (every department's
# yearly values one after another, ordered by IDDPTO then YEAR), so the lags
# run along that stacked order rather than along a single yearly series —
# confirm this is the intended input.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

plot_pacf(df_eda_0121['CA'], ax=axes[0], lags=21, title='Partial Autocorrelation (PACF)')
plot_acf(df_eda_0121['CA'], ax=axes[1], lags=21, title='Autocorrelation (ACF)')

plt.tight_layout()
plt.show()

### Exploratory Spatial  Data Analysis (ESDA)

In [None]:
# Load dataset with data per department
gdf = gpd.read_file("pdt/asthma_mortality/data/gpkg/tma_pm25_ba_pd_pdpm25_2001_2022.gpkg")

In [None]:
# Reshape gdf to long format
years = range(2001, 2023)
records = []

In [None]:
# Expand each wide-format department row into one record per year, repeating
# the department polygon on every yearly record.
value_columns = ("CA", "PM25", "NBA", "PD", "PDPM25")
records.extend(
    {
        "IDDPTO": dept_row["IDDPTO"],
        "YEAR": yr,
        **{name: dept_row.get(f"{name}_{yr}", np.nan) for name in value_columns},
        "geometry": dept_row["geometry"],  # Add geometry
    }
    for _, dept_row in gdf.iterrows()
    for yr in years
)

In [None]:
# create a new GeoDataFrame from the list of records; a plain DataFrame would
# drop the spatial metadata, so the CRS of the source layer is carried over
panel_gdf = gpd.GeoDataFrame(records, geometry="geometry", crs=gdf.crs)

In [None]:
# Sort and reset index
panel_gdf = panel_gdf.sort_values(by=["IDDPTO", "YEAR"]).reset_index(drop=True)

In [None]:
# visualize the fisrt rows
init_notebook_mode(all_interactive=True)
show(panel_gdf)

In [None]:
# Filter the DataFrame to include rows where 'YEAR' is less than 2022 and create a copy
gdf_esda_0121 = panel_gdf[panel_gdf['YEAR'] < 2022].copy()

In [None]:
# Define spatial weights matrix using Queen contiguity
w = Queen.from_dataframe(gdf_esda_0121)
w.transform = 'R'

In [None]:
# Compute Moran's I for each mean feature
moran_results = {}
for col in ['CA', 'PM25', 'NBA', 'PD', 'PDPM25']:
    moran = Moran(gdf_esda_0121[col], w)
    moran_results[col] = {'Moran_I': moran.I, 'Moran_p_sim': moran.p_sim}
# Create a table (DataFrame) from the results
moran_df = pd.DataFrame.from_dict(moran_results, orient='index')

In [None]:
# Display the table
init_notebook_mode(all_interactive=True)
show(moran_df)

## Part 1 – RF Classification Model

In this section, we will train and evaluate a Random Forest (RF) classification model to predict whether the Normalized Asthma Mortality Rate (NAMR, represented by the variable CA) indicates the absence (0) or presence (1) of asthma mortality (binary classification). We will apply a walk-forward (expanding window) validation approach, which is appropriate for epidemiological studies involving time series data¹. We will start by training the RF classification model using data from 2001 to 2006 (an initial six-year window) and testing it with data from 2007. The training window will then be expanded by one year at each iteration until it spans from 2001 to 2021, with 2022 data used for testing.


### Training and Testing a RF Classification Model

In [None]:
# Create a copy of the DataFrame to preserve the original data
df_ts = panel_df.copy()

In [None]:
# Create lag variables (up to 2 years)
def create_lags(df, var, max_lag=2, group_col="IDDPTO"):
    """Add lagged copies of ``var`` to ``df`` as ``{var}_lag1`` … ``{var}_lag{max_lag}``.

    The lag is computed separately inside each ``group_col`` group, so the
    first rows of one department never receive values from the last rows of
    the previous department. Rows are shifted by position, so ``df`` must
    already be ordered chronologically within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel data; modified in place (the lag columns are added to it).
    var : str
        Name of the column to lag.
    max_lag : int, default 2
        Largest lag to create; lags ``1..max_lag`` are added.
    group_col : str or None, default "IDDPTO"
        Column identifying the panel unit. When it is ``None`` or not a column
        of ``df``, the whole column is shifted as a single series.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the lag columns added. The first ``lag``
        rows of every group hold NaN in ``{var}_lag{lag}``.
    """
    by_group = group_col is not None and group_col in df.columns
    for lag in range(1, max_lag + 1):
        # sort=False keeps the original row order; shift() is index-aligned
        source = df.groupby(group_col, sort=False)[var] if by_group else df[var]
        df[f"{var}_lag{lag}"] = source.shift(lag)
    return df

In [None]:
for var in ["PM25", "NBA", "PD", "PDPM25"]:
    df_ts = create_lags(df_ts, var)

In [None]:
for var in ["CA"]:
    df_ts = create_lags(df_ts, var,  max_lag=2)

In [None]:
# Drop the initial rows with NaNs due to lagging
df_ts = df_ts.dropna().reset_index(drop=True)

In [None]:
# Binary target
df_ts['CA_bin'] = (df_ts['CA'] > 0).astype(int)

In [None]:
# visualize data frame
init_notebook_mode(all_interactive=True)
show(df_ts)

In [None]:
results = []  # results list

# Features and target are the same for every window, so define them once
features = ['PM25', 'NBA', 'PD', 'PDPM25', 'PM25_lag1',
            'PM25_lag2', 'NBA_lag1', 'NBA_lag2', 'PD_lag1', 'PD_lag2',
            'PDPM25_lag1', 'PDPM25_lag2', 'CA_lag1', 'CA_lag2']
target = 'CA_bin'

# Walk forward over test years 2007..2022; each model is trained on every
# year from 2001 up to the year before the test year (expanding window)
for test_year in range(2007, 2023):
    train_years = list(range(2001, test_year))
    print(f"Training: {train_years[0]}–{train_years[-1]}, Testing: {test_year}")

    # Split train/test by year
    train_df = df_ts[df_ts['YEAR'].isin(train_years)]
    test_df = df_ts[df_ts['YEAR'] == test_year]

    X_train, y_train = train_df[features], train_df[target]
    X_test, y_test = test_df[features], test_df[target]

    # Fit the classifier on the window and predict labels for the test year
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Classification metrics for this window
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1_Score': f1_score(y_test, y_pred, zero_division=0),
    }

    # Store results
    results.append({
        'train_years': f"{train_years[0]}-{train_years[-1]}",
        'test_year': test_year,
        **scores,
    })

In [None]:
# convert results into a dataframe
results_df = pd.DataFrame(results)

In [None]:
# visualize the results
init_notebook_mode(all_interactive=True)
results_df

In [None]:
# Calculate and display the mean and standard deviation for each evaluation metric
mean_metrics = results_df[['Accuracy', 'Precision', 'Recall', 'F1_Score']].mean()
std_metrics = results_df[['Accuracy', 'Precision', 'Recall', 'F1_Score']].std()

print("Mean of Evaluation Metrics:")
print(mean_metrics)
print("\nStandard Deviation of Evaluation Metrics:")
print(std_metrics)

Metric Value	Interpretation
* Accuracy	0.775 ± 0.012.	On average, the classifier correctly identified whether CA was 0 or 1 about 77.5% ± 1.2% of the time. This is decent but may be misleading if there is class imbalance (e.g., many zeros).
* Precision	0.6770 ±  0.039.	On average, of all the cases where the classifier predicted CA > 0, 67.7% ± 3.9% were correct. Upper moderate precision means some false positives (it sometimes predicts CA as 1 when the true value is 0).
* Recall 0.611 ± 0.047.	On average, the model identified only 61.1% ± 4.7% of true CA > 0 cases, so it is missing nearly 40% of the true positives (false negatives are moderate).
* F1 Score	0.638 ± 0.030.	The harmonic mean of precision and recall. This moderate value indicates a trade-off between missing positives and over-predicting them.

In [None]:
# create a plot of each metric by year:
fig, ax = plt.subplots(figsize=(10, 6))
for metric in ['Accuracy', 'Precision', 'Recall', 'F1_Score']:
    ax.plot(results_df['test_year'], results_df[metric], marker='o', label=metric)

ax.set_xlabel("Test Year")
ax.set_ylabel("Score")
ax.set_title("Walk-Forward Validation Metrics (2007–2022)")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()

### RF Classification Model Parameter Tuning

In [None]:
# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [None]:
results = []  # clear results list before starting

# Features and target are the same for every window, so define them once
features = ['PM25', 'NBA', 'PD', 'PDPM25', 'PM25_lag1',
            'PM25_lag2', 'NBA_lag1', 'NBA_lag2', 'PD_lag1', 'PD_lag2',
            'PDPM25_lag1', 'PDPM25_lag2', 'CA_lag1', 'CA_lag2']
target = 'CA_bin'

# Walk forward over test years 2007..2022 with an expanding training window
for test_year in range(2007, 2023):
    train_years = list(range(2001, test_year))
    print(f"Training: {train_years[0]}–{train_years[-1]}, Testing: {test_year}")

    # Split train/test by year
    train_df = df_ts[df_ts['YEAR'].isin(train_years)]
    test_df = df_ts[df_ts['YEAR'] == test_year]

    X_train, y_train = train_df[features], train_df[target]
    X_test, y_test = test_df[features], test_df[target]

    # Grid search with 3-fold CV on the training window, selecting on recall
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid,
        scoring='recall',
        cv=3,
        n_jobs=-1,
        verbose=0,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Predict on the held-out test year with the refitted best estimator
    y_pred = best_model.predict(X_test)

    # Evaluation metrics for this window
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1_Score': f1_score(y_test, y_pred, zero_division=0),
    }

    # Store results and best params
    results.append({
        'train_years': f"{train_years[0]}-{train_years[-1]}",
        'test_year': test_year,
        **scores,
        'Best_Params': grid_search.best_params_,
    })



In [None]:
# convert results into a dataframe
resultspt_df = pd.DataFrame(results)

In [None]:
# visualize dataframe
init_notebook_mode(all_interactive=True)
resultspt_df

In [None]:
# Calculate and display the mean and standard deviation for each evaluation metric
mean_metrics = resultspt_df[['Accuracy', 'Precision', 'Recall', 'F1_Score']].mean()
std_metrics = resultspt_df[['Accuracy', 'Precision', 'Recall', 'F1_Score']].std()

print("Mean of Evaluation Metrics:")
print(mean_metrics)
print("\nStandard Deviation of Evaluation Metrics:")
print(std_metrics)

Metric Value	Interpretation
* Accuracy	0.776 ± 0.017.	On average, the classifier correctly identified whether CA was 0 or 1 about 77.6% ± 1.7% of the time. This is decent but may be misleading if there is class imbalance (e.g., many zeros).
* Precision	0.678 ±  0.040.	On average, of all the cases where the classifier predicted CA > 0, 67.8% ± 4% were correct. Upper moderate precision means some false positives (it sometimes predicts CA as 1 when the true value is 0).
* Recall 0.600 ± 0.047.	On average, the model identified only 60% ± 4.7% of true CA > 0 cases, so it is missing about 40% of the true positives (false negatives are moderate).
* F1 Score	0.635 ± 0.032.	The harmonic mean of precision and recall. This moderate value indicates a trade-off between missing positives and over-predicting them.

**Note:** Parameter tuning did not substantially improve model performance.

In [None]:
# create a plot of each metric by year (tuned models):
fig, ax = plt.subplots(figsize=(10, 6))
for metric in ['Accuracy', 'Precision', 'Recall', 'F1_Score']:
    # x and y both come from the tuned-results frame, so the plot stays
    # aligned even after results_df is rebound by another section
    ax.plot(resultspt_df['test_year'], resultspt_df[metric], marker='o', label=metric)

ax.set_xlabel("Test Year")
ax.set_ylabel("Score")
ax.set_title("Walk-Forward Validation Metrics (2007–2022) with Parameter Tuning")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()

### Training RF Classification Model on 2001–2021 and Predict 2022

In this section, for the SHAP analysis, we will train the RF classification model on data from 2001–2021 and predict, for 2022, whether the Normalized Asthma Mortality Rate (NAMR, represented by the variable CA) indicates absence (0) or presence (1) of asthma mortality (binary classification)

In [None]:
# Define features and target
features = ['PM25', 'NBA', 'PD', 'PDPM25', 'PM25_lag1',
            'PM25_lag2', 'NBA_lag1', 'NBA_lag2', 'PD_lag1', 'PD_lag2',
            'PDPM25_lag1', 'PDPM25_lag2', 'CA_lag1', 'CA_lag2']
target = 'CA_bin'

In [None]:
# Training and test sets
train_df = df_ts[df_ts['YEAR'] <= 2021].dropna(subset=features + [target])
test_df = df_ts[df_ts['YEAR'] == 2022].dropna(subset=features + [target])

In [None]:
# Define inputs
X_train = train_df[features]
y_train = train_df[target]

# Keep IDDPTO in test set
X_test_full = test_df[['IDDPTO'] + features+ ['CA_bin']].copy()
y_test = test_df[target].reset_index(drop=True)

In [None]:
# Train model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Predict using only feature columns
y_pred = clf.predict(X_test_full[features])

#  Add prediction to test set with IDDPTO
X_test_full['CA_bin_pred'] = y_pred

In [None]:
# Evaluate prediction
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Print metrics
print(f"Prediction Results for 2022:")
print(f"Accuracy:  {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1 Score:  {f1:.3f}")

Metric Value	Interpretation
* Accuracy	0.781.	The classifier correctly predicted whether CA was 0 or 1 in 2022 about 78.1% of the time. This is decent but may be misleading if there is class imbalance (e.g., many zeros).
* Precision	0.720.	Of all the cases where the classifier predicted CA > 0 in 2022, 72% were correct. This precision means some false positives remain (it sometimes predicts CA as 1 when the true value is 0).
* Recall 0.559.	The model identified only 55.9% of true CA > 0 cases, so it is missing nearly half of the true positives (false negatives are high).
* F1 Score	0.629.	The harmonic mean of precision and recall. This moderate value indicates a trade-off between missing positives and over-predicting them.

In [None]:
# Preserve only IDDPTO, CA_bin, and CA_bin_pred from X_test_full
bin_pred_result = X_test_full[['IDDPTO', 'CA_bin', 'CA_bin_pred']].copy()

In [None]:
# Display the new DataFrame
init_notebook_mode(all_interactive=True)
show(bin_pred_result)

### SHAP (SHapley Additive Explanations)

In this section, we will interpret the contribution of each independent variable to the final prediction of the RF classification model using the SHAP method.

In [None]:
# ---- SHAP for Binary Classifier ----
# Use TreeExplainer (optimized for RandomForest)
explainer = shap.TreeExplainer(clf)

In [None]:
# get the test set
X = test_df[features]

In [None]:
# Function to compute SHAP values for a single row
def compute_shap(row):
    return explainer.shap_values(row)

In [None]:
# Parallel computation
shap_values_list = Parallel(n_jobs=-1)(
    delayed(compute_shap)(X.iloc[[i]]) for i in range(len(X))
)

In [None]:
# Combine SHAP values for Class 1 into a single (n_rows, n_features) array
def _positive_class_shap(vals):
    """Return the class-1 SHAP vector of shape (n_features,) for one explained row.

    Handles both return layouts of ``shap_values`` for a binary classifier:
    a list ``[class0, class1]`` of ``(1, n_features)`` arrays, or a single
    ``(1, n_features, n_classes)`` array.
    """
    if isinstance(vals, list):
        return np.asarray(vals[1])[0]
    return np.asarray(vals)[0][:, 1]


shap_values = np.vstack([_positive_class_shap(vals) for vals in shap_values_list])  # Class 1 SHAPs


In [None]:
# Generate a SHAP summary plot to visualize the impact of features on the model's predictions
# show=False keeps the figure open so the title and axis labels can be set
shap.summary_plot(shap_values, X, feature_names=features,
                  show= False)
plt.title("(a) SHAP Summary Plot RF Classification", fontsize=17)
plt.xlabel("SHAP Values", size=12)
plt.ylabel("Features", size=12)
plt.show();

## Part 2 – RF Regression Model


In this section, we will train and evaluate a Random Forest (RF) regression model to predict NAMR values where it is present (CA_bin == 1 or NAMR values > 0). We will apply a walk-forward (expanding window) validation approach, which is appropriate for epidemiological studies involving time series data¹. We will start by training the RF regression model using data from 2001 to 2006 (an initial six-year window) and testing it with data from 2007. The training window will then be expanded by one year at each iteration until it spans from 2001 to 2021, with 2022 data used for testing.

In [None]:
# Filter df_ts (all years) to keep only rows where CA_bin is 1
bin_positive = df_ts[(df_ts['CA_bin'] == 1)]

In [None]:
# Display the filtered DataFrame
init_notebook_mode(all_interactive=True)
show(bin_positive)

### Training and Testing a RF Regression Model

In [None]:
results = []  # store results

# Features and regression target are the same for every window
features = ['PM25', 'NBA', 'PD', 'PDPM25', 'PM25_lag1',
            'PM25_lag2', 'NBA_lag1', 'NBA_lag2', 'PD_lag1', 'PD_lag2',
            'PDPM25_lag1', 'PDPM25_lag2', 'CA_lag1', 'CA_lag2']
target = 'CA'  # regression target

# Walk forward over test years 2007..2022 with an expanding training window
for test_year in range(2007, 2023):
    train_years = list(range(2001, test_year))
    print(f"Training: {train_years[0]}–{train_years[-1]}, Testing: {test_year}")

    # Filter by year
    train_df = df_ts[df_ts['YEAR'].isin(train_years)]
    test_df = df_ts[df_ts['YEAR'] == test_year]

    # Keep only rows with observed mortality (CA_bin == 1)
    train_pos = train_df[train_df['CA_bin'] == 1].copy()
    test_pos = test_df[test_df['CA_bin'] == 1].copy()

    # Nothing to fit or score without positive cases on both sides
    if train_pos.empty or test_pos.empty:
        print(f"Skipped: No positive cases in train or test for {test_year}")
        continue

    X_train, y_train = train_pos[features], train_pos[target]
    X_test, y_test = test_pos[features], test_pos[target]

    # Fit the regressor on the window, then score the test year with R²
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    # Save results
    window_result = {
        'train_years': f"{train_years[0]}-{train_years[-1]}",
        'test_year': test_year,
        'R2': r2,
    }
    results.append(window_result)



In [None]:
# convert the results into a data frame
results_df = pd.DataFrame(results)

In [None]:
init_notebook_mode(all_interactive=True)
results_df

In [None]:
# Calculate and display the mean and standard deviation for each evaluation metric
mean_metrics = results_df[['R2']].mean()
std_metrics = results_df[["R2"]].std()

print("Mean of Evaluation Metrics:")
print(mean_metrics)
print("\nStandard Deviation of Evaluation Metrics:")
print(std_metrics)

* A mean R² of 0.440 means the model explains about 44%   of the variability in asthma mortality among departments where mortality actually occurred.

* The Random Forest regression model trained only on positive asthma mortality cases (CA_bin = 1) achieved a mean R² of 0.440 ± 0.138 across 16 annual test windows (2007–2022). This indicates moderate predictive capacity in estimating asthma mortality rates based on the selected predictors.

### RF Regression Model Parameter Tuning

In [None]:
# Define feature list and hyperparameter grid
features = ['PM25', 'NBA', 'PD', 'PDPM25', 'PM25_lag1',
            'PM25_lag2', 'NBA_lag1', 'NBA_lag2', 'PD_lag1', 'PD_lag2',
            'PDPM25_lag1', 'PDPM25_lag2', 'CA_lag1', 'CA_lag2']

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_leaf': [1, 2]
}

In [None]:
results = []

# Walk forward over test years 2007..2022 with an expanding training window
for test_year in range(2007, 2023):
    train_years = list(range(2001, test_year))
    print(f"Training: {train_years[0]}–{train_years[-1]}, Testing: {test_year}")

    # Filter by year
    train_df = df_ts[df_ts['YEAR'].isin(train_years)]
    test_df = df_ts[df_ts['YEAR'] == test_year]

    # Keep only rows with observed mortality (CA_bin == 1)
    train_pos = train_df[train_df['CA_bin'] == 1].copy()
    test_pos = test_df[test_df['CA_bin'] == 1].copy()

    # Nothing to fit or score without positive cases on both sides
    if train_pos.empty or test_pos.empty:
        print(f"Skipped: No positive cases in train or test for {test_year}")
        continue

    X_train, y_train = train_pos[features], train_pos['CA']
    X_test, y_test = test_pos[features], test_pos['CA']

    # Grid search (3-fold CV, R² scoring) over the training window
    grid_search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=param_grid,
        scoring='r2',
        cv=3,
        n_jobs=-1,
        verbose=0,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Score the refitted best estimator on the held-out test year
    y_pred = best_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    # Store result
    window_result = {
        'train_years': f"{train_years[0]}-{train_years[-1]}",
        'test_year': test_year,
        'R2': r2,
        'Best_Params': grid_search.best_params_,
    }
    results.append(window_result)

In [None]:
# convert results into a dataframe
resultspt_df = pd.DataFrame(results)

In [None]:
# visualize dataframe
init_notebook_mode(all_interactive=True)
resultspt_df

In [None]:
# Calculate and display the mean and standard deviation for each evaluation metric
mean_metrics = resultspt_df[["R2"]].mean()
std_metrics = resultspt_df[["R2"]].std()

print("Mean of Evaluation Metrics:")
print(mean_metrics)
print("\nStandard Deviation of Evaluation Metrics:")
print(std_metrics)

**Note:** Parameter tuning did not improve the model performance

### Training RF Regression Model on 2001–2021 and Predict 2022

In this section, for the SHAP analysis and mapping, we will train the RF regression model on data from 2001–2021 and predict the 2022 Normalized Asthma Mortality Rate in departments where it is present (CA_bin = 1 or NAMR > 0).

In [None]:
# Define features and target
features = ['PM25', 'NBA', 'PD', 'PDPM25', 'PM25_lag1',
            'PM25_lag2', 'NBA_lag1', 'NBA_lag2', 'PD_lag1', 'PD_lag2',
            'PDPM25_lag1', 'PDPM25_lag2', 'CA_lag1', 'CA_lag2']
target = 'CA'

In [None]:
# Prepare training and test sets
train_df = df_ts[df_ts['YEAR'] <= 2021].dropna(subset=features + [target])
test_df = df_ts[df_ts['YEAR'] == 2022].dropna(subset=features + [target])

In [None]:
# Filter for positive cases (CA_bin == 1)
train_pos = train_df[train_df['CA_bin'] == 1].copy()
test_pos = test_df[test_df['CA_bin'] == 1].copy()

In [None]:
# Define inputs
X_train = train_pos[features]
y_train = train_pos[target]

In [None]:
# Keep IDDPTO and make prediction-ready copy
X_test_full = test_pos[['IDDPTO'] + features + [target]].copy()
X_test = X_test_full[features]
y_test = test_pos[target].reset_index(drop=True)

In [None]:
# Train regression model
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train, y_train)

In [None]:
# Predict using feature columns
y_pred = reg.predict(X_test)

In [None]:
# Store prediction in column CA_pred
X_test_full['CA_pred'] = y_pred

In [None]:
# Evaluate
r2 = r2_score(y_test, y_pred)

In [None]:
# Print evaluation
print("Prediction Results for 2022:")
print(f"R²: {r2:.3f}")

Metric Value	Interpretation:

The Random Forest regression model trained only on positive asthma mortality
cases (CA_bin = 1) achieved an R² of 0.636. The model explains about 64% of the variability in asthma mortality among departments where mortality actually occurred. This indicates good predictive capacity in estimating asthma mortality rates in 2022.

In [None]:
# select IDDPTO, CA and CA_pred from X_test_full
reg_pred_result = X_test_full[['IDDPTO', 'CA', 'CA_pred']].copy()

In [None]:
# visualize the reults
init_notebook_mode(all_interactive=True)
reg_pred_result

In [None]:
# line chart actual vs predicted NAMR 2022
plt.figure(figsize=(15, 6))
plt.plot(reg_pred_result['IDDPTO'], reg_pred_result['CA'], color='blue', label='Actual NAMR')
plt.plot(reg_pred_result['IDDPTO'], reg_pred_result['CA_pred'], color='red', label='Predicted NAMR')
plt.xlabel('Departments')
plt.ylabel('NAMR Values')
plt.title('Actual vs Predicted NAMR 2022')
plt.xticks([]) # Remove x-axis labels
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

### SHAP (SHapley Additive Explanations)

In this section, we will interpret the contribution of each independent variable to the final prediction of the RF regression model using the SHAP method.

In [None]:
# ---- SHAP for Regression ----
# Use TreeExplainer (optimized for RandomForest) on the fitted regressor `reg`
explainer = shap.TreeExplainer(reg)

# get test set: feature columns of the 2022 rows with CA_bin == 1
X = test_pos[features]

# Function to compute SHAP values for a single row
def compute_shap(row):
    """Return the SHAP values of the module-level `explainer` for `row`.

    `row` is passed as a one-row DataFrame (see `X.iloc[[i]]` below).
    """
    # NOTE(review): for a regressor this is presumably one array per call
    # (shape (1, n_features)) — confirm against the installed shap version
    return explainer.shap_values(row)

# Parallel computation (adjust n_jobs as needed); one task per test row
shap_values_list = Parallel(n_jobs=-1)(
    delayed(compute_shap)(X.iloc[[i]]) for i in range(len(X))
)

# Stack the per-row SHAP results vertically into one array
# No class indexing ([:, 1]) is applied here, unlike the classifier section
shap_values = np.vstack(shap_values_list)

In [None]:
# Generate a SHAP summary plot to visualize the impact of features on the model's predictions
shap.summary_plot(shap_values, X, feature_names=features,
                  show= False)
plt.title("(b) SHAP Summary Plot RF Regression", fontsize=17)
plt.xlabel("SHAP Values", size=12)
plt.ylabel("Features", size=12)
plt.show();

## 🌍 Mapping actual vs predicted asthma mortality rate

As explained above, since our dataset contains a high proportion of zero values, we will map the predicted NAMR for 2022 only in departments where the actual values are greater than zero. Departments with an actual NAMR value of zero will retain a predicted value of zero.

### Preparing data for mapping

In [None]:
# from df_ts get filter 2022 and CA_bin equal to 0
dpto_zero = df_ts[df_ts['YEAR'] == 2022]
dpto_zero = dpto_zero[dpto_zero['CA_bin'] == 0]

In [None]:
# keep IDDPTO and CA_bin columns
dpto_zero = dpto_zero[['IDDPTO', 'CA_bin']]

In [None]:
# rename CA_bin to CA and copy the column as CA_pred
# rename() returns a new frame, so the column assignment below does not write
# into a filtered slice of df_ts (avoids pandas' SettingWithCopyWarning)
dpto_zero = dpto_zero.rename(columns={'CA_bin': 'CA'})
dpto_zero['CA_pred'] = dpto_zero['CA']

In [None]:
# visualize dataframe
init_notebook_mode(all_interactive=True)
show(dpto_zero)

In [None]:
# get df info
dpto_zero.info()

In [None]:
# concat reg_pred_result with dpto_zero
df_concat = pd.concat([reg_pred_result, dpto_zero])

In [None]:
# get df info
df_concat.info()

In [None]:
# visualize dataframe
init_notebook_mode(all_interactive=True)
show(df_concat)

In [None]:
# Load dataset with data per department
gdf = gpd.read_file("pdt/asthma_mortality/data/gpkg/tma_pm25_ba_pd_pdpm25_2001_2022.gpkg")

In [None]:
# keep IDDPTO and geometry of the gdf
dpto_geom = gdf[['IDDPTO', 'geometry']]

In [None]:
# merge df_concat and dpto_geom by IDDPTO preserve  df_concat data
df_map = pd.merge(df_concat, dpto_geom, on='IDDPTO', how='left')

In [None]:
# rename CA as CA_2022 and CA_pred as CA_2022_PRED
df_map.rename(columns={'CA': 'CA_2022', 'CA_pred': 'CA_2022_PRED'}, inplace=True)

In [None]:
# round CA_2022_PRED to two decimal places
df_map['CA_2022_PRED'] = df_map['CA_2022_PRED'].round(2)

In [None]:
# convert df_map to a gdf
gdf_map = gpd.GeoDataFrame(df_map, geometry='geometry')

In [None]:
# visualize geodataframe
init_notebook_mode(all_interactive=True)
show(gdf_map)

In [None]:
# save df_map as gpkg file
gdf_map.to_file("pdt/asthma_mortality/data/gpkg/results_RF.gpkg", driver="GPKG")

### Calculate classification schema for mapping

We will use [Pysal](https://pysal.org/)'s [mapclassify](https://pysal.org/mapclassify/index.html) library to determine the best classifier for the choropleth map.

We will use the map classifier with the best (lowest) ADCM (Absolute Deviation around the Class Median). In mapclassify, ADCM is a measure of a classifier's fit to the data: it evaluates the absolute distance between each data point and the median value of its assigned class, so lower values indicate a better fit.

In [None]:
# open results_RF.gpkg as a gdf
df_cl = gpd.read_file("pdt/asthma_mortality/data/gpkg/results_RF.gpkg")

In [None]:
# visualize the dataframe
init_notebook_mode(all_interactive=True)
show(df_cl)

In [None]:
# get df basic info
df_cl.info()

In [None]:
# Get the length of the dataframe 'df_cl'
len(df_cl)

In [None]:
# Select data to analize
selected_data = df_cl.loc[:,["CA_2022", "CA_2022_PRED"]]

In [None]:
# Classify the data into 4 quantile groups
q4 = mapclassify.Quantiles(selected_data, k=4)
q4

In [None]:
# Equal Interval Classification
ei5 = mapclassify.EqualInterval(selected_data, k=5)
ei5

In [None]:
# Classify the data into groups based on the head/tail breaks algorithm
ht = mapclassify.HeadTailBreaks(selected_data)
ht

In [None]:
# MaximumBreaks classification method
mb5 = mapclassify.MaximumBreaks(selected_data, k=5)
mb5

In [None]:
# Apply the Standard Deviation and Mean classification method to the selected data.
msd = mapclassify.StdMean(selected_data)
msd

In [None]:
# Apply Fisher-Jenks classification with 5 classes
fj5 = mapclassify.FisherJenks(selected_data, k=5)
fj5

ADCM (Absolute Deviation around the Class Median) visualization

In [None]:
# Bunch classifier objects
class5 = q4, ei5, ht, mb5, msd, fj5
# Collect the ADCM of each classifier
fits = np.array([clf_scheme.adcm for clf_scheme in class5])
# Tabulate ADCM scores next to the classifier names
adcms = pd.DataFrame({
    "ADCM": fits,
    "Classifier": [clf_scheme.name for clf_scheme in class5],
})
# One horizontal bar per classifier; hue only colours the bars
ax = sns.barplot(
    data=adcms, x="ADCM", y="Classifier", hue="Classifier", legend=False
)

### Create choropleth maps

Two classifiers have the lowest ADCM: FisherJenks and HeadTailBreaks. We'll select FisherJenks as the classifier to create the choropleth maps.

In [None]:
# Convert the bins to a list for further processing
bins = fj5.bins.tolist()
bins

In [None]:
# insert 0 at 0 position
bins.insert(0, 0.0)
bins

In [None]:
# Create a custom classification using UserDefined for actual values
classi_actual = mapclassify.UserDefined(df_cl["CA_2022"], bins)

In [None]:
# Create a custom classification using UserDefined for predicted values
classi_pred = mapclassify.UserDefined(df_cl["CA_2022_PRED"], bins)

In [None]:
# Side-by-side choropleth maps of actual vs predicted NAMR for 2022
fig, axes = plt.subplots(1, 2, figsize=(12, 8))
fig.subplots_adjust(hspace=0, wspace=-0.9)
plt.suptitle('Normalized Asthma Mortality Rate 2022', fontsize=14, y=1)

# Plot classi_actual
classi_actual.plot(
    df_cl,
    legend=False,  # We'll build it manually
    axis_on=False,
    border_color='black',
    cmap="viridis_r",
    ax=axes[0]
)

# Plot classi_pred
classi_pred.plot(
    df_cl,
    legend=False,  # We'll build it manually
    axis_on=False,
    border_color='black',
    cmap="viridis_r",
    ax=axes[1]
)


# Legend labels are derived from the class breaks so they always match the
# classification: the first class is the lowest break itself (0.00), and every
# following class spans two consecutive breaks.
bin_labels = [f"{bins[0]:.2f}"] + [
    f"{lower:.2f}-{upper:.2f}" for lower, upper in zip(bins[:-1], bins[1:])
]
n_bins = len(bin_labels)
# One colour per class, sampled from the same colormap used on the maps
cmap = mpl.colormaps.get_cmap("viridis_r").resampled(n_bins)
colors = [mpl.colors.to_hex(cmap(i)) for i in range(cmap.N)]

# Create legend patches for bins
bin_patches = [Patch(facecolor=color, edgecolor='black', label=label)
               for color, label in zip(colors, bin_labels)]

# Display the custom legend once, on the right-hand map
axes[1].legend(handles=bin_patches, loc='upper right', bbox_to_anchor=(0.9, 0.25), fontsize=10)

# Set titles
axes[0].set_title('Actual', fontsize=12)
axes[1].set_title('Predicted', fontsize=12)

plt.tight_layout()
plt.show();

## References

1. Utku, A., & Akcayol, M. A. (2024). Spread patterns of COVID-19 in European countries: hybrid deep learning model for prediction and transmission analysis. Neural Computing and Applications, 36(17), 10201-10217.