In [None]:
# FOR MULTIPLE SPECIES

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the multi-species checklist table from disk
train_df = pd.read_csv("checklists_env_deju_baea_noca.csv") # changed from ss

# Predictor columns: the fixed checklist columns first, then every column whose
# name starts with one of the prefixes below, in the table's own column order
base_features = [
    'year', 'day_of_year', 'hours_of_day', 'latitude', 'longitude',
    'effort_hours', 'effort_distance_km', 'effort_speed_kmph',
    'number_observers',
]
prefixed_features = [
    name for name in train_df.columns
    if name.startswith(('pland_', 'ed_', 'elevation_'))
]
features = base_features + prefixed_features

X = train_df[features]

# Targets: one integer column per species (three columns in total)
target_columns = ['deju_observed', 'baea_observed', 'noca_observed']
y = train_df[target_columns].astype(int)



In [None]:
# Number of missing entries in each column of the loaded table
train_df.isnull().sum()

Unnamed: 0,0
checklist_id,0
observer_id,0
type,0
deju_observation_count,423
deju_observed,0
state_code,0
locality_id,0
latitude,0
longitude,0
protocol_type,0


In [None]:
# Split BEFORE scaling. Fitting the scaler on the full table would let the
# validation rows influence the mean/variance used to transform the training
# rows (preprocessing leakage), which makes validation metrics optimistic.
# The split depends only on the row count and random_state, so the same rows
# land in each partition as when splitting the pre-scaled matrix.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=28
)

# Learn scaling statistics from the training rows only, then reuse them
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Full feature matrix under the train-fitted scaling, kept so that any later
# cell referring to `X_scaled` continues to work
X_scaled = scaler.transform(X)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and
# batch shuffling are repeatable across Restart & Run All
tf.keras.utils.set_random_seed(28)

# Multi-label classifier: three hidden layers and one sigmoid unit per species
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(3, activation='sigmoid')  # three outputs, one per target column
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        # With the default multi_label=False these AUCs pool all three outputs
        tf.keras.metrics.AUC(name="roc_auc", curve="ROC"),
        tf.keras.metrics.AUC(name="pr_auc", curve="PR"),
        # The plain string "accuracy" is resolved by Keras to categorical
        # (argmax) accuracy when the output has more than one unit, which is
        # not meaningful for independent 0/1 labels. BinaryAccuracy thresholds
        # each output at 0.5 and scores every label separately.
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    ]
)
model.fit(X_train, y_train, epochs=20, validation_data=(X_val, y_val), batch_size=32)



Epoch 1/20
[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.2484 - loss: 0.5209 - pr_auc: 0.6946 - roc_auc: 0.7975 - val_accuracy: 0.2944 - val_loss: 0.4919 - val_pr_auc: 0.7358 - val_roc_auc: 0.8264
Epoch 2/20
[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.3153 - loss: 0.4832 - pr_auc: 0.7406 - roc_auc: 0.8316 - val_accuracy: 0.3470 - val_loss: 0.4842 - val_pr_auc: 0.7436 - val_roc_auc: 0.8326
Epoch 3/20
[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.3403 - loss: 0.4703 - pr_auc: 0.7546 - roc_auc: 0.8421 - val_accuracy: 0.3039 - val_loss: 0.4751 - val_pr_auc: 0.7536 - val_roc_auc: 0.8385
Epoch 4/20
[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.3525 - loss: 0.4603 - pr_auc: 0.7680 - roc_auc: 0.8496 - val_accuracy: 0.3968 - val_loss: 0.4723 - val_pr_auc: 0.7576 - val_roc_auc: 0.8418
Epoch 5/20
[1m2070/2070[0m [3

<keras.src.callbacks.history.History at 0x7c267e927850>

In [None]:

from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import average_precision_score

def _calibrate_and_score(train_probs, train_labels, val_probs, val_labels):
    """Fit an isotonic calibrator for one species and score it on validation data.

    Parameters
    ----------
    train_probs : 1-D array of raw model probabilities used to fit the calibrator.
    train_labels : labels aligned with ``train_probs``.
    val_probs : 1-D array of raw model probabilities to calibrate and score.
    val_labels : labels aligned with ``val_probs``.

    Returns
    -------
    tuple
        ``(calibrator, val_probs_cal, pr_auc_raw, pr_auc_cal)``: the fitted
        ``IsotonicRegression``, the calibrated validation probabilities, and
        the average precision of the raw and calibrated validation scores.
    """
    # out_of_bounds='clip' maps validation scores outside the fitted range to
    # the nearest fitted endpoint instead of returning NaN
    calibrator = IsotonicRegression(out_of_bounds='clip')
    calibrator.fit(train_probs, train_labels)
    val_probs_cal = calibrator.predict(val_probs)
    pr_auc_raw = average_precision_score(val_labels, val_probs)
    pr_auc_cal = average_precision_score(val_labels, val_probs_cal)
    return calibrator, val_probs_cal, pr_auc_raw, pr_auc_cal


# Raw predicted probabilities; columns follow the target order DEJU, BAEA, NOCA
pred_probs_train = model.predict(X_train)
pred_probs_val = model.predict(X_val)

# NOTE(review): the calibrators are fit on predictions for the same rows the
# network was trained on. A separate held-out calibration split would likely
# give less biased calibration - TODO confirm and consider adding one.

# Fit one calibrator per species (DEJU, BAEA and NOCA) and score raw vs calibrated
val_probs_deju_raw = pred_probs_val[:, 0]
cal_deju, val_probs_deju_cal, pr_auc_deju_raw, pr_auc_deju_cal = _calibrate_and_score(
    pred_probs_train[:, 0], y_train['deju_observed'],
    val_probs_deju_raw, y_val['deju_observed'],
)

val_probs_baea_raw = pred_probs_val[:, 1]
cal_baea, val_probs_baea_cal, pr_auc_baea_raw, pr_auc_baea_cal = _calibrate_and_score(
    pred_probs_train[:, 1], y_train['baea_observed'],
    val_probs_baea_raw, y_val['baea_observed'],
)

val_probs_noca_raw = pred_probs_val[:, 2]
cal_noca, val_probs_noca_cal, pr_auc_noca_raw, pr_auc_noca_cal = _calibrate_and_score(
    pred_probs_train[:, 2], y_train['noca_observed'],
    val_probs_noca_raw, y_val['noca_observed'],
)

# Print results
print(f"DEJU PR AUC (raw): {pr_auc_deju_raw:.4f}")
print(f"DEJU PR AUC (calibrated): {pr_auc_deju_cal:.4f}")
print(f"BAEA PR AUC (raw): {pr_auc_baea_raw:.4f}")
print(f"BAEA PR AUC (calibrated): {pr_auc_baea_cal:.4f}")
print(f"NOCA PR AUC (raw): {pr_auc_noca_raw:.4f}")
print(f"NOCA PR AUC (calibrated): {pr_auc_noca_cal:.4f}")


[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 694us/step
[1m518/518[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 694us/step
DEJU PR AUC (raw): 0.7413
DEJU PR AUC (calibrated): 0.7353
BAEA PR AUC (raw): 0.6017
BAEA PR AUC (calibrated): 0.5919
NOCA PR AUC (raw): 0.8371
NOCA PR AUC (calibrated): 0.8333


In [None]:

# Validation table: true labels for the three species followed by their
# calibrated probabilities, listed in the column order written to disk
val_columns = [
    ('deju_true', y_val['deju_observed'].values),
    ('baea_true', y_val['baea_observed'].values),
    ('noca_true', y_val['noca_observed'].values),
    ('deju_pred_prob', val_probs_deju_cal),
    ('baea_pred_prob', val_probs_baea_cal),
    ('noca_pred_prob', val_probs_noca_cal),
]
val_results = pd.DataFrame(dict(val_columns))

# Write the table to CSV without the index column
val_results.to_csv('val_preds_deju_baea_noca.csv', index=False)



In [None]:
# # Load prediction grid
# grid = pd.read_csv("environmental_vars_pred_grid_w_lat_lon.csv")

# # Add fixed effort values
# import numpy as np
# grid = grid.copy()
# grid["observation_date"] = pd.to_datetime("2023-01-15")
# grid["year"] = grid["observation_date"].dt.year
# grid["day_of_year"] = grid["observation_date"].dt.dayofyear
# grid["hours_of_day"] = 7.8  # approximate
# grid["effort_distance_km"] = 2
# grid["effort_hours"] = 1
# grid["effort_speed_kmph"] = 2
# grid["number_observers"] = 1

# # Match column order
# X_grid = grid[features]
# X_grid_scaled = scaler.transform(X_grid)

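# # Predict with neural net + calibration
# # NOTE: stale single-species code. The model above now has three outputs, so
# # `.flatten()` would mix the species columns, and no `cal` object is defined
# # in the cells shown here. Select one output column and use its matching
# # calibrator (e.g. `cal_deju`) before re-enabling this cell.
# grid_probs_raw = model.predict(X_grid_scaled).flatten()
# grid_probs_cal = cal.predict(grid_probs_raw)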

# # Add predictions back to grid
# grid["encounter_rate"] = np.clip(grid_probs_cal, 0, 1)


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [None]:
# # Save for R
# grid_output = grid[["cell_id", "x", "y", "encounter_rate"]]
# grid_output.to_csv("dnn_grid_preds_w_lat_lon.csv", index=False)
