In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
train_df = pd.read_csv("checklists_env_combined_baea.csv") # changed from ss

# Select features
features = ['year', 'day_of_year', 'hours_of_day', 'latitude', 'longitude',
            'effort_hours', 'effort_distance_km', 'effort_speed_kmph',
            'number_observers'] + \
           [col for col in train_df.columns if col.startswith(('pland_', 'ed_', 'elevation_'))]

X = train_df[features]
y = train_df['species_observed'].astype(int)

In [36]:
print(len(train_df))
train_df.isna().sum()

82784


Unnamed: 0,0
checklist_id,0
observer_id,0
type,0
observation_count,81
species_observed,0
state_code,0
locality_id,0
latitude,0
longitude,0
protocol_type,0


In [37]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.20, random_state=28)


In [38]:
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid') # binary prediction
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[
        tf.keras.metrics.AUC(name="roc_auc", curve="ROC"),
        tf.keras.metrics.AUC(name="pr_auc", curve="PR"),
        "accuracy"
    ])

model.fit(X_train, y_train, epochs=15, validation_data=(X_val, y_val), batch_size=32)


Epoch 1/15
[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.8428 - loss: 0.3751 - pr_auc: 0.4585 - roc_auc: 0.7829 - val_accuracy: 0.8523 - val_loss: 0.3562 - val_pr_auc: 0.5450 - val_roc_auc: 0.8195
Epoch 2/15
[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.8655 - loss: 0.3286 - pr_auc: 0.5475 - roc_auc: 0.8332 - val_accuracy: 0.8546 - val_loss: 0.3471 - val_pr_auc: 0.5648 - val_roc_auc: 0.8323
Epoch 3/15
[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.8656 - loss: 0.3259 - pr_auc: 0.5804 - roc_auc: 0.8442 - val_accuracy: 0.8570 - val_loss: 0.3395 - val_pr_auc: 0.5832 - val_roc_auc: 0.8387
Epoch 4/15
[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.8664 - loss: 0.3239 - pr_auc: 0.5845 - roc_auc: 0.8476 - val_accuracy: 0.8563 - val_loss: 0.3410 - val_pr_auc: 0.5828 - val_roc_auc: 0.8397
Epoch 5/15
[1m2070/2070[0m 

<keras.src.callbacks.history.History at 0x78751b1d2fd0>

In [39]:
from sklearn.isotonic import IsotonicRegression

# Get raw predicted probabilities
pred_probs = model.predict(X_train).flatten()

# Fit calibration model
cal = IsotonicRegression(out_of_bounds='clip')
cal.fit(pred_probs, y_train)

# Apply to validation
val_probs_raw = model.predict(X_val).flatten()
val_probs_cal = cal.predict(val_probs_raw)

from sklearn.metrics import average_precision_score

# Before calibration
pr_auc_raw = average_precision_score(y_val, val_probs_raw)

# After calibration
pr_auc_cal = average_precision_score(y_val, val_probs_cal)

print(f"PR AUC (raw): {pr_auc_raw:.4f}")
print(f"PR AUC (calibrated): {pr_auc_cal:.4f}")



[1m2070/2070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
[1m518/518[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
PR AUC (raw): 0.6040
PR AUC (calibrated): 0.5941


In [45]:
# Create a DataFrame for validation results
val_results = pd.DataFrame({
    'noca_true': y_val.values,
    'noca_pred_prob': val_probs_cal,
})

# Save to CSV
val_results.to_csv('val_preds_noca_no_ss_dnn.csv', index=False)

In [40]:
# Load prediction grid
grid = pd.read_csv("environmental_vars_pred_grid_w_lat_lon.csv")

# Add fixed effort values
import numpy as np
grid = grid.copy()
grid["observation_date"] = pd.to_datetime("2023-01-15")
grid["year"] = grid["observation_date"].dt.year
grid["day_of_year"] = grid["observation_date"].dt.dayofyear
grid["hours_of_day"] = 7.4  # approximate
grid["effort_distance_km"] = 2
grid["effort_hours"] = 1
grid["effort_speed_kmph"] = 2
grid["number_observers"] = 1

# Match column order
X_grid = grid[features]
X_grid_scaled = scaler.transform(X_grid)

# Predict with neural net + calibration
grid_probs_raw = model.predict(X_grid_scaled).flatten()
grid_probs_cal = cal.predict(grid_probs_raw)

# Add predictions back to grid
grid["encounter_rate"] = np.clip(grid_probs_cal, 0, 1)


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [13]:
grid.head()

Unnamed: 0,cell_id,x,y,elevation_mean,elevation_sd,ed_c00_water,pland_c00_water,ed_c01_evergreen_needleleaf,pland_c01_evergreen_needleleaf,ed_c02_evergreen_broadleaf,...,ed_c255_unclassified,pland_c255_unclassified,observation_date,year,day_of_year,hours_of_day,effort_distance_km,effort_hours,effort_speed_kmph,number_observers
0,2,-227623.70167,71016.284889,633.591003,48.028278,0.0,0.0,0.0,0.0,0,...,0,0,2023-01-15,2023,15,7.8,2,1,2,1
1,3,-224632.022042,71016.284889,556.44281,51.946083,0.0,0.0,0.0,0.0,0,...,0,0,2023-01-15,2023,15,7.8,2,1,2,1
2,4,-221640.342413,71016.284889,528.164001,43.113594,0.0,0.0,0.0,0.0,0,...,0,0,2023-01-15,2023,15,7.8,2,1,2,1
3,5,-218648.662784,71016.284889,609.500244,79.01123,0.0,0.0,0.0,0.0,0,...,0,0,2023-01-15,2023,15,7.8,2,1,2,1
4,6,-215656.983155,71016.284889,720.016785,76.634804,0.0,0.0,0.0,0.0,0,...,0,0,2023-01-15,2023,15,7.8,2,1,2,1


In [41]:
# Save for R
grid_output = grid[["cell_id", "x", "y", "encounter_rate"]]
grid_output.to_csv("dnn_grid_preds_baea_no_ss_new.csv", index=False)


In [None]:
threshold = 0.534  # or any value from validation tuning
grid_output["in_range"] = (grid_output["encounter_rate"] > threshold).astype(int)
grid_output.to_csv("junco_nn_predictions.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grid_output["in_range"] = (grid_output["encounter_rate"] > threshold).astype(int)


In [None]:
from sklearn.metrics import matthews_corrcoef, f1_score

threshold = 0.534
val_preds_binary = (val_probs_cal > threshold).astype(int)


In [None]:
mcc = matthews_corrcoef(y_val, val_preds_binary)
f1 = f1_score(y_val, val_preds_binary)

print(f"MCC: {mcc:.3f}")
print(f"F1 Score: {f1:.3f}")


MCC: 0.442
F1 Score: 0.634


In [None]:

from sklearn.metrics import matthews_corrcoef, f1_score

best_mcc, best_f1, best_thresh = -1, -1, 0
for t in np.linspace(0, 1, 100):
    preds = (val_probs_cal > t).astype(int)
    m = matthews_corrcoef(y_val, preds)
    f = f1_score(y_val, preds)
    if m > best_mcc:
        best_mcc = m
        best_f1 = f
        best_thresh = t

print(f"Best threshold: {best_thresh:.3f}, MCC: {best_mcc:.3f}, F1: {best_f1:.3f}")


Best threshold: 0.404, MCC: 0.324, F1: 0.635


In [None]:
results_df = pd.DataFrame({
    'obs': y_val,                    # true labels
    'pred': val_probs_cal,            # calibrated predicted probabilities
    'pred_binary': (val_probs_cal > best_thresh).astype(int)  # predicted labels
})
results_df.to_csv("dnn_preds2.csv", index=False)
