In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import os

import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score

# suppress runtime warnings
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
# config
BATCH_SIZE = 256
MODEL_DIR = 'models'
SAVE_FIGS = True     # whether to save figures folder

# make models directory
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)
model_save_path = os.path.join(MODEL_DIR, 'mlp_best_model.pth')

# seed
np.random.seed(2025)
SHAP_waterfall_ct = 3 # 3 random samples

# make figures and shap directories
if SAVE_FIGS and not os.path.exists('figures'):
    os.makedirs('figures')
if SAVE_FIGS and not os.path.exists('figures/shap'):
    os.makedirs('figures/shap')
if SAVE_FIGS and not os.path.exists('figures/lime'):
    os.makedirs('figures/lime')

In [4]:
data = pd.read_csv('data/pok_2010_2025_augmented.csv')
data['date'] = pd.to_datetime(data['date'])

# make pok as ratio greater than 0.1
data['pok'] = (data['pok_ratio'] > 0.1).astype(int)
print(f"Class distribution:\n{data['pok'].value_counts(normalize=True)}")

val_splitday = pd.to_datetime('2022-12-31')
test_splitday = pd.to_datetime('2023-12-31')

# y-ratio vector associated with X_test
y_test_ratio = data[data['date'] > test_splitday]['pok_ratio'].reset_index(drop=True)

# prepare data
data = data.drop(columns=['species', 'gear', 'weight', 'pok_ratio'])

data_train = data[data['date'] <= val_splitday]
data_val = data[(data['date'] > val_splitday) & (data['date'] <= test_splitday)]
data_test = data[data['date'] > test_splitday]

X_train = data_train.drop(columns=['pok', 'date']).reset_index(drop=True)
y_train = data_train['pok'].reset_index(drop=True)
X_val = data_val.drop(columns=['pok', 'date']).reset_index(drop=True)
y_val = data_val['pok'].reset_index(drop=True)
X_test = data_test.drop(columns=['pok', 'date']).reset_index(drop=True)
y_test = data_test['pok'].reset_index(drop=True)

print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}, Test samples: {len(X_test)}")
X_test.head()

Class distribution:
pok
0    0.78354
1    0.21646
Name: proportion, dtype: float64
Training samples: 1016881, Validation samples: 65776, Test samples: 108489


Unnamed: 0,latitude,longitude,depth,thetao,uo,vo,so,thetao_grad,chl,no3,nppv,o2,po4,si,day_cos,day_sin
0,64.9758,-24.0175,246.6252,5.228278,-0.043947,0.062258,35.019989,3.897163,0.069659,7.836014,0.01719,287.540741,0.58097,3.890184,0.999408,0.034398
1,64.9185,-24.0353,108.75152,5.502213,0.06531,0.211188,35.065769,6.025003,0.069659,7.836014,0.01719,287.540741,0.58097,3.890184,0.999408,0.034398
2,64.8673,-24.0747,64.917168,5.178473,-0.001831,0.319834,35.027618,4.765457,0.081872,6.923312,0.052279,292.967468,0.527286,3.415034,0.999408,0.034398
3,64.9198,-24.018,80.46536,5.502213,0.06531,0.211188,35.065769,6.025003,0.069659,7.836014,0.01719,287.540741,0.58097,3.890184,0.999408,0.034398
4,63.5857,-20.4557,41.604976,6.356242,0.012207,0.001221,34.681232,3.200496,0.093789,5.887146,0.0622,286.737061,0.464209,3.390041,0.999408,0.034398


In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)  # only transform
X_test_scaled = scaler.transform(X_test) # only transform

# # make dataframes again (for feature selection later)
# X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
# X_val_df   = pd.DataFrame(X_val_scaled,   columns=X_val.columns,   index=X_val.index)
# X_test_df  = pd.DataFrame(X_test_scaled,  columns=X_test.columns,  index=X_test.index)

# make tensors
X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32)
X_val_t = torch.tensor(X_val_scaled, dtype=torch.float32)
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_val_t = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)
y_test_t = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# make dataset
trainset = TensorDataset(X_train_t, y_train_t)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)

# make val dataset
valset = TensorDataset(X_val_t, y_val_t)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)


In [5]:
# fit logistic regression model
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_scaled, y_train) 
print(f"Logistic Regression training accuracy: {logreg.score(X_train_scaled, y_train):.4f}")
print(f"Logistic Regression test accuracy: {logreg.score(X_test_scaled, y_test):.4f}")

# roc auc
y_test_probs_logreg = logreg.predict_proba(X_test_scaled)[:, 1]
test_roc_auc_logreg = roc_auc_score(y_test, y_test_probs_logreg)
print(f"Logistic Regression test ROC AUC: {test_roc_auc_logreg:.4f}")


Logistic Regression training accuracy: 0.7956
Logistic Regression test accuracy: 0.8001
Logistic Regression test ROC AUC: 0.8244


In [9]:
# random forest model
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=2025)
rf_model.fit(X_train_scaled, y_train)
print(f"Random Forest training accuracy: {rf_model.score(X_train_scaled, y_train):.4f}")
print(f"Random Forest test accuracy: {rf_model.score(X_test_scaled, y_test):.4f}")

# roc auc
y_test_probs_rf = rf_model.predict_proba(X_test_scaled)[:, 1]
test_roc_auc_rf = roc_auc_score(y_test, y_test_probs_rf)
print(f"Random Forest test ROC AUC: {test_roc_auc_rf:.4f}")

Random Forest training accuracy: 1.0000
Random Forest test accuracy: 0.8607
Random Forest test ROC AUC: 0.9075


In [17]:
# XGBoost model
import xgboost as xgb
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=2025)
xgb_model.fit(X_train_scaled, y_train)
print(f"XGBoost training accuracy: {xgb_model.score(X_train_scaled, y_train):.4f}")
print(f"XGBoost test accuracy: {xgb_model.score(X_test_scaled, y_test):.4f}")
# roc auc
y_test_probs_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]
test_roc_auc_xgb = roc_auc_score(y_test, y_test_probs_xgb)
print(f"XGBoost test ROC AUC: {test_roc_auc_xgb:.4f}")

XGBoost training accuracy: 0.8850
XGBoost test accuracy: 0.8613
XGBoost test ROC AUC: 0.9077


In [18]:
# KNN model
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
print(f"KNN training accuracy: {knn_model.score(X_train_scaled, y_train):.4f}")
print(f"KNN test accuracy: {knn_model.score(X_test_scaled, y_test):.4f}")
# roc auc
y_test_probs_knn = knn_model.predict_proba(X_test_scaled)[:, 1]
test_roc_auc_knn = roc_auc_score(y_test, y_test_probs_knn)
print(f"KNN test ROC AUC: {test_roc_auc_knn:.4f}")

KNN training accuracy: 0.9133
KNN test accuracy: 0.8272
KNN test ROC AUC: 0.8280


# NN training (with adding features)

In [7]:
class PollockClassifier(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64,1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

loss_function = nn.BCELoss()

In [10]:
results = []   # to store results: (num_features, used_feature_names, accuracy, auc)

selected_features = []  # add features step by step
feature_steps = [
    ['latitude'],
    ['longitude'],
    ['depth'],
    ['po4'],
    ['no3'],
    ['thetao'],
    ['day_cos', 'day_sin'],  
    ['o2'],
    ['so'],
    ['si'],
    ['chl'],
    ['vo', 'uo'],
    ['thetao_grad'],
    ['nppv']
]


for step_features in feature_steps:
    selected_features.extend(step_features)
    print("Using:", selected_features)

    # subset data to selected features
    X_train_subset = X_train[selected_features]
    X_val_subset = X_val[selected_features]
    X_test_subset = X_test[selected_features]

    # scale the data
    subscaler = StandardScaler()
    X_train_k_scaled = subscaler.fit_transform(X_train_subset)
    X_val_k_scaled = subscaler.transform(X_val_subset)
    X_test_k_scaled = subscaler.transform(X_test_subset)

    X_train_k = torch.tensor(X_train_k_scaled, dtype=torch.float32)
    X_val_k   = torch.tensor(X_val_k_scaled,   dtype=torch.float32)
    X_test_k  = torch.tensor(X_test_k_scaled,  dtype=torch.float32)
    trainloader = DataLoader(TensorDataset(X_train_k, y_train_t), batch_size=BATCH_SIZE, shuffle=True)
    valloader   = DataLoader(TensorDataset(X_val_k,   y_val_t),   batch_size=BATCH_SIZE, shuffle=False)

    # new model for current feature set
    model = PollockClassifier(in_features=len(selected_features))
    loss_fn = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

    # early stopping parameters
    best_val_loss = float("inf")
    patience = 5
    counter = 0

    train_losses = []
    val_losses = []
    val_accuracies = []

    # training loop
    for epoch in range(50):
        model.train()
        total_loss = 0
        for xb, yb in trainloader:
            optimizer.zero_grad()
            preds = model(xb)
            loss = loss_function(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * xb.size(0)

        avg_train_loss = total_loss / len(trainloader.dataset)
        train_losses.append(avg_train_loss)

        # validation loop
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for xb, yb in valloader:
                val_preds = model(xb)
                loss = loss_function(val_preds, yb)
                val_loss += loss.item() * xb.size(0)
                val_preds = (val_preds > 0.5).float()
                correct += (val_preds == yb).sum().item()
                total += yb.size(0)

        avg_val_loss = val_loss / len(valloader.dataset)
        val_losses.append(avg_val_loss)

        val_accuracy = correct / total
        val_accuracies.append(val_accuracy)

        #print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

        # early stopping check
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            counter = 0
            best_model_state = model.state_dict()

        else:
            counter += 1

        if counter >= patience:
            print("Early stopping triggered at epoch", epoch+1, "after", patience, "epochs without improvement.")
            print(f"Best Val Loss: {best_val_loss:.4f}")
            break

    # load best model state
    model.load_state_dict(best_model_state)

    # evaluate on test set
    model.eval()
    with torch.no_grad():
        test_probs = model(X_test_k).numpy()

    acc = accuracy_score(y_test_t.numpy(), test_probs > 0.5)
    auc = roc_auc_score(y_test_t.numpy(), test_probs)

    print(f"#Features={len(selected_features)} ({selected_features}): Accuracy={acc:.3f}, AUC={auc:.3f}")

    results.append((len(selected_features), step_features, acc, auc))

Using: ['latitude']
Early stopping triggered at epoch 13 after 5 epochs without improvement.
Best Val Loss: 0.4256
#Features=1 (['latitude']): Accuracy=0.801, AUC=0.751
Using: ['latitude', 'longitude']
Early stopping triggered at epoch 21 after 5 epochs without improvement.
Best Val Loss: 0.3312
#Features=2 (['latitude', 'longitude']): Accuracy=0.840, AUC=0.886
Using: ['latitude', 'longitude', 'depth']
Early stopping triggered at epoch 12 after 5 epochs without improvement.
Best Val Loss: 0.3124
#Features=3 (['latitude', 'longitude', 'depth']): Accuracy=0.846, AUC=0.892
Using: ['latitude', 'longitude', 'depth', 'po4']
Early stopping triggered at epoch 29 after 5 epochs without improvement.
Best Val Loss: 0.3089
#Features=4 (['latitude', 'longitude', 'depth', 'po4']): Accuracy=0.845, AUC=0.895
Using: ['latitude', 'longitude', 'depth', 'po4', 'no3']
Early stopping triggered at epoch 33 after 5 epochs without improvement.
Best Val Loss: 0.3057
#Features=5 (['latitude', 'longitude', 'depth

In [11]:
results

[(1, ['latitude'], 0.8005143378591378, 0.7509501352118525),
 (2, ['longitude'], 0.840426218326282, 0.8858497173944955),
 (3, ['depth'], 0.8456249020638037, 0.8920457040200631),
 (4, ['po4'], 0.8446847145793582, 0.8947035744786854),
 (5, ['no3'], 0.8515978578473393, 0.8957560120649195),
 (6, ['thetao'], 0.8427490344643236, 0.8885551955803208),
 (8, ['day_cos', 'day_sin'], 0.8602070255970652, 0.9045062950315929),
 (9, ['o2'], 0.8541326770455991, 0.8976264347843794),
 (10, ['so'], 0.8565107983297846, 0.8976249760802103),
 (11, ['si'], 0.8527684834407175, 0.8973505134557055),
 (12, ['chl'], 0.851975776345989, 0.8963875746012786),
 (14, ['vo', 'uo'], 0.8545105955442487, 0.8984997593635493),
 (15, ['thetao_grad'], 0.8529804865009356, 0.897780443745205),
 (16, ['nppv'], 0.8532293596585829, 0.8967922427056016)]