# Full script from loading the dataset to model training.

In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import re
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
import scipy.stats as stats
from keras.preprocessing.sequence import pad_sequences
from itertools import product
from keras.models import Sequential
from keras.layers import LSTM, Dense, Masking
import random
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.sequence import pad_sequences

import sys
sys.path.append('./src')

from data_utils import load_fish_csvs, merge_and_parse_timestamps
from feature_utils import (
    select_frequency_columns,
    reduce_features_by_variance_and_correlation,
    get_top_features_by_random_forest,
    apply_pca
)

from training_utils import (
    evaluate_lofo_models, 
    evaluate_lofo_xgboost_multi, 
    tune_xgboost_with_cv_multi, 
    evaluate_lofo_xgboost_smote, 
    evaluate_lofo_rf_smote,
    evaluate_lstm_lofo_kfold
)


2025-03-30 18:45:58.275594: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# # Reloads the training_utils module to pick up code changes.
# import importlib
# import training_utils

# importlib.reload(training_utils)

## Load the datasets

In [3]:
# Per-fish CSV files, listed by filename prefix (LT* first, then SMB*).
# Note: despite its name, `df_list` holds file names, not DataFrames.
lt_files = [
    'LT009.csv', 'LT010.csv', 'LT011.csv', 'LT012.csv', 'LT014.csv',
    'LT016.csv', 'LT017.csv', 'LT018.csv', 'LT021.csv',
]
smb_files = [
    'SMB001.csv', 'SMB002.csv', 'SMB005.csv', 'SMB006.csv', 'SMB007.csv',
    'SMB011.csv', 'SMB012.csv',
]
df_list = lt_files + smb_files

# Read every file through the project loader. The result is consumed later via
# `.values()`, so it is presumably a dict-like of DataFrames — TODO confirm in data_utils.
dataframes = load_fish_csvs(df_list)

In [4]:
# Stack the per-fish frames into a single table, then hand it to the project
# helper that merges/parses the timestamp columns.
df = merge_and_parse_timestamps(
    pd.concat(dataframes.values(), ignore_index=True)
)

# Keep the identifier columns plus the columns chosen by
# select_frequency_columns, and append an integer species code as the target.
f_number_cols = select_frequency_columns(df)
meta_cols = ["fishNum", "species", "dateProcessed", "Ping_time"]
df_filtered = df[meta_cols + f_number_cols].assign(
    species_label=lambda frame: frame['species'].astype('category').cat.codes
)

# First-pass feature reduction (variance/correlation based, per the helper's name).
selected_features, corr_matrix = reduce_features_by_variance_and_correlation(
    df_filtered, f_number_cols
)

# Second pass: keep the features a random forest ranks highest for the target.
top_features_rf = get_top_features_by_random_forest(
    df_filtered, list(selected_features), 'species_label'
)

# NOTE(review): the feature selection above and the scaler/PCA below are fitted
# on every fish, including those later held out during leave-one-fish-out
# evaluation. Unless the evaluation helpers refit per fold, this can leak
# information into the test folds — TODO confirm in training_utils.

# Standardize the selected features before PCA.
X = df_filtered[top_features_rf]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the standardized features with the project's PCA helper.
X_pca, pca_variance = apply_pca(X_scaled)


# Logistic Regression

In [5]:
# Feature set 1: the RF-selected columns taken straight from df_filtered
# (i.e. not the standardized copy held in X_scaled).
X_rf_selected = df_filtered[top_features_rf]

# Settings common to both logistic-regression variants.
shared_logreg_params = dict(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)

models = {
    # Variant trained on the RF-selected features.
    # NOTE(review): scikit-learn documents that 'saga' converges quickly only on
    # similarly scaled features; confirm evaluate_lofo_models scales its input.
    "LogReg_RF": LogisticRegression(solver='saga', **shared_logreg_params),

    # Variant trained on the PCA components; C=0.1 gives stronger L2
    # regularization than the default.
    "LogReg_PCA": LogisticRegression(penalty='l2', C=0.1, **shared_logreg_params),
}

# Run the leave-one-fish-out evaluation for both variants.
print("Running LOFO evaluation...")
results, summary_df = evaluate_lofo_models(df_filtered, X_rf_selected, X_pca, models)

# Report the summary table, rounded to three decimals.
print("\nModel Performance Summary:")
print(summary_df.round(3))


Running LOFO evaluation...

Model Performance Summary:
            Mean Accuracy
LogReg_RF           0.662
LogReg_PCA          0.661


# XGB Model

In [6]:
# Wrap the PCA array in a DataFrame that shares df_filtered's row index so both
# feature sets line up with the same rows.
X_pca_df = pd.DataFrame(X_pca, index=df_filtered.index)

# Candidate inputs: original-space RF-selected features vs. PCA components.
feature_sets = {
    "XGB_RF_Selected": X_rf_selected,
    "XGB_PCA_Transformed": X_pca_df,
}

# Leave-one-fish-out evaluation, grouping rows by fish identifier.
print("Starting XGBoost LOFO evaluation...")
results_xgb_multi, summary_xgb_multi = evaluate_lofo_xgboost_multi(
    df_filtered,
    feature_sets,
    groups=df_filtered['fishNum'],
)

# Report the per-feature-set summary, rounded to three decimals.
print("\nXGBoost Model Comparison:")
print(summary_xgb_multi.round(3))


Starting XGBoost LOFO evaluation...

XGBoost Model Comparison:
                     Mean Accuracy
XGB_RF_Selected              0.562
XGB_PCA_Transformed          0.550


### Hyperparameter tuning for XGBoost

In [7]:
# Same two candidate inputs as the untuned XGBoost run: RF-selected features in
# the original space, and the PCA components aligned to df_filtered's index.
feature_sets = {
    "XGB_RF_Selected": X_rf_selected,
    "XGB_PCA_Transformed": pd.DataFrame(X_pca, index=df_filtered.index),
}

# Tune one XGBoost model per feature set against the integer species label.
# NOTE(review): no fish grouping is passed here, so the tuned CV score may not
# be comparable with the leave-one-fish-out scores — confirm the CV scheme in
# training_utils.
print("Starting XGBoost hyperparameter tuning...")
summary_df_xgb_tuning = tune_xgboost_with_cv_multi(
    df_filtered,
    feature_sets,
    df_filtered['species_label'],
)

# Show the best score per feature set, best first.
print("\nXGBoost Tuning Results:")
tuning_table = summary_df_xgb_tuning[["Model Name", "Best Accuracy"]].round(3)
print(tuning_table.sort_values("Best Accuracy", ascending=False))


Starting XGBoost hyperparameter tuning...
Tuning XGBoost for: XGB_RF_Selected
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Tuning XGBoost for: XGB_PCA_Transformed
Fitting 5 folds for each of 20 candidates, totalling 100 fits

XGBoost Tuning Results:
            Model Name  Best Accuracy
1  XGB_PCA_Transformed          0.723
0      XGB_RF_Selected          0.721


### Use SMOTE to balance data
SMOTE (Synthetic Minority Over-sampling Technique) creates synthetic samples of the minority class to balance the class distribution.
#### XGBOOST


In [8]:
# XGBoost with SMOTE class balancing, restricted to the features ranked
# highest by the random-forest selection step.
summary_df_smote = evaluate_lofo_xgboost_smote(df_filtered, top_features_rf)

print("XGBoost with SMOTE Results:")
print(summary_df_smote.round(3))

XGBoost with SMOTE Results:
          Model Name  Mean Accuracy
0  XGB_RF_Stat+SMOTE          0.688


#### RF

In [9]:
# Random Forest with SMOTE class balancing, on the same RF-selected features
# used for the XGBoost + SMOTE run.
summary_df_rf_smote = evaluate_lofo_rf_smote(df_filtered, top_features_rf)

print("\nRandom Forest with SMOTE Results:")
print(summary_df_rf_smote.round(3))


Random Forest with SMOTE Results:
         Model Name  Mean LOFO Accuracy
0  RF_RF_Stat+SMOTE               0.688


## The classical ML models did not perform well because they do not account for the time-series nature of this dataset. We need to consider models designed for time series.
In addition, leave-one-fish-out (LOFO) evaluation leaves only one species in each test set. It is better to leave out a pair of fish (one of each species) instead.

### LSTM

In [10]:
# Build one padded sequence per fish for the LSTM, plus the list of
# (label-0 fish, label-1 fish) hold-out pairs used for leave-one-pair-out (LOPO).

# How many hold-out pairs to sample for the quick LOPO run, and the seed that
# makes that sample repeatable.
N_RANDOM_LOPO_PAIRS = 5
LOPO_SEED = 42

# Sort by fish and time so each fish's rows are ordered by Ping_time.
df_lstm = df_filtered.sort_values(by=["fishNum", "Ping_time"])

# Select the frequency feature columns by name. The previous positional slice
# (`columns[4:-1]`) returned the same columns but would silently pick the wrong
# ones if a column were ever added to or reordered in df_filtered.
frequency_cols = list(f_number_cols)

# Normalize the frequency features.
# NOTE(review): the scaler is fitted on every fish, including those later held
# out, so test folds influence the normalization — confirm whether this is
# acceptable or should be refit per fold inside the evaluation helper.
scaler = StandardScaler()
df_lstm[frequency_cols] = scaler.fit_transform(df_lstm[frequency_cols])

# Group by fish to create sequences.
grouped = df_lstm.groupby("fishNum")

fish_sequences = []
fish_labels = []
fish_nums = []

for fish_id, group in grouped:
    # One (timesteps, features) array per fish.
    fish_sequences.append(group[frequency_cols].values)
    # The first row's species is taken as the label for the whole fish.
    fish_labels.append(group["species"].iloc[0])
    fish_nums.append(fish_id)

fish_sequences = np.array(fish_sequences, dtype=object)
fish_labels = np.array(fish_labels)

# Pad every sequence at the end up to the longest fish's length; since maxlen
# equals the maximum length, no sequence is truncated.
max_timesteps = max(seq.shape[0] for seq in fish_sequences)
fish_sequences_padded = pad_sequences(
    fish_sequences,
    maxlen=max_timesteps,
    dtype="float32",
    padding="post",
    truncating="post",
)

# Encode labels: codes follow np.unique's sorted order of the species strings.
species_mapping = {species: idx for idx, species in enumerate(np.unique(fish_labels))}
fish_labels_encoded = np.array([species_mapping[label] for label in fish_labels])

# LOPO pair generation: every combination of one label-0 fish and one label-1 fish.
# NOTE(review): the names assume code 0 is LT and code 1 is SMB, which holds only
# if the LT species string sorts first — verify against `species_mapping`.
lt_fish = [fish for fish, label in zip(fish_nums, fish_labels_encoded) if label == 0]
smb_fish = [fish for fish, label in zip(fish_nums, fish_labels_encoded) if label == 1]
lopo_pairs = list(product(lt_fish, smb_fish))
if not lopo_pairs:
    # Fail with a clear message instead of random.sample's generic ValueError.
    raise ValueError("No (label 0, label 1) fish pairs available for LOPO evaluation.")

# Seed the global RNG, then sample the pairs. The sample size is capped so that
# datasets with fewer pairs than requested still work.
random.seed(LOPO_SEED)
random_lopo_pairs = random.sample(
    lopo_pairs, min(N_RANDOM_LOPO_PAIRS, len(lopo_pairs))
)

# Define input shape: (timesteps, features) of a single padded sequence.
input_shape = (max_timesteps, fish_sequences_padded.shape[2])



In [11]:
# Inputs shared by every LSTM evaluation run (sampled-pair or all-pair).
lstm_eval_inputs = dict(
    fish_sequences_padded=fish_sequences_padded,
    fish_labels_encoded=fish_labels_encoded,
    fish_nums=fish_nums,
    lopo_pairs=lopo_pairs,
    random_lopo_pairs=random_lopo_pairs,
    input_shape=input_shape,
)

# use_all_pairs=False requests the quick run rather than the full pair list.
summary_df_lstm = evaluate_lstm_lofo_kfold(**lstm_eval_inputs, use_all_pairs=False)

# Transpose so the single-row summary prints as a metric-per-line table.
print("\nLSTM Performance Summary:")
print(summary_df_lstm.round(3).T)


Running LOPO on 5 pairs...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 987ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 917ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 862ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 943ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 905ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 828ms/step

LSTM Performance Summary:
                         0
Model Name            LSTM
Mean LOPO Accuracy     0.8
Mean K-Fold Accuracy   0.8


In [None]:
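# # For all pairs (every label-0 x label-1 fish combination; slow to run).
# # NOTE: the output shown below this cell appears to come from an earlier run, made before the cell was commented out.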
# summary_df_lstm = evaluate_lstm_lofo_kfold(
#     fish_sequences_padded=fish_sequences_padded,
#     fish_labels_encoded=fish_labels_encoded,
#     fish_nums=fish_nums,
#     lopo_pairs=lopo_pairs,
#     random_lopo_pairs=random_lopo_pairs,
#     input_shape=input_shape,
#     use_all_pairs=True 
# )
# print("\nLSTM Performance Summary:")
# print(summary_df_lstm.round(3).T)


Running LOPO on 63 pairs...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 736ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 749ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 763ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 717ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 777ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 738ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 702ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 775ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 824ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 877ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 875ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 942ms/step
[1m1/1[0m [32m━━━━━━━━━