# Full script from loading the dataset to model training.

In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import re
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
import scipy.stats as stats
from keras.preprocessing.sequence import pad_sequences
from itertools import product
from keras.models import Sequential
from keras.layers import LSTM, Dense, Masking
import random
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.sequence import pad_sequences

import sys
sys.path.append('./src')

from data_utils import load_fish_csvs
from feature_utils import (
    select_frequency_columns,
    reduce_features_by_variance_and_correlation,
    get_top_features_by_random_forest,
    apply_pca
)

from training_utils import (
    evaluate_lofo_models, 
    evaluate_lofo_xgboost_multi, 
    tune_xgboost_with_cv_multi, 
    evaluate_lofo_xgboost_smote, 
    evaluate_lofo_rf_smote,
    evaluate_lstm_lofo_kfold
)


2025-03-26 16:30:19.267050: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Functions available: ['Dict', 'List', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'load_fish_csvs', 'merge_and_parse_timestamps', 'pd']


In [2]:
import importlib
import training_utils

# importlib.reload() re-executes the module, but names that were bound earlier
# via `from training_utils import ...` keep pointing at the pre-reload objects.
# Re-import them after the reload so that edits to src/training_utils.py are
# actually picked up by the cells below.
training_utils = importlib.reload(training_utils)
from training_utils import (
    evaluate_lofo_models,
    evaluate_lofo_xgboost_multi,
    tune_xgboost_with_cv_multi,
    evaluate_lofo_xgboost_smote,
    evaluate_lofo_rf_smote,
    evaluate_lstm_lofo_kfold,
)

# Keep the module as the cell's last expression so its repr is still displayed.
training_utils

<module 'training_utils' from '/Users/scarlett/Documents/MScAC/STA2453/Code/notebooks/./src/training_utils.py'>

## Load the datasets

In [3]:
# CSV files to load, listed per file-name prefix and then concatenated in the
# same order as before (all LT files first, then all SMB files).
lt_files = [
    'LT009.csv', 'LT010.csv', 'LT011.csv', 'LT012.csv', 'LT014.csv',
    'LT016.csv', 'LT017.csv', 'LT018.csv', 'LT021.csv',
]
smb_files = [
    'SMB001.csv', 'SMB002.csv', 'SMB005.csv', 'SMB006.csv', 'SMB007.csv',
    'SMB011.csv', 'SMB012.csv',
]
df_list = lt_files + smb_files

# load_fish_csvs comes from src/data_utils.py; its result is consumed below
# through `.values()`, i.e. it is used as a mapping of DataFrames.
dataframes = load_fish_csvs(df_list)

In [4]:
# Merge the per-file frames returned by load_fish_csvs into a single table
df = pd.concat(dataframes.values(), ignore_index=True)

# Convert 'dateProcessed' to datetime
df["dateProcessed"] = pd.to_datetime(df["dateProcessed"])

# Build the full ping timestamp: calendar date of 'dateProcessed' plus the clock
# time parsed from the 'Ping_time' string. Parsing with a time-only format puts
# every value on the same placeholder date, so subtracting that date's midnight
# leaves the time of day as a Timedelta. This is computed column-wise instead of
# calling pd.Timestamp.combine once per row through df.apply(axis=1).
ping_clock = pd.to_datetime(df["Ping_time"].str.strip(), format="%H:%M:%S.%f")
ping_time_of_day = ping_clock - ping_clock.dt.normalize()
df["Ping_time"] = df["dateProcessed"].dt.normalize() + ping_time_of_day


# Frequency selection
f_number_cols = select_frequency_columns(df)
id_cols = ["fishNum", "species", "dateProcessed", "Ping_time"]
# Take an explicit copy so the new label column is not written into a view of df
df_filtered = df[id_cols + f_number_cols].copy()
df_filtered['species_label'] = df_filtered['species'].astype('category').cat.codes

# NOTE(review): the feature reduction, RF ranking, scaler and PCA below are all
# fit on every fish, including the ones later held out during leave-one-fish-out
# evaluation. Unless the training helpers refit these per fold, this may leak
# information into the reported accuracies — TODO confirm.

# Feature reduction
selected_features, corr_matrix = reduce_features_by_variance_and_correlation(df_filtered, f_number_cols)

# RF selection
top_features_rf = get_top_features_by_random_forest(df_filtered, list(selected_features), 'species_label')

# Standardize for PCA
X = df_filtered[top_features_rf]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
X_pca, pca_variance = apply_pca(X_scaled)


# Logistic Regression

In [5]:
# Feature matrix restricted to the columns ranked by the random-forest selector
X_rf_selected = df_filtered[top_features_rf]

# One identically-configured logistic regression per feature representation.
# The keys name the feature set each model is evaluated on, not the estimator.
model_names = ("RandomForest_Selected", "PCA_Transformed")
models = {
    name: LogisticRegression(class_weight="balanced", max_iter=500, random_state=42)
    for name in model_names
}

# Run the evaluation helper from src/training_utils.py and show its summary table
results, summary_df = evaluate_lofo_models(df_filtered, X_rf_selected, X_pca, models)
print(summary_df)


                       Mean Accuracy
RandomForest_Selected       0.658490
PCA_Transformed             0.663851


# XGB Model

In [7]:
# Candidate inputs for XGBoost: the RF-selected columns and the PCA components.
# The PCA array is wrapped in a DataFrame that shares df_filtered's index.
feature_sets = {}
feature_sets["XGB_RF_Selected"] = X_rf_selected
feature_sets["XGB_PCA_Transformed"] = pd.DataFrame(X_pca, index=df_filtered.index)

# NOTE(review): the third argument here is the fish id column, while the tuning
# cell passes 'species_label' in the same position to a sibling helper — confirm
# against src/training_utils.py that each helper receives what it expects.
fish_ids = df_filtered['fishNum']
results_xgb_multi, summary_xgb_multi = evaluate_lofo_xgboost_multi(df_filtered, feature_sets, fish_ids)
print(summary_xgb_multi)


                     Mean Accuracy
XGB_RF_Selected           0.561543
XGB_PCA_Transformed       0.579742


### Hyperparameter tuning for XGBoost

In [8]:
# Same two feature representations as in the XGBoost evaluation cell, rebuilt
# here so this cell does not depend on that cell's `feature_sets` variable.
X_pca_frame = pd.DataFrame(X_pca, index=df_filtered.index)
feature_sets = {
    "XGB_RF_Selected": X_rf_selected,
    "XGB_PCA_Transformed": X_pca_frame,
}

# Hyperparameter search via the helper in src/training_utils.py; the encoded
# species column is passed as the third argument.
species_labels = df_filtered['species_label']
summary_df_xgb_tuning = tune_xgboost_with_cv_multi(df_filtered, feature_sets, species_labels)
print(summary_df_xgb_tuning)


Tuning XGBoost for: XGB_RF_Selected
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Tuning XGBoost for: XGB_PCA_Transformed
Fitting 5 folds for each of 20 candidates, totalling 100 fits
            Model Name  Best Accuracy  \
0      XGB_RF_Selected       0.718488   
1  XGB_PCA_Transformed       0.728677   

                                     Best Parameters  
0  {'subsample': 0.6, 'n_estimators': 500, 'max_d...  
1  {'subsample': 1.0, 'n_estimators': 500, 'max_d...  


### Use SMOTE to balance data
#### XGBOOST

In [9]:
# Call the XGBoost + SMOTE helper from src/training_utils.py on the RF-ranked
# feature columns and print the summary table it returns.
smote_xgb_features = top_features_rf
summary_df_smote = evaluate_lofo_xgboost_smote(df_filtered, smote_xgb_features)
print(summary_df_smote)

          Model Name  Mean Accuracy
0  XGB_RF_Stat+SMOTE         0.6875


#### RF

In [10]:
# Call the random-forest + SMOTE helper from src/training_utils.py on the
# RF-ranked feature columns and print the summary table it returns.
smote_rf_features = top_features_rf
summary_df_rf_smote = evaluate_lofo_rf_smote(df_filtered, smote_rf_features)
print(summary_df_rf_smote)


         Model Name  Mean LOFO Accuracy
0  RF_RF_Stat+SMOTE                0.75


## The classical ML models did not perform well because they do not account for the time-series nature of this dataset. We need to consider models designed for time series.
In addition, leaving one fish out produces a test set that contains only one species. It is better to leave out a pair of fish (one from each species) instead.

### LSTM

In [11]:
# Sort by fish and time to maintain sequence order
df_lstm = df_filtered.sort_values(by=["fishNum", "Ping_time"])

# Select the frequency feature columns by name: everything that is not an
# identifier/label column. A positional slice (columns[4:-1]) would silently
# pick the wrong columns if df_filtered's column layout ever changed.
non_feature_cols = {"fishNum", "species", "dateProcessed", "Ping_time", "species_label"}
frequency_cols = pd.Index([col for col in df_lstm.columns if col not in non_feature_cols])

# Normalize the frequency features
# NOTE(review): the scaler is fit on all fish, including those that are later
# held out in the LOPO folds — possible leakage; confirm whether this is intended.
scaler = StandardScaler()
df_lstm[frequency_cols] = scaler.fit_transform(df_lstm[frequency_cols])

# Group by fish to create one (timesteps x features) sequence per fish
grouped = df_lstm.groupby("fishNum")

fish_sequences = []
fish_labels = []
fish_nums = []

for fish_id, group in grouped:
    fish_sequences.append(group[frequency_cols].values)
    # The first row's species is used as the label for the whole fish
    fish_labels.append(group["species"].iloc[0])
    fish_nums.append(fish_id)

fish_sequences = np.array(fish_sequences, dtype=object)
fish_labels = np.array(fish_labels)

# Pad to same length (zeros appended after the end of shorter sequences)
max_timesteps = max(seq.shape[0] for seq in fish_sequences)
fish_sequences_padded = pad_sequences(fish_sequences, maxlen=max_timesteps, dtype="float32", padding="post", truncating="post")

# Encode labels: np.unique returns the species names sorted, so codes follow
# that sorted order.
species_mapping = {species: idx for idx, species in enumerate(np.unique(fish_labels))}
fish_labels_encoded = np.array([species_mapping[label] for label in fish_labels])

# The pair generation below only looks at labels 0 and 1; fail loudly instead of
# silently dropping fish if the data ever contains a different number of species.
assert len(species_mapping) == 2, (
    f"LOPO pairing expects exactly two species, found {len(species_mapping)}: {list(species_mapping)}"
)

# LOPO pair generation: every (label-0 fish, label-1 fish) combination
# NOTE(review): the lt_/smb_ names assume label 0 is the LT species and label 1
# the SMB species, which depends on how the species strings sort — confirm.
lt_fish = [fish for fish, label in zip(fish_nums, fish_labels_encoded) if label == 0]
smb_fish = [fish for fish, label in zip(fish_nums, fish_labels_encoded) if label == 1]
lopo_pairs = list(product(lt_fish, smb_fish))

# Fixed-seed random subset of pairs for the quick run; capped at the number of
# available pairs because random.sample raises ValueError when asked for more.
n_random_pairs = min(5, len(lopo_pairs))
random.seed(42)
random_lopo_pairs = random.sample(lopo_pairs, n_random_pairs)

# Define input shape as (timesteps, features)
input_shape = (max_timesteps, fish_sequences_padded.shape[2])



In [12]:
# Shared inputs for the LSTM evaluation helper from src/training_utils.py
lstm_eval_inputs = {
    "fish_sequences_padded": fish_sequences_padded,
    "fish_labels_encoded": fish_labels_encoded,
    "fish_nums": fish_nums,
    "lopo_pairs": lopo_pairs,
    "random_lopo_pairs": random_lopo_pairs,
    "input_shape": input_shape,
}

# use_all_pairs=False: quick run (the captured output reports 5 pairs)
summary_df_lstm = evaluate_lstm_lofo_kfold(use_all_pairs=False, **lstm_eval_inputs)
print(summary_df_lstm)


Running LOPO on 5 pairs...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 891ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 821ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 940ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 931ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 790ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 756ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 840ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 787ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
  Model Name  Mean LOPO Accuracy  Mean K-Fold Accuracy
0       LSTM                 0.8                   0.8


In [13]:
# For all pairs: same inputs as the quick run, but with use_all_pairs=True.
# Note that this rebinds `summary_df_lstm`, replacing the quick-run summary.
lstm_eval_inputs_all = {
    "fish_sequences_padded": fish_sequences_padded,
    "fish_labels_encoded": fish_labels_encoded,
    "fish_nums": fish_nums,
    "lopo_pairs": lopo_pairs,
    "random_lopo_pairs": random_lopo_pairs,
    "input_shape": input_shape,
}
summary_df_lstm = evaluate_lstm_lofo_kfold(use_all_pairs=True, **lstm_eval_inputs_all)
print(summary_df_lstm)


Running LOPO on 63 pairs...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 736ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 749ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 763ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 717ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 777ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 738ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 702ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 775ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 824ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 877ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 875ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 942ms/step
[1m1/1[0m [32m━━━━━━━━━