## Load Packages

In [None]:
# Load IPython's autoreload extension so the %autoreload magic is available.
%load_ext autoreload

In [None]:
# Mode 2: re-import modified modules automatically before running each cell.
%autoreload 2

In [None]:
import sklearn
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
from scipy.integrate import trapezoid

In [None]:
import sys
# Add the current working directory to the module search path before
# importing `utils` (presumably a helper module next to this notebook).
sys.path.append('./')
import utils

## Read Data

In [None]:
# Read the spectra table (first CSV column becomes the index), transpose it,
# and discard the columns labelled 400 and 402.
# After the transpose the later cells treat kb.index as stations and
# kb.columns as wavelengths.
kb = (
    pd.read_csv('./Kbrevis_Rrs_1nm_400-720_PACE-OCI.csv', index_col=0)
    .T
    .drop(columns=[400, 402])
)
kb.index

In [None]:
# Area-normalise every spectrum: divide each station's spectrum by its
# trapezoidal integral over wavelength.
wavelengths = list(kb.columns)

# One integral per station (rows of kb), computed in a single vectorised call
# instead of transposing the whole frame once per station.
areas = trapezoid(kb.to_numpy(dtype=float), x=wavelengths, axis=1)

# Divide row-wise, then transpose so the result keeps the original layout:
# index = wavelengths, columns = stations.
kb_area_norm = kb.div(areas, axis=0).T

In [None]:
# Plot every column of the area-normalised table (one line per column), no legend.
kb_area_norm.plot(legend=False)

In [None]:
# Plot the un-normalised spectra: transpose so each row of kb becomes one line.
kb.T.plot(legend=False)

In [None]:
# Load the target table (first CSV column becomes the index); its 'CellsperL'
# column is used as the regression target in later cells.
target_df = pd.read_csv('./kb_cellcounts.csv', index_col=0)
target_df

In [None]:
# Uncomment and run the two lines below to use the area-normalized spectra instead of the raw spectra
# kb_area_norm = kb_area_norm.T
# kb = kb_area_norm

## Add and Select Features

In [None]:
# Derived features, added in place as new columns of kb.
# NOTE(review): the column names presumably refer to published ocean-colour
# indices; the comments below describe only the arithmetic — confirm the
# intended definitions.
_r444, _r487, _r532 = kb[444], kb[487], kb[532]
_r665, _r667, _r678, _r709 = kb[665], kb[667], kb[678], kb[709]

# Normalised difference, then plain difference, of the 678 and 667 columns.
kb['kbbi'] = (_r678 - _r667) / (_r678 + _r667)
kb['rbd'] = _r678 - _r667
# Value at 487 minus the straight line through the 444 and 532 values,
# evaluated at 487.
kb['ss_490'] = (_r487 - _r444) - (_r532 - _r444) * (487 - 444) / (532 - 444)
# Linear transform of the 555 column.
kb['bbp_555'] = 2.058 * kb[555] - 0.00182
# Normalised difference of the 709 and 665 columns.
kb['ndci'] = (_r709 - _r665) / (_r709 + _r665)
# Attach the target; pandas aligns this assignment on the index labels.
kb['kb_cellcount'] = target_df['CellsperL']

In [None]:
# Display the table, now including the derived feature columns and the target.
kb

In [None]:
# Pairwise correlation between all columns of kb, target included
# (DataFrame.corr uses Pearson correlation by default).
corr = kb.corr()
# fig, ax = plt.subplots()
# sns.heatmap(corr, vmax=1, vmin=-1, annot=True, cmap='PiYG', mask=np.triu(corr)).set(title="Pearson Correlation Features with KB")
corr

In [None]:
# Labels of the columns whose absolute correlation with the target is below 0.5.
# Columns with an undefined (NaN) correlation are not selected here.
# NOTE(review): the correlations come from the full table, i.e. before the
# train/test split, so held-out samples influence which features are kept.
irelevant_features = corr.index[corr['kb_cellcount'].abs() < 0.5]

In [None]:
# Keep only the columns that passed the correlation filter (plus the target).
kb_refined = kb.drop(columns=irelevant_features)

In [None]:
# Correlation matrix restricted to the retained columns.
corr_ = kb_refined.corr()
# fig, ax = plt.subplots()
# Number of retained features (the target column is not counted).
print(kb_refined.shape[1] - 1)
# sns.heatmap(corr, vmax=1, vmin=-1, annot=True, cmap='PiYG', mask=np.triu(corr)).set(title="Pearson Correlation Features with KB")

In [None]:
# Show the labels of the retained columns (features followed by the target).
print(kb_refined.columns.tolist())

## Prepare Feature and Target Vectors

In [None]:
# Feature matrix: every retained column except the target.
X = kb_refined.drop(columns=['kb_cellcount']).to_numpy()
# Take the target from the same frame as X so that row i of X and y[i] always
# describe the same sample. Reading it from target_df directly would depend on
# target_df having exactly the same rows in the same order as kb_refined.
y = kb_refined['kb_cellcount'].to_numpy()

In [None]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

In [None]:
# Log-transform the target as log(1 + y). np.log1p is the exact inverse of the
# np.expm1 call used later to map predictions back, and is accurate for small y.
# NOTE: this cell overwrites y, so running it twice applies the transform twice.
y = np.log1p(y)
y.shape

## Stratified Train-Test Split

In [None]:
# binning for stratified sampling
# Use only the interior quartile edges (25 %, 50 %, 75 %). With right=True,
# np.digitize returns 0 for values <= the first edge, so also passing the 0 %
# edge (the minimum) would put the samples equal to the minimum into a fifth
# class of their own, which can be too small for a stratified split.
bins = np.quantile(y, [0.25, 0.5, 0.75])
# Resulting labels: 0 (<= Q1), 1 (Q1, Q2], 2 (Q2, Q3], 3 (> Q3).
y_binned = np.digitize(y, bins, right=True)

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.model_selection` is available as an attribute.
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the binned target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y_binned, random_state=777
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Initialize and Fit Model

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.svm` is available as an attribute.
from sklearn.svm import SVR

# Support vector regressor with an RBF kernel.
# NOTE(review): the features are passed to the model unscaled — confirm that
# this is intended.
model = SVR(kernel='rbf', C=10, gamma='scale', epsilon=0.1)

In [None]:
# Fit the regressor on the training split (the target is on the log scale).
model.fit(X_train, y_train)

## Get Predictions

In [None]:
# Predict the (log-scale) target for the held-out test split.
y_pred = model.predict(X_test)

In [None]:
# Show observed and predicted values side by side (both on the log scale).
y_test, y_pred

In [None]:
# Map predictions and observations back to the original scale:
# expm1(v) = exp(v) - 1 inverts the log(y + 1) transform applied to the target.
y_pred_linear, y_test_linear = np.expm1(y_pred), np.expm1(y_test)

In [None]:
# Show predicted and observed values on the original (back-transformed) scale.
y_pred_linear, y_test_linear

## Save Model to Disk

In [None]:
# Serialise the fitted model to 'svr_model.pkl' in the working directory.
# NOTE: unpickling can execute arbitrary code — only load pickle files that
# come from a trusted source.
with open('svr_model.pkl', 'wb') as f:
    pickle.dump(model, f)

## Model Validation and Assessment

In [None]:
# Reset matplotlib's rc parameters to its defaults, then layer the seaborn
# theme on top. The call order matters: each call overrides settings made by
# the previous ones.
plt.rcdefaults()
# Default seaborn theme, with the figure resolution raised to 300 dpi.
sns.set_theme(rc={"figure.dpi": 300})
# "poster" context for element scaling.
sns.set_context("poster")
# "ticks" axes style.
sns.set_style(style="ticks")

def set_font_size(fontsize):
    """Apply one font size to all text elements configured through ``plt.rc``.

    Sets the base font size, the axes title and label sizes, the x and y tick
    label sizes, and the legend font size to ``fontsize``.
    """
    rc_targets = (
        ('font', 'size'),
        ('axes', 'titlesize'),
        ('axes', 'labelsize'),
        ('xtick', 'labelsize'),
        ('ytick', 'labelsize'),
        ('legend', 'fontsize'),
    )
    for group, option in rc_targets:
        plt.rc(group, **{option: fontsize})

In [None]:
# Override line, axes-frame and tick widths, and draw ticks pointing outwards,
# on top of whatever rc settings are currently active.
plt.rcParams.update({
    'lines.linewidth': 1.5,
    'axes.linewidth': 1.0,
    'xtick.major.width': 1.0,
    'ytick.major.width': 1.0,
    'xtick.minor.width': 0.8,
    'ytick.minor.width': 0.8,
    'xtick.direction': 'out',
    'ytick.direction': 'out',
})

In [None]:
set_font_size(12)
# Observed vs. modelled values on the original scale, drawn with the project
# helper `utils.get_validation_plot`.
# NOTE(review): the constant 5 is added to both axes — presumably so that zero
# counts stay plottable with log_norm=True; confirm.
utils.get_validation_plot(
    x=y_test_linear + 5,
    y=y_pred_linear + 5,
    color_val="black",
    metrics=['r2', 'mae', 'nrmse', 'bias', 'mdsa'],
    log_norm=True,
    trendline=False,
    xlabel_val="Observed $K. brev$ Cells/L",
    ylabel_val="Modeled $K. brev$ Cells/L",
    marker_border="white",
    marker_size=55,
)

In [None]:
set_font_size(12)
# Same validation plot as in the preceding cell, with the same arguments.
# NOTE(review): presumably kept so the plot can be regenerated after switching
# to the area-normalised spectra — confirm, or remove the duplicate.
utils.get_validation_plot(
    x=y_test_linear + 5,
    y=y_pred_linear + 5,
    color_val="black",
    metrics=['r2', 'mae', 'nrmse', 'bias', 'mdsa'],
    log_norm=True,
    trendline=False,
    xlabel_val="Observed $K. brev$ Cells/L",
    ylabel_val="Modeled $K. brev$ Cells/L",
    marker_border="white",
    marker_size=55,
)

## Plot Feature Importances

In [None]:
# Feature names as strings, in the column order of X.
# Derived by dropping the target by name — the same way X was built — instead
# of assuming the target is the last column.
features = [str(column) for column in kb_refined.columns.drop('kb_cellcount')]

In [None]:
from sklearn.inspection import permutation_importance

import matplotlib.pyplot as plt
import numpy as np

set_font_size(16)

# Compute permutation importance on test set
# With scoring='neg_root_mean_squared_error' the importance of a feature is the
# drop in negative RMSE when that feature is shuffled, i.e. the increase in RMSE.
result = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=30,
    random_state=42,
    scoring='neg_root_mean_squared_error',
)

# Rank features from most to least important and keep the top N.
sorted_idx = result.importances_mean.argsort()[::-1]
top_n = 15  # Top N features to display
top_features = np.array(features)[sorted_idx][:top_n]
top_importances = result.importances_mean[sorted_idx][:top_n]

# Plot
plt.figure(figsize=(10, 6))
# barh draws from the bottom up, so reverse to put the top feature first.
plt.barh(top_features[::-1], top_importances[::-1])
# The importance is an increase in error, not a decrease.
plt.xlabel("Mean increase in RMSE")
plt.title("Permutation Feature Importance (SVR) - Rrs")
plt.tight_layout()
plt.show()

In [None]:
# Feature names as strings, in the column order of X.
# Derived by dropping the target by name — the same way X was built — instead
# of assuming the target is the last column.
features = [str(column) for column in kb_refined.columns.drop('kb_cellcount')]

In [None]:
from sklearn.inspection import permutation_importance

import matplotlib.pyplot as plt
import numpy as np

set_font_size(16)

# Compute permutation importance on test set
# With scoring='neg_root_mean_squared_error' the importance of a feature is the
# drop in negative RMSE when that feature is shuffled, i.e. the increase in RMSE.
result = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=30,
    random_state=42,
    scoring='neg_root_mean_squared_error',
)

# Rank features from most to least important and keep the top N.
sorted_idx = result.importances_mean.argsort()[::-1]
top_n = 15  # Top N features to display
top_features = np.array(features)[sorted_idx][:top_n]
top_importances = result.importances_mean[sorted_idx][:top_n]

# Plot
plt.figure(figsize=(10, 6))
# barh draws from the bottom up, so reverse to put the top feature first.
plt.barh(top_features[::-1], top_importances[::-1])
# The importance is an increase in error, not a decrease.
plt.xlabel("Mean increase in RMSE")
plt.title("Permutation Feature Importance (SVR) - Area Normalized Spectra")
plt.tight_layout()
plt.show()