# Example 2: High-throughput field phenotyping using hyperspectral reflectance and partial least squares regression (PLSR) reveals genetic modifications to photosynthetic capacity

In the [original article](https://www.sciencedirect.com/science/article/pii/S0034425719301804), the authors built
models to predict photosynthesis from hyperspectral reflectance. Here, we use their raw data to reproduce one of these models: a partial least squares regression (PLSR) that predicts the `NitrogenWeight` column from the measured spectra.

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.cm import get_cmap
import seaborn as sns


%matplotlib notebook
plt.style.use("seaborn-notebook")
sns.set_theme(style="ticks", context="notebook", palette="muted")

In [None]:
# Imports for modelling:
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Load the raw data from "model1.csv" (path relative to the working directory).
# The bare `data` expression on the last line makes the notebook display the table.
data = pd.read_csv("model1.csv")
data

In [None]:
# Columns holding the spectra contain "Wave_" in their name; the number after
# the underscore is read as the wavelength in nm:
wavelengths = [column for column in data.columns if "Wave_" in column]
wavelength_nm = [int(column.split("_")[1]) for column in wavelengths]
# One row per measured sample, one column per wavelength:
spectra = data[wavelengths].to_numpy()
print("Wavelengths", len(wavelength_nm))
print("Spectra", spectra.shape)

In [None]:
# Plot all raw (not yet corrected) measured spectra, one line per sample:
fig, axi = plt.subplots(constrained_layout=True)
# `matplotlib.cm.get_cmap` is deprecated (and removed in Matplotlib 3.9);
# `plt.get_cmap` is the supported way to look up a colormap by name.
cmap = plt.get_cmap("viridis")
# Spread the samples evenly over the colormap so each spectrum gets its own color:
colors = cmap(np.linspace(0, 1, len(spectra)))
for spec, color in zip(spectra, colors):
    axi.plot(wavelength_nm, spec, color=color)
axi.set(xlabel="Wavelength (nm)", ylabel="Intensity")
sns.despine(fig=fig)

In [None]:
# Preprocessing:
def msc_correct(spectra, mean_spectrum=None):
    """Apply multiplicative scatter correction (MSC) to measured spectra.

    Each spectrum is fitted against a reference spectrum with a first-degree
    least-squares polynomial, ``spectrum ≈ slope * reference + intercept``,
    and is then corrected as ``(spectrum - intercept) / slope``.

    Parameters
    ----------
    spectra : array-like, shape (n_spectra, n_wavelengths)
        The measured spectra, one spectrum per row.
    mean_spectrum : array-like, shape (n_wavelengths,), optional
        Reference spectrum to correct against. If ``None`` (the default),
        the mean over all rows of ``spectra`` is used. Pass the reference
        returned by an earlier call to correct new spectra in the same way.

    Returns
    -------
    mean_spectrum : array-like
        The reference spectrum that was used (the given one, unchanged, or
        the computed mean).
    msc : numpy.ndarray
        The corrected spectra, one per row.
    """
    # Accept any array-like (e.g. nested lists), not only NumPy arrays:
    spectra = np.asarray(spectra)
    if mean_spectrum is None:
        mean_spectrum = np.mean(spectra, axis=0)
    corrected = []
    for spectrum in spectra:
        # np.polyfit returns the highest-degree coefficient first:
        slope, intercept = np.polyfit(mean_spectrum, spectrum, 1)
        corrected.append((spectrum - intercept) / slope)
    return mean_spectrum, np.array(corrected)


# Correct all measured spectra against their common mean spectrum.
# NOTE(review): the reference spectrum is computed from every sample, i.e.
# before the train/test split made further down — confirm this is intended.
mean_spectrum, msc = msc_correct(spectra)

In [None]:
# Plot all corrected measured spectra, one line per sample:
fig, axi = plt.subplots(constrained_layout=True)
# `matplotlib.cm.get_cmap` is deprecated (and removed in Matplotlib 3.9);
# `plt.get_cmap` is the supported way to look up a colormap by name.
cmap = plt.get_cmap("viridis")
# Spread the samples evenly over the colormap so each spectrum gets its own color:
colors = cmap(np.linspace(0, 1, len(msc)))
for spec, color in zip(msc, colors):
    axi.plot(wavelength_nm, spec, color=color)
axi.set(xlabel="Wavelength (nm)", ylabel="Intensity")
sns.despine(fig=fig)

In [None]:
# Define X and y: We are going to predict y (the "NitrogenWeight" column)
# from X (the scatter-corrected spectra, one row per sample):
y = data["NitrogenWeight"].to_numpy()
X = msc

In [None]:
# Set up for making the model by creating a training set and test set.
# A fraction of 0.33 of the samples is held out as the test set; the samples
# are shuffled before splitting, and the fixed random_state makes the split
# the same every time the cell is run:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=4,
)

In [None]:
def make_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training set and score it on both data sets.

    The model is fitted with ``model.fit(X_train, y_train)`` and then used to
    predict both the training and the test set. The coefficient of
    determination (R²) is computed and printed for each set.

    Returns a dictionary with the keys:

    - ``"y_hat_train"`` / ``"y_hat_test"``: flattened (1-D) copies of the
      predictions for the training / test set,
    - ``"r2_train"`` / ``"r2_test"``: the corresponding R² scores.
    """
    model.fit(X_train, y_train)

    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    # `flatten` always returns a new 1-D array, so the stored predictions are
    # independent copies of what the model returned:
    results = {
        "y_hat_train": np.asarray(predicted_train).flatten(),
        "y_hat_test": np.asarray(predicted_test).flatten(),
    }

    # R² on the data the model was fitted to:
    results["r2_train"] = r2_score(y_train, predicted_train)
    print(f"R² (training): {results['r2_train']}")

    # R² on the held-out data:
    results["r2_test"] = r2_score(y_test, predicted_test)
    print(f"\nR² (test): {results['r2_test']}")
    return results

In [None]:
# Least squares regression on the spectra, fitted without an intercept term
# (fit_intercept=False):
linear = LinearRegression(fit_intercept=False)
results_linear = make_model(linear, X_train, y_train, X_test, y_test)
# Tag the results with the name of the model:
results_linear["name"] = "Least squares"

In [None]:
# Partial least squares regression with 6 components; scale=True asks the
# model to scale the data before fitting:
pls_model = PLSRegression(n_components=6, scale=True)
results_pls = make_model(pls_model, X_train, y_train, X_test, y_test)
# Tag the results with the name of the model:
results_pls["name"] = "PLS"

In [None]:
# To visualize the performance of the models, plot what they predict and compare
# that with the true values (left panel: training set, right panel: test set):
fig, (ax1, ax2) = plt.subplots(
    ncols=2, sharex=True, sharey=True, constrained_layout=True
)

# Add the data to both panels first; the axis limits are shared between the
# panels and are only read once all points are in place.
panels = (
    (ax1, "Train", "train", y_train),
    (ax2, "Test", "test", y_test),
)
for panel, title, split, y_true in panels:
    panel.set_title(title, loc="left")
    r2_linear = results_linear[f"r2_{split}"]
    r2_pls = results_pls[f"r2_{split}"]
    # Least squares is drawn first and PLS second in both panels, so each
    # model gets the same color in both:
    panel.scatter(
        y_true,
        results_linear[f"y_hat_{split}"],
        label=f'Least squares\n(r² = {r2_linear:.2f})',
    )
    panel.scatter(
        y_true,
        results_pls[f"y_hat_{split}"],
        label=f'PLS (r² = {r2_pls:.2f})',
    )

for axi in (ax1, ax2):
    axi.set(xlabel="Measured mass-% (y)", ylabel="Predicted mass-% (ŷ)")
    axi.set_aspect("equal")
    # Add extra x=y to help reading; it spans the widest range covered by
    # either axis, and both axes are then set to exactly that range:
    limits = np.array([axi.get_xlim(), axi.get_ylim()])
    lim_min, lim_max = limits.min(), limits.max()
    diagonal = [lim_min, lim_max]
    [line] = axi.plot(diagonal, diagonal, ls=":", color="k")
    axi.set_xlim(lim_min, lim_max)
    axi.set_ylim(lim_min, lim_max)
    axi.legend(labelspacing=1.0)
sns.despine(fig=fig)