In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
plt.style.use('seaborn-talk')
%matplotlib notebook

In [None]:
# Read the data set. Each row contains one measured spectrum together with
# the corresponding octane number.
data = pd.read_csv('Data/gasoline.csv')

# Response column(s): the octane number.
yvars = ['octane']
# Every other column is one channel of the measured spectrum.
xvars = [column for column in data.columns if column not in yvars]


def _wavelength_of(column):
    """Return the integer that follows the '.' in the first
    whitespace-separated token of a column name."""
    first_token = column.split()[0]
    return int(first_token.split('.')[1])


wavelengths = list(map(_wavelength_of, xvars))
print(f'Number of wavelengths measured: {len(xvars)}')
data.describe()

In [None]:
# Let us visualize the spectra, just to see what we have to work with:
spec = data[xvars].to_numpy()
octane = data['octane'].to_numpy()

# Plot the first few spectra as flat ribbons in a 3D axes: wavelength along x,
# spectrum number along y and the measured value along z.
figi = plt.figure(constrained_layout=True)
axi = figi.add_subplot(projection='3d')
axi.set_title('NIR spectra')

n_shown = 10  # number of spectra to draw
spacing = 5   # distance along y between the start of consecutive ribbons
ribbon = 4    # width of one ribbon along y (smaller than spacing -> visible gap)
# One common color scale for all ribbons, computed once from the full data:
color_limits = dict(vmin=spec.min(), vmax=spec.max())
# The wavelength coordinates are identical for every ribbon, so build them once.
# The mesh arrays get their own names so they do not shadow the X / Y data
# matrices used by the regression cells.
wl_grid = np.column_stack((wavelengths, wavelengths))
yticks = []
yticks_labels = []
for i, speci in enumerate(spec[:n_shown]):
    offset = spacing * i
    yticks.append(offset)
    yticks_labels.append(i + 1)  # label spectra starting from 1
    # The two columns are the two edges of the ribbon along y:
    pos_grid = np.full_like(wl_grid, offset)
    pos_grid[:, 1] += ribbon
    # Same spectrum on both edges, so the ribbon is flat across its width:
    height_grid = np.column_stack((speci, speci))
    axi.plot_surface(
        wl_grid, pos_grid, height_grid,
        rstride=1, cstride=1, cmap='Spectral', lw=1, **color_limits,
    )
axi.grid(False)
# Use xaxis/yaxis directly: the w_xaxis/w_yaxis aliases are gone in newer
# Matplotlib releases, while xaxis/yaxis refer to the same 3D axis objects.
axi.xaxis.pane.set_visible(False)
axi.yaxis.pane.set_visible(False)
axi.set_yticks(yticks[::2])
axi.set_yticklabels(yticks_labels[::2])
axi.set_xlabel('Wavelength (nm)', labelpad=10)
axi.set_ylabel('Spectrum no.', labelpad=10)

 
# Compare the two extreme samples: the spectra belonging to the lowest and
# to the highest octane number.
idxmin, idxmax = np.argmin(octane), np.argmax(octane)
figj, axj = plt.subplots(constrained_layout=True)
axj.set_title('Highest and lowest octane numbers')
axj.set_xlabel('Wavelength (nm)')
for sample in (idxmin, idxmax):
    axj.plot(wavelengths, spec[sample], label=f'Octane: {octane[sample]}')
axj.legend()
# Histogram of the octane numbers, to see which values we are working with:
figk, axk = plt.subplots(constrained_layout=True)
axk.set_title('Octane numbers')
axk.hist(octane);

In [None]:
from sklearn.preprocessing import scale
# Standardize the spectra (X) and the octane numbers (Y) column by column,
# using all rows of the data set.
X, Y = (scale(data[columns]) for columns in (xvars, yvars))

In [None]:
# Create a PLS model to relate the octane number to the NIR spectra:
from sklearn.cross_decomposition import PLSRegression
# scale=False because X and Y have already been standardized.
pls = PLSRegression(n_components=5, scale=False)
pls.fit(X, Y)
Y_hat = pls.predict(X)
# Store the coefficients as (n_features, n_targets) so that B_PLS[:, 0] is the
# coefficient of every wavelength for the first target. scikit-learn 1.3
# changed coef_ from (n_features, n_targets) to (n_targets, n_features), so
# transpose whenever the first axis does not match the number of features.
B_PLS = pls.coef_
if B_PLS.shape[0] != X.shape[1]:
    B_PLS = B_PLS.T

In [None]:
# Ordinary least squares on the same X and Y, for comparison with PLS:
from sklearn.linear_model import LinearRegression
# No intercept term is fitted (fit_intercept=False).
linear = LinearRegression(fit_intercept=False).fit(X, Y)
Y_hat_lin = linear.predict(X)
# Transposed so that the first axis can be indexed the same way as B_PLS.
B_MLR = np.transpose(linear.coef_)

In [None]:
from sklearn.metrics import r2_score

# Predicted versus actual Y for both models, side by side on shared axes.
figi, (axi, axj) = plt.subplots(
    nrows=1, ncols=2, sharex=True, sharey=True, constrained_layout=True,
)
for panel, heading, prediction in (
    (axi, 'PLS', Y_hat),
    (axj, 'Least squares', Y_hat_lin),
):
    panel.set_title(heading)
    panel.scatter(Y, prediction, label=f'r² = {r2_score(Y, prediction):6.4f}')
    panel.set(xlabel='y', ylabel='ŷ')
    panel.legend()

In [None]:
# Regression coefficients as a function of wavelength, one panel per model,
# each with a dotted reference line at zero.
figi, (axi, axj) = plt.subplots(nrows=2, ncols=1, constrained_layout=True)
for panel, heading, coefficients in (
    (axi, 'PLS coefficients', B_PLS),
    (axj, 'Least squares coefficients', B_MLR),
):
    panel.set_title(heading)
    panel.plot(wavelengths, coefficients[:, 0])
    panel.axhline(y=0.0, ls=':', color='k')
    panel.set(xlabel='Wavelength (nm)', ylabel='Coeff.')

In [None]:
# Try it all again with a training and test set:
from sklearn.model_selection import train_test_split
# Fix the seed so the same 70/30 split (and therefore the same scores and
# figures) is obtained every time the notebook is re-run from a fresh kernel.
# NOTE: X and Y were standardized using all rows before this split, so the
# test rows have contributed to the scaling statistics.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0,
)

In [None]:
# Refit the five-component PLS model on the training rows only, then
# predict both the training and the test rows.
pls = PLSRegression(n_components=5, scale=False).fit(X_train, Y_train)
Y_hat_train, Y_hat_test = (pls.predict(rows) for rows in (X_train, X_test))

In [None]:
# Refit the least-squares model (no intercept) on the training rows only,
# then predict both the training and the test rows.
linear = LinearRegression(fit_intercept=False).fit(X_train, Y_train)
Y_hat_lin_train, Y_hat_lin_test = (
    linear.predict(rows) for rows in (X_train, X_test)
)

In [None]:
# Predicted versus actual Y on the training and test rows, one panel per model.
figi, (axi, axj) = plt.subplots(
    nrows=1, ncols=2, sharex=True, sharey=True, constrained_layout=True,
)
for panel, heading, fitted_train, fitted_test in (
    (axi, 'PLS', Y_hat_train, Y_hat_test),
    (axj, 'Least squares', Y_hat_lin_train, Y_hat_lin_test),
):
    train_label = f'Train, r² = {r2_score(Y_train, fitted_train):6.4f}'
    test_label = f'Test, r² = {r2_score(Y_test, fitted_test):6.4f}'
    panel.scatter(Y_train, fitted_train, label=train_label)
    panel.scatter(Y_test, fitted_test, label=test_label, marker='X')
    panel.set_title(heading)
    panel.set(xlabel='y', ylabel='ŷ')
    panel.legend()

In [None]:
# Repeat training and testing 20 times, each time on a different random
# 70/30 split, and collect the test-set r² of both models:
n_repeats = 20
pls_test = []
ls_test = []
for i in range(n_repeats):
    # Seeding with the repetition index gives a different split in every
    # repetition while keeping the whole experiment reproducible on re-run.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=i,
    )
    # Only the test-set score is recorded, so no training predictions are made.
    pls = PLSRegression(n_components=5, scale=False)
    pls.fit(X_train, Y_train)
    pls_test.append(r2_score(Y_test, pls.predict(X_test)))
    linear = LinearRegression(fit_intercept=False)
    linear.fit(X_train, Y_train)
    ls_test.append(r2_score(Y_test, linear.predict(X_test)))
# Test r² per repetition, with horizontal lines marking the average per model:
figi, axi = plt.subplots(constrained_layout=True)
x = np.arange(n_repeats)
axi.plot(x, pls_test, marker='o', label='PLS')
axi.axhline(y=np.mean(pls_test), label='Avg. PLS', ls=':', color='k')
axi.plot(x, ls_test, marker='X', label='LS')
axi.axhline(y=np.mean(ls_test), label='Avg. LS', ls='--', color='k')
axi.set(xlabel='Test no.', ylabel='r² (test)')
axi.legend();

In [None]:
# Use training and testing for checking the performance as a function of the
# number of PLS components:
component_counts = range(1, 11)  # numbers of PLS components to try
n_repeats = 10                   # random 70/30 splits per component count
pls_test2 = []
for comp in component_counts:
    score = []
    for i in range(n_repeats):
        # Seeding with the repetition index makes the sweep reproducible and
        # evaluates every component count on the same set of splits, so the
        # differences between counts are not caused by different splits.
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.3, random_state=i,
        )
        pls = PLSRegression(n_components=comp, scale=False)
        pls.fit(X_train, Y_train)
        Y_hat_test = pls.predict(X_test)
        score.append(r2_score(Y_test, Y_hat_test))
    # One row per component count: [mean test r², standard deviation of test r²]
    pls_test2.append([np.mean(score), np.std(score)])
pls_test2 = np.array(pls_test2)
# Mean test r² with the standard deviation over the splits as error bars:
figi, axi = plt.subplots(constrained_layout=True)
axi.errorbar(component_counts, pls_test2[:, 0], yerr=pls_test2[:, 1], marker='o')
axi.set_xlabel('No. of components')
axi.set_ylabel('r² (test)');