<a href="https://colab.research.google.com/github/alfredqbit/NU-DDS-8515/blob/main/sepulvedaADDS-8515-11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Integrating CCA and PLS with Modern ML on Linnerud
This notebook:
 - Loads the Linnerud exercise-physiology dataset (two views X and Y)
 - Uses per-block standardized pipelines for CCA (cross-view structure) and PLS (prediction)
 - Cross-validates the number of components; exports figures to `Figures/` for LaTeX


In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_linnerud
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score
from sklearn.cross_decomposition import CCA, PLSRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Apply the seaborn theme first and the explicit rcParams afterwards, so the
# theme can never overwrite the figure size / DPI chosen for this notebook.
# `set_theme` is the preferred spelling; `sns.set` is only a legacy alias.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.dpi"] = 150
plt.rcParams["figure.figsize"] = (7.2, 5.2)

# Every exported figure is written below this directory (used by the LaTeX
# write-up); create it up front so savefig() never hits a missing folder.
FIG_DIR = Path("Figures")
FIG_DIR.mkdir(exist_ok=True)

def savefig(path):
    """Write the current matplotlib figure to ``FIG_DIR / path`` and close it.

    The layout is tightened before saving and the bounding box is trimmed
    with ``bbox_inches="tight"``. Because the figure is closed afterwards,
    it is not left open for later drawing calls.

    Parameters
    ----------
    path : str or path-like
        File name (relative to ``FIG_DIR``) of the image to write.
    """
    destination = FIG_DIR / path
    figure = plt.gcf()
    figure.tight_layout()
    figure.savefig(destination, bbox_inches="tight")
    plt.close(figure)

# Data: Linnerud (two-view)
- **X** — exercise: [Chins, Situps, Jumps]
- **Y** — physiology: [Weight, Waist, Pulse]

In [None]:
# Load the two Linnerud views as labelled DataFrames.
lin = load_linnerud()
X = pd.DataFrame(lin.data, columns=lin.feature_names)        # exercise view
Y = pd.DataFrame(lin.target, columns=lin.target_names)       # physiology view

# Preview the first three rows of each view.
display(X.head(3), Y.head(3))

# Correlation matrix over the concatenated views; the X_/Y_ prefixes keep the
# two blocks distinguishable on the heatmap axes.
XY = pd.concat([X.add_prefix("X_"), Y.add_prefix("Y_")], axis=1).corr()
fig, ax = plt.subplots(figsize=(7.5, 6))
sns.heatmap(
    XY,
    cmap="coolwarm",
    center=0,
    square=True,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title("Block Correlation: Linnerud Exercise (X) vs Physiology (Y)")
savefig("linnerud_block_corr.png")

Unnamed: 0,Chins,Situps,Jumps
0,5.0,162.0,60.0
1,2.0,110.0,60.0
2,12.0,101.0,101.0


Unnamed: 0,Weight,Waist,Pulse
0,191.0,36.0,50.0
1,189.0,37.0,52.0
2,193.0,38.0,58.0


# Helpers: CCA and PLS with proper per-block scaling

In [None]:
class CCAWrapper:
    """CCA that standardizes each view with its own ``StandardScaler``.

    Both scalers are fitted inside :meth:`fit`, so statistics come only from
    the data passed there; :meth:`transform` and :meth:`mean_pairwise_corr`
    reuse those fitted scalers on whatever data they are given.

    Parameters
    ----------
    n_components : int, default 1
        Number of canonical pairs, forwarded to ``CCA``.
    max_iter : int, default 5000
        Forwarded to ``CCA``.
    tol : float, default 1e-06
        Forwarded to ``CCA``.
    """

    def __init__(self, n_components=1, max_iter=5000, tol=1e-06):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol
        self.x_scaler = StandardScaler()
        self.y_scaler = StandardScaler()
        self.model = self._new_model()

    def _new_model(self):
        """Build an unfitted ``CCA`` from the stored hyper-parameters."""
        return CCA(
            n_components=self.n_components,
            max_iter=self.max_iter,
            tol=self.tol,
        )

    def fit(self, X, Y):
        """Fit both scalers and a fresh ``CCA`` on ``X``/``Y``; return ``self``."""
        x_std = self.x_scaler.fit_transform(X)
        y_std = self.y_scaler.fit_transform(Y)
        # Start from a new estimator on every call to fit.
        self.model = self._new_model()
        self.model.fit(x_std, y_std)
        return self

    def transform(self, X, Y):
        """Scale ``X``/``Y`` with the fitted scalers and return the score pair."""
        x_std = self.x_scaler.transform(X)
        y_std = self.y_scaler.transform(Y)
        x_scores, y_scores = self.model.transform(x_std, y_std)
        return x_scores, y_scores

    def mean_pairwise_corr(self, X, Y):
        """Correlate matching score columns of the two views.

        Returns
        -------
        (mean, corrs)
            ``corrs`` holds the Pearson correlation of column ``i`` of the X
            scores with column ``i`` of the Y scores; ``mean`` is their
            average.
        """
        x_scores, y_scores = self.transform(X, Y)
        n_pairs = min(x_scores.shape[1], y_scores.shape[1])
        pair_corrs = []
        for idx in range(n_pairs):
            corr_matrix = np.corrcoef(x_scores[:, idx], y_scores[:, idx])
            pair_corrs.append(corr_matrix[0, 1])
        return np.mean(pair_corrs), np.array(pair_corrs)


class PLSWithTargetScaling:
    """``PLSRegression`` with a ``StandardScaler`` on both X and Y.

    Predictions are mapped back through the Y scaler, so :meth:`predict`
    returns values in the original target units. :meth:`score` reports the
    uniform average of the per-target R^2.

    Parameters
    ----------
    n_components : int, default 1
        Number of PLS components, forwarded to ``PLSRegression``.
    """

    def __init__(self, n_components=1):
        self.n_components = n_components
        self.x_scaler = StandardScaler()
        self.y_scaler = StandardScaler()
        self.model = PLSRegression(n_components=n_components)

    def fit(self, X, Y):
        """Fit both scalers and a fresh ``PLSRegression``; return ``self``."""
        x_std = self.x_scaler.fit_transform(X)
        y_std = self.y_scaler.fit_transform(Y)
        # Start from a new estimator on every call to fit.
        self.model = PLSRegression(n_components=self.n_components)
        self.model.fit(x_std, y_std)
        return self

    def predict(self, X):
        """Predict targets for ``X`` and undo the Y standardization."""
        scaled_prediction = self.model.predict(self.x_scaler.transform(X))
        return self.y_scaler.inverse_transform(scaled_prediction)

    def score(self, X, Y):
        """Return R^2 of the predictions for ``X`` against ``Y``, averaged uniformly over targets."""
        return r2_score(Y, self.predict(X), multioutput="uniform_average")

# CCA: Cross-validate canonical correlations and visualize U1 vs V1

In [None]:
def cv_cca(X, Y, comp_grid=(1,2,3), n_splits=5, random_state=42):
    """Cross-validate held-out canonical correlations for several component counts.

    For every value in ``comp_grid`` a ``CCAWrapper`` is fitted on each
    training fold and its pairwise score correlations are evaluated on the
    matching held-out fold.

    Parameters
    ----------
    X, Y : pandas.DataFrame
        The two views; rows are selected positionally with ``.iloc``.
    comp_grid : iterable of int
        Candidate numbers of components.
    n_splits : int
        Number of shuffled ``KFold`` splits.
    random_state : int
        Seed for the ``KFold`` shuffle.

    Returns
    -------
    list of tuple
        One ``(n_components, mean_of_fold_means, mean_corr_per_pair)`` entry
        per grid value, where the last item is averaged over folds.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    summary = []
    for k in comp_grid:
        fold_means = []
        fold_vectors = []
        for train_idx, test_idx in splitter.split(X):
            fitted = CCAWrapper(n_components=k).fit(X.iloc[train_idx], Y.iloc[train_idx])
            fold_mean, fold_corrs = fitted.mean_pairwise_corr(X.iloc[test_idx], Y.iloc[test_idx])
            fold_means.append(fold_mean)
            fold_vectors.append(fold_corrs)
        # Average the scalar summary and the per-pair vector across folds.
        overall_mean = np.mean(fold_means)
        per_pair_mean = np.mean(np.vstack(fold_vectors), axis=0)
        summary.append((k, overall_mean, per_pair_mean))
    return summary

# Cross-validate CCA for K = 1..3 components.
cca_cv = cv_cca(X, Y, comp_grid=(1,2,3), n_splits=5)
# A bare expression in the middle of a cell is evaluated and thrown away;
# display() is required to actually show the CV summary here.
display(cca_cv)

# Plot mean canonical correlations by component: one bar per (K, pair index).
labels = []
vals = []
for ncomp, _, mean_vec in cca_cv:
    for i, v in enumerate(mean_vec, start=1):
        labels.append(f"K={ncomp} comp {i}")
        vals.append(v)

plt.figure()
sns.barplot(x=labels, y=vals)
plt.xticks(rotation=45, ha="right")
plt.ylabel("Held-out canonical correlation")
plt.title("CCA: Cross-validated canonical correlations")
savefig("cca_corrs.png")

# Fit best K, ranked by the held-out correlation averaged over all K pairs
# (tuple element 1 returned by cv_cca), then refit on the full data.
best_k = max(cca_cv, key=lambda t: t[1])[0]
cca_best = CCAWrapper(n_components=best_k).fit(X, Y)
U, V = cca_best.transform(X, Y)

# U1 vs V1 scatter: first canonical variate of each view on the full data.
plt.figure()
plt.scatter(U[:,0], V[:,0], alpha=0.8)
plt.xlabel("U1 (exercise)")
plt.ylabel("V1 (physiology)")
plt.title(f"CCA: U1 vs V1 (K={best_k})")
savefig("cca_u1_v1.png")

# PLS: Cross-validate n_components, then compare against Gradient Boosting

In [None]:
def cv_pls(X, Y, comp_grid=(1,2,3), n_splits=5, random_state=42):
    """Cross-validate ``PLSWithTargetScaling`` over a grid of component counts.

    Parameters
    ----------
    X, Y : pandas.DataFrame
        Predictors and targets; rows are selected positionally with ``.iloc``.
    comp_grid : iterable of int
        Candidate numbers of PLS components.
    n_splits : int
        Number of shuffled ``KFold`` splits.
    random_state : int
        Seed for the ``KFold`` shuffle.

    Returns
    -------
    pandas.DataFrame
        One row per grid value with columns ``n_components``, ``mean_R2``
        (fold-averaged uniform-average R^2) and ``per_target_mean`` (list of
        fold-averaged R^2 values, one per column of ``Y``).
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    n_targets = Y.shape[1]

    records = []
    for k in comp_grid:
        fold_r2 = []
        fold_target_r2 = []
        for train_idx, test_idx in splitter.split(X):
            X_val, Y_val = X.iloc[test_idx], Y.iloc[test_idx]
            fitted = PLSWithTargetScaling(n_components=k).fit(X.iloc[train_idx], Y.iloc[train_idx])
            Y_pred = fitted.predict(X_val)
            fold_r2.append(r2_score(Y_val, Y_pred, multioutput="uniform_average"))
            # Held-out R^2 for each target column separately.
            target_scores = []
            for j in range(n_targets):
                target_scores.append(r2_score(Y_val.iloc[:, j], Y_pred[:, j]))
            fold_target_r2.append(target_scores)
        target_means = np.mean(np.array(fold_target_r2), axis=0)
        records.append({
            "n_components": k,
            "mean_R2": float(np.mean(fold_r2)),
            "per_target_mean": list(target_means),
        })
    return pd.DataFrame(records)

# Cross-validate PLS for K = 1..3 components.
pls_cv = cv_pls(X, Y, comp_grid=(1,2,3))
# A bare expression in the middle of a cell is evaluated and thrown away;
# display() is required to actually render the CV table here.
display(pls_cv)

# Plot mean R^2 by components (plus per-target)
plt.figure()
plt.plot(pls_cv["n_components"], pls_cv["mean_R2"], marker="o", label="Mean R^2")
for j, name in enumerate(Y.columns):
    plt.plot(pls_cv["n_components"],
             [v[j] for v in pls_cv["per_target_mean"]],
             marker="o", label=f"{name} R^2")
plt.xlabel("PLS components")
plt.ylabel("Cross-validated R^2")
plt.title("PLS: CV performance vs number of components")
plt.legend()
savefig("pls_cv_r2.png")

# Train/test comparison with ensemble
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.25, random_state=42)

# NOTE(review): best_pls_k comes from pls_cv, which was computed on all rows
# of X/Y, including the rows now held out as X_te/Y_te. The test R^2 below is
# therefore not fully independent of the component selection.
best_pls_k = int(pls_cv.sort_values("mean_R2", ascending=False)["n_components"].iloc[0])
pls_best = PLSWithTargetScaling(n_components=best_pls_k).fit(X_tr, Y_tr)
Yhat_pls = pls_best.predict(X_te)
r2_pls = [r2_score(Y_te.iloc[:,j], Yhat_pls[:,j]) for j in range(Y.shape[1])]

# Baseline: one gradient-boosting regressor per target column.
gbr = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))
gbr.fit(X_tr, Y_tr)
Yhat_gbr = gbr.predict(X_te)
r2_gbr = [r2_score(Y_te.iloc[:,j], Yhat_gbr[:,j]) for j in range(Y.shape[1])]

# Barplot compare: side-by-side bars per target, offset by half a bar width.
labels = list(Y.columns)
x = np.arange(len(labels))
w = 0.35
plt.figure()
plt.bar(x - w/2, r2_pls, width=w, label=f"PLS (K={best_pls_k})")
plt.bar(x + w/2, r2_gbr, width=w, label="GradientBoosting")
# Zero line makes negative test R^2 (worse than predicting the mean) visible.
plt.axhline(0, color="black", linewidth=0.7)
plt.xticks(x, labels)
plt.ylabel("Test R^2")
plt.title("PLS vs Ensemble (test R^2 per target)")
plt.legend()
savefig("ensemble_vs_pls_r2.png")