# In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from joblib import Parallel, delayed
import os
from scipy.stats import pearsonr

# ----------------------------
# Proximal operator for MCP
# ----------------------------
def prox_mcp(v, alpha, a=2.0):
    """
    Proximal operator for MCP regularization (firm thresholding).

    Applied element-wise to ``v``:

    * ``|v| <= alpha``             -> ``0``
    * ``alpha < |v| <= a * alpha`` -> ``sign(v) * (|v| - alpha) / (1 - 1/a)``
    * ``|v| > a * alpha``          -> ``v`` (left unchanged)

    Args:
        v: Tensor to shrink.
        alpha: Threshold; corresponds to the lambda * step_size term.
        a: MCP parameter. Must be greater than 1, otherwise the scale
            ``1 - 1/a`` of the intermediate branch is zero or negative.

    Returns:
        A new tensor shaped like ``v``; ``v`` itself is not modified.

    Raises:
        ValueError: If ``a <= 1``.
    """
    if a <= 1:
        raise ValueError(f"MCP parameter a must be greater than 1, got {a}")

    abs_v = torch.abs(v)

    # Shrinkage for the intermediate band, computed for every element and
    # selected below. torch.where keeps the whole operator vectorized: no
    # boolean-mask indexing and no data-dependent Python branch.
    shrunk = torch.sign(v) * (abs_v - alpha) / (1 - 1 / a)

    # Elements at or below the threshold collapse to zero. NaN entries fail
    # both comparisons and therefore also map to zero.
    inner = torch.where(abs_v > alpha, shrunk, torch.zeros_like(v))

    # Large coefficients are passed through untouched.
    return torch.where(abs_v > a * alpha, v, inner)

# ----------------------------
# Adaptive proximal gradient optimizer (based on Adam)
# ----------------------------
class ProxGEN(optim.Optimizer):
    """
    Adaptive proximal gradient optimizer (based on Adam) for MCP regularization.

    Every step keeps exponential moving averages of the gradient and of the
    squared gradient, takes the step ``-lr * exp_avg / (sqrt(exp_avg_sq) + eps)``
    and then passes the result through ``prox_mcp`` with threshold ``lam * lr``.
    The proximal step is applied to every parameter that has a gradient.

    Unlike standard Adam, no bias correction is applied: the per-parameter
    ``"step"`` counter is stored in the optimizer state but is not used in
    the update.

    Args:
        params: Iterable of parameters or parameter-group dicts.
        lr: Step size (>= 0).
        lam: MCP regularization strength (>= 0).
        betas: Decay rates for the first and second moment averages, each in
            ``[0, 1)``.
        eps: Constant added to the denominator (>= 0).
        mcp_a: MCP parameter ``a`` (> 1).

    Raises:
        ValueError: If any hyperparameter is outside its valid range.
    """

    def __init__(self, params, lr=1e-3, lam=1e-6, betas=(0.1, 0.999), eps=1e-8, mcp_a=2.0):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        if lam < 0.0:
            raise ValueError(f"Invalid regularization strength lam: {lam}")
        if eps < 0.0:
            raise ValueError(f"Invalid epsilon value: {eps}")
        for index, beta in enumerate(betas):
            if not 0.0 <= beta < 1.0:
                raise ValueError(f"Invalid beta parameter at index {index}: {beta}")
        if mcp_a <= 1.0:
            raise ValueError(f"Invalid MCP parameter mcp_a (must be > 1): {mcp_a}")

        defaults = dict(lr=lr, lam=lam, betas=betas, eps=eps, mcp_a=mcp_a)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """
        Perform a single optimization step.

        Args:
            closure: Optional callable that re-evaluates the model and
                returns the loss; it is run with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.

        Raises:
            RuntimeError: If a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            # Group-level constants are read once per group.
            lr = group["lr"]
            eps = group["eps"]
            beta1, beta2 = group["betas"]
            mcp_a = group["mcp_a"]
            # The alpha term for the proximal operator
            alpha_prox = group["lam"] * lr

            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError("ProxGEN does not support sparse gradients")

                state = self.state[p]
                if not state:
                    state["step"] = 0
                    state["exp_avg"] = torch.zeros_like(p)
                    state["exp_avg_sq"] = torch.zeros_like(p)

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

                state["step"] += 1
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = exp_avg_sq.sqrt().add_(eps)

                # Adam-like update (out of place, so p is still the old iterate)
                theta_hat = p.addcdiv(exp_avg, denom, value=-lr)

                # Proximal step for MCP regularization. The result is copied
                # into the existing parameter tensor rather than rebinding
                # p.data, so the parameter keeps its storage.
                p.copy_(prox_mcp(theta_hat, alpha_prox, mcp_a))

        return loss

# ----------------------------
# Simple Linear Model
# ----------------------------
class LinearModel(nn.Module):
    """
    Single affine layer applied to flattened inputs.

    Args:
        input_dim: Number of features per sample after flattening.
        output_dim: Number of outputs per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Flatten everything but the first dimension and apply the linear layer."""
        # reshape behaves like view for contiguous tensors but also accepts
        # non-contiguous inputs (e.g. transposed batches), where view raises.
        flat = x.reshape(x.size(0), -1)
        return self.linear(flat)

# ----------------------------
# Metrics Calculation
# ----------------------------
def calculate_sparsity(model):
    """
    Return the percentage of exactly-zero entries in the model's weights.

    Only parameters with more than one dimension are counted, so
    one-dimensional parameters such as bias vectors are ignored. When the
    model has no such parameter the result is 0.0.
    """
    matrices = [p.data for p in model.parameters() if p.dim() > 1]

    entry_count = sum(m.numel() for m in matrices)
    if not entry_count:
        return 0.0

    nonzero_count = sum(torch.count_nonzero(m).item() for m in matrices)
    return (1 - (nonzero_count / entry_count)) * 100

def calculate_correlations(labels, predictions):
    """
    Return one Pearson correlation coefficient per column (trait).

    Column ``i`` of ``labels`` is correlated with column ``i`` of
    ``predictions``; the p-value reported by ``pearsonr`` is discarded.
    The number of columns is taken from ``labels``.
    """
    return [
        pearsonr(labels[:, trait], predictions[:, trait])[0]
        for trait in range(labels.shape[1])
    ]

# ----------------------------
# Load Data
# ----------------------------
data = pd.read_csv("/content/drive/My Drive/Adaptive gradient method/Adaptive gradient method from L_0 to L infnity/Final_pine_data.csv")

# The first 7 columns are used as targets, every remaining column as a feature.
Y = data.iloc[:, :7].values
X_raw = data.iloc[:, 7:].values

# Split BEFORE fitting the scaler: fitting StandardScaler on the full matrix
# would let the mean/variance of the held-out test rows leak into training.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X_raw, Y, test_size=0.2, random_state=42
)

# Standardize with statistics estimated on the train+val rows only.
# NOTE: the cross-validation folds drawn from X_trainval still share these
# statistics; a per-fold scaler would be needed to remove that as well.
scaler = StandardScaler()
X_trainval = scaler.fit_transform(X_trainval)
X_test = scaler.transform(X_test)

# Full feature matrix on the same scale, kept under its original name.
X = scaler.transform(X_raw)

input_dim = X.shape[1]
num_tasks = Y.shape[1]

# ----------------------------
# Training + CV Evaluation
# ----------------------------
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

def train_and_eval(X_train, Y_train, X_val, Y_val, params, num_epochs=30):
    """
    Train a LinearModel with ProxGEN on one split and return its validation MSE.

    Args:
        X_train: Training features (anything ``torch.tensor`` accepts).
        Y_train: Training targets, two-dimensional (samples, tasks).
        X_val: Validation features.
        Y_val: Validation targets.
        params: Mapping with the keys ``"batch_size"``, ``"lr"`` and ``"lam"``.
        num_epochs: Number of passes over the training split.

    Returns:
        Mean squared error of the validation predictions, as computed by
        ``mean_squared_error``.
    """
    x_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(Y_train, dtype=torch.float32)
    x_val = torch.tensor(X_val, dtype=torch.float32)
    y_val = torch.tensor(Y_val, dtype=torch.float32)

    train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(x_train, y_train),
        batch_size=int(params["batch_size"]),
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(x_val, y_val),
        batch_size=128,
        shuffle=False,
    )

    # Size the model from the split itself instead of module-level globals,
    # so the function does not depend on how the caller named its data.
    # LinearModel flattens each sample, hence the product of trailing dims.
    in_features = x_train.shape[1:].numel()
    out_features = y_train.shape[1]

    model = LinearModel(in_features, out_features).to(device)
    optimizer = ProxGEN(model.parameters(), lr=params["lr"], lam=params["lam"])
    criterion = nn.MSELoss()

    # Nothing switches the model to eval mode during training, so train mode
    # only needs to be set once.
    model.train()
    for _ in range(num_epochs):
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()

    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds.append(model(xb).cpu().numpy())
            labels.append(yb.cpu().numpy())

    return mean_squared_error(np.vstack(labels), np.vstack(preds))

def objective(params):
    """
    Hyperopt objective: mean validation MSE over a 5-fold split of train+val.

    The folds come from a shuffled ``KFold`` with a fixed seed and are
    evaluated through joblib with ``n_jobs=-1``; every fold receives the same
    ``params``.

    Returns:
        Dict with the mean fold MSE under ``"loss"`` and ``STATUS_OK`` under
        ``"status"``.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    run_fold = delayed(train_and_eval)

    fold_jobs = (
        run_fold(
            X_trainval[fit_rows], Y_trainval[fit_rows],
            X_trainval[held_out_rows], Y_trainval[held_out_rows],
            params,
        )
        for fit_rows, held_out_rows in splitter.split(X_trainval)
    )
    fold_mses = Parallel(n_jobs=-1)(fold_jobs)

    return {"loss": np.mean(fold_mses), "status": STATUS_OK}

# ----------------------------
# Bayesian Optimization
# ----------------------------
# Candidate batch sizes, defined once so the search space and the decoding of
# the result below cannot drift apart.
BATCH_SIZE_CHOICES = [32, 64, 128]

search_space = {
    "lr": hp.loguniform("lr", np.log(1e-4), np.log(1e-2)),
    "lam": hp.loguniform("lam", np.log(1e-3), np.log(1e+2)),
    "batch_size": hp.choice("batch_size", BATCH_SIZE_CHOICES),
}

trials = Trials()
best = fmin(fn=objective, space=search_space, algo=tpe.suggest,
            max_evals=50, trials=trials, rstate=np.random.default_rng(42))

# For hp.choice parameters `best` holds the INDEX of the selected option,
# not the option itself, so it has to be mapped back to a batch size.
best_batch_size = BATCH_SIZE_CHOICES[best["batch_size"]]

print("Best hyperparameters:", best)
print("Best batch size:", best_batch_size)

# ----------------------------
# Final Training on train+val
# ----------------------------
# Fresh model and optimizer configured with the hyperparameters found by the search.
final_model = LinearModel(input_dim, num_tasks).to(device)
final_optimizer = ProxGEN(
    final_model.parameters(),
    lr=best["lr"],
    lam=best["lam"],
)
criterion = nn.MSELoss()

# The whole train+val partition is used for the final fit.
trainval_ds = torch.utils.data.TensorDataset(
    torch.tensor(X_trainval, dtype=torch.float32),
    torch.tensor(Y_trainval, dtype=torch.float32),
)
trainval_loader = torch.utils.data.DataLoader(
    trainval_ds,
    # `best` stores the index of the chosen batch-size option, not its value.
    batch_size=[32, 64, 128][best["batch_size"]],
    shuffle=True,
)

# Train mode is set once up front; nothing switches it off during the loop.
final_model.train()
for epoch in range(50):
    for xb, yb in trainval_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        # Clear the gradients left over from the previous mini-batch.
        final_optimizer.zero_grad()
        pred = final_model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        final_optimizer.step()

# ----------------------------
# Test Evaluation and Metrics Calculation
# ----------------------------
test_ds = torch.utils.data.TensorDataset(
    torch.tensor(X_test, dtype=torch.float32),
    torch.tensor(Y_test, dtype=torch.float32),
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_ds,
    batch_size=128,
    shuffle=False,
)

# Collect predictions and targets batch by batch, without tracking gradients.
final_model.eval()
preds, labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        pred = final_model(xb)
        preds.append(pred.cpu().numpy())
        labels.append(yb.cpu().numpy())
preds = np.vstack(preds)
labels = np.vstack(labels)

# Aggregate and per-trait regression metrics on the held-out test rows.
total_mse = mean_squared_error(labels, preds)
r2_per_trait = r2_score(labels, preds, multioutput="raw_values")
mse_per_trait = mean_squared_error(labels, preds, multioutput='raw_values')

print("\n--- Final Test Evaluation ---")
print(f"Total Test MSE: {total_mse:.4f}")
print("R2 per trait:", r2_per_trait)
print("Mean Test MSE for each trait:", mse_per_trait)

# Pearson correlation between targets and predictions, one value per trait.
correlations = calculate_correlations(labels, preds)
print("Pearson correlation coefficient per trait:", correlations)

# Share of exactly-zero weights left in the trained model.
sparsity = calculate_sparsity(final_model)
print(f"Model sparsity: {sparsity:.2f}%")