# Import libraries and open data

In [56]:
from default import *
%cd -q {PROJECT_HOME}

from dataset import FixedPrattTrussDatasetSingleTarget
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import numpy as np

# Dataset directories, given as relative paths (resolved against the working
# directory selected by the `%cd` line above).
train_dataset_dir, test_dataset_dir = (
    "data/dataset/pratt_truss_bridge_isostatic/0_uniform_ea",
    "data/dataset/pratt_truss_bridge_isostatic/test",
)

In [57]:
def convert_dataset(filepath):
    """Read a dataset file and return its first two sample fields as arrays.

    The file is opened with ``FixedPrattTrussDatasetSingleTarget`` and iterated
    once. Every sample must unpack into exactly five fields; only the first
    (collected into ``X``) and the second (collected into ``y``) are kept.

    Parameters
    ----------
    filepath
        Location of the dataset file, forwarded unchanged to the dataset class.

    Returns
    -------
    tuple
        ``(X, y)`` where each is ``np.array`` of the collected fields, in
        dataset iteration order.
    """
    dataset = FixedPrattTrussDatasetSingleTarget(filepath)

    # One pass over the dataset; the last three fields of each sample are dropped.
    kept = [(first, second) for first, second, _, _, _ in dataset]

    X = np.array([first for first, _ in kept])
    y = np.array([second for _, second in kept])
    return X, y


# Training arrays come from the training directory; the arrays named
# "validation" are read from the test directory (uniform_8192.hdf5).
X_train, y_train = convert_dataset(train_dataset_dir + "/train_4096.hdf5")
X_validation, y_validation = convert_dataset(test_dataset_dir + "/uniform_8192.hdf5")

# Linear regression

In [62]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, make_scorer
from scipy.stats import loguniform, uniform
import numpy as np

# Pipeline: min-max scaling -> PCA -> ordinary least squares.
# The n_components given here is only the starting value; the search below
# samples its own values for it.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("pca", PCA(n_components=0.95)),
    ("regression", LinearRegression()),
])

# Search space: PCA n_components sampled uniformly from [0, 1]
param_distributions = {
    "pca__n_components": uniform(loc=0, scale=1),
}

# Successive-halving random search, candidates ranked by negated MAPE
search = HalvingRandomSearchCV(
    pipeline,
    param_distributions,
    n_candidates=1000,
    factor=3,
    cv=10,
    scoring="neg_mean_absolute_percentage_error",
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training arrays
search.fit(X_train, y_train)

# Keep the winning pipeline
best_pipeline = search.best_estimator_

# Predictions on both splits
y_train_pred = best_pipeline.predict(X_train)
y_val_pred = best_pipeline.predict(X_validation)

# Evaluation helper
def compute_metrics(y_true, y_pred):
    """Compute R2, MSE, RMSE and MAPE for a set of predictions.

    Parameters
    ----------
    y_true
        Reference targets; must support multiplication by a float.
    y_pred
        Predicted targets, same requirements as ``y_true``.

    Returns
    -------
    tuple
        ``(r2, mse, rmse, mape)``. R2 and MAPE use the values as given. MSE is
        computed after both inputs are multiplied by 1e-6, and RMSE is the
        square root of that MSE, so both are expressed in the scaled unit
        (the report labels them MN^2 and MN; assumes targets are in N - TODO confirm).
        MAPE is returned as a fraction, not a percentage.
    """
    to_mega = 1e-6  # scale factor applied before the squared-error metric

    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true * to_mega, y_pred * to_mega)
    return r2, mse, np.sqrt(mse), mean_absolute_percentage_error(y_true, y_pred)

# Score the best pipeline on the training and validation arrays
r2_train, mse_train, rmse_train, mape_train = compute_metrics(y_train, y_train_pred)
r2_val, mse_val, rmse_val, mape_val = compute_metrics(y_validation, y_val_pred)

# Report the selected hyperparameters
print("=== BEST HYPERPARAMETERS ===")
print(search.best_params_)

# MSE is a squared quantity (MN^2); RMSE is its square root, so it is in MN.
# MAPE comes back as a fraction and is formatted as a percentage here.
print("\n=== TRAIN SET ===")
print(f"R2     : {r2_train:.4f}")
print(f"MSE    : {mse_train:.4f} MN^2")
print(f"RMSE   : {rmse_train:.4f} MN")
print(f"MAPE   : {mape_train:.4%}")

print("\n=== VALIDATION SET ===")
print(f"R2     : {r2_val:.4f}")
print(f"MSE    : {mse_val:.4f} MN^2")
print(f"RMSE   : {rmse_val:.4f} MN")
print(f"MAPE   : {mape_val:.4%}")


=== BEST HYPERPARAMETERS ===
{'pca__n_components': 0.7607850486168974}

=== TRAIN SET ===
R2     : 0.3280
MSE    : 5856027.5000 MN^2
RMSE   : 2419.9231 MN^2
MAPE   : 98.8529%

=== VALIDATION SET ===
R2     : 0.3188
MSE    : 5933212.0000 MN^2
RMSE   : 2435.8186 MN
MAPE   : 99.9747%


# Ridge regression

In [64]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, make_scorer
from scipy.stats import loguniform, uniform
import numpy as np

# Pipeline: min-max scaling -> PCA -> ridge regression.
# The n_components given here is only the starting value; the search below
# samples its own values for it.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("pca", PCA(n_components=0.95)),
    ("regression", Ridge()),
])

# Search space: PCA n_components uniform on [0, 1], ridge alpha log-uniform
# between 1e-6 and 1e4
param_distributions = {
    "pca__n_components": uniform(loc=0, scale=1),
    "regression__alpha": loguniform(1e-6, 1e4),
}

# Successive-halving random search, candidates ranked by negated MAPE
search = HalvingRandomSearchCV(
    pipeline,
    param_distributions,
    n_candidates=1000,
    factor=3,
    cv=10,
    scoring="neg_mean_absolute_percentage_error",
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training arrays
search.fit(X_train, y_train)

# Keep the winning pipeline
best_pipeline = search.best_estimator_

# Predictions on both splits
y_train_pred = best_pipeline.predict(X_train)
y_val_pred = best_pipeline.predict(X_validation)

# Evaluation helper
def compute_metrics(y_true, y_pred):
    """Compute R2, MSE, RMSE and MAPE for a set of predictions.

    Parameters
    ----------
    y_true
        Reference targets; must support multiplication by a float.
    y_pred
        Predicted targets, same requirements as ``y_true``.

    Returns
    -------
    tuple
        ``(r2, mse, rmse, mape)``. R2 and MAPE use the values as given. MSE is
        computed after both inputs are multiplied by 1e-6, and RMSE is the
        square root of that MSE, so both are expressed in the scaled unit
        (the report labels them MN^2 and MN; assumes targets are in N - TODO confirm).
        MAPE is returned as a fraction, not a percentage.
    """
    to_mega = 1e-6  # scale factor applied before the squared-error metric

    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true * to_mega, y_pred * to_mega)
    return r2, mse, np.sqrt(mse), mean_absolute_percentage_error(y_true, y_pred)

# Score the best pipeline on the training and validation arrays
r2_train, mse_train, rmse_train, mape_train = compute_metrics(y_train, y_train_pred)
r2_val, mse_val, rmse_val, mape_val = compute_metrics(y_validation, y_val_pred)

# Report the selected hyperparameters
print("=== BEST HYPERPARAMETERS ===")
print(search.best_params_)

# MSE is a squared quantity (MN^2); RMSE is its square root, so it is in MN.
# MAPE comes back as a fraction and is formatted as a percentage here.
print("\n=== TRAIN SET ===")
print(f"R2     : {r2_train:.4f}")
print(f"MSE    : {mse_train:.4f} MN^2")
print(f"RMSE   : {rmse_train:.4f} MN")
print(f"MAPE   : {mape_train:.4%}")

print("\n=== VALIDATION SET ===")
print(f"R2     : {r2_val:.4f}")
print(f"MSE    : {mse_val:.4f} MN^2")
print(f"RMSE   : {rmse_val:.4f} MN")
print(f"MAPE   : {mape_val:.4%}")

=== BEST HYPERPARAMETERS ===
{'pca__n_components': 0.9446142194740232, 'regression__alpha': 6.799551741277519}

=== TRAIN SET ===
R2     : 0.3348
MSE    : 5797111.5000 MN^2
RMSE   : 2407.7192 MN^2
MAPE   : 98.4369%

=== VALIDATION SET ===
R2     : 0.3255
MSE    : 5874749.0000 MN^2
RMSE   : 2423.7881 MN
MAPE   : 99.9182%


# Lasso regression

In [65]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from scipy.stats import loguniform, uniform
import numpy as np

# Pipeline: min-max scaling -> PCA -> Lasso regression.
# The n_components given here is only the starting value; the search below
# samples its own values for it.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("pca", PCA(n_components=0.95)),
    ("regression", Lasso(max_iter=10000)),
])

# Search space: PCA n_components uniform on [0, 1], Lasso alpha log-uniform
# between 1e-6 and 1e4. Both distributions come from scipy.stats.
# NOTE(review): the recorded output of this cell shows repeated warnings from
# the coordinate-descent solver; presumably some sampled alphas do not converge
# within max_iter - confirm before relying on the ranking.
param_distributions = {
    "pca__n_components": uniform(loc=0, scale=1),
    "regression__alpha": loguniform(1e-6, 1e4),
}

# Successive-halving random search, candidates ranked by negated MAPE
search = HalvingRandomSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    scoring="neg_mean_absolute_percentage_error",
    n_candidates=1000,
    factor=3,
    random_state=42,
    cv=10,
    n_jobs=-1,
)

# Run the search on the training arrays
search.fit(X_train, y_train)

# Keep the winning pipeline
best_pipeline = search.best_estimator_

# Predictions on both splits
y_train_pred = best_pipeline.predict(X_train)
y_val_pred = best_pipeline.predict(X_validation)

# Evaluation helper
def compute_metrics(y_true, y_pred):
    """Compute R2, MSE, RMSE and MAPE for a set of predictions.

    Parameters
    ----------
    y_true
        Reference targets; must support multiplication by a float.
    y_pred
        Predicted targets, same requirements as ``y_true``.

    Returns
    -------
    tuple
        ``(r2, mse, rmse, mape)``. R2 and MAPE use the values as given. MSE is
        computed after both inputs are multiplied by 1e-6, and RMSE is the
        square root of that MSE, so both are expressed in the scaled unit
        (the report labels them MN² and MN; assumes targets are in N - TODO confirm).
        MAPE is returned as a fraction, not a percentage.
    """
    to_mega = 1e-6  # scale factor applied before the squared-error metric

    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true * to_mega, y_pred * to_mega)
    return r2, mse, np.sqrt(mse), mean_absolute_percentage_error(y_true, y_pred)

# Score the best pipeline on the training and validation arrays
r2_train, mse_train, rmse_train, mape_train = compute_metrics(y_train, y_train_pred)
r2_val, mse_val, rmse_val, mape_val = compute_metrics(y_validation, y_val_pred)

# Report the selected hyperparameters
print("=== BEST HYPERPARAMETERS ===", search.best_params_, sep="\n")

# One print call per section; each item lands on its own line.
# MAPE comes back as a fraction and is formatted as a percentage here.
print(
    "\n=== TRAIN SET ===",
    f"R2     : {r2_train:.4f}",
    f"MSE    : {mse_train:.4f} MN²",
    f"RMSE   : {rmse_train:.4f} MN",
    f"MAPE   : {mape_train:.4%}",
    sep="\n",
)

print(
    "\n=== VALIDATION SET ===",
    f"R2     : {r2_val:.4f}",
    f"MSE    : {mse_val:.4f} MN²",
    f"RMSE   : {rmse_val:.4f} MN",
    f"MAPE   : {mape_val:.4%}",
    sep="\n",
)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

=== BEST HYPERPARAMETERS ===
{'pca__n_components': 0.7506147516408583, 'regression__alpha': 117.04352492209394}

=== TRAIN SET ===
R2     : 0.3280
MSE    : 5856028.0000 MN²
RMSE   : 2419.9231 MN
MAPE   : 98.8529%

=== VALIDATION SET ===
R2     : 0.3188
MSE    : 5933212.5000 MN²
RMSE   : 2435.8186 MN
MAPE   : 99.9748%
