In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.svm import SVR

import src.functions as func
from src.functions import load_data, scale, baseline_model_fit_save, bootstrap_evaluate, compute_confidence_interval, plot_metrics_boxplot

In [2]:
# CSV locations: the development set is used for fitting, the validation set for evaluation.
dev_data_path = "data/assignment1_dev_set.csv"
val_data_path = "data/assignment1_val_set.csv"

# Read each file into a (features, target) pair, development set first.
(X_train, y_train), (X_test, y_test) = (
    load_data(csv_path) for csv_path in (dev_data_path, val_data_path)
)

# Replace both feature sets with their scaled counterparts.
# NOTE(review): presumably `scale` fits on the training features only - confirm in src.functions.
X_train, X_test = scale(X_train, X_test)

In [None]:
# Baseline regressors to compare, keyed by the label used in log lines and in the plot.
models = dict(
    ElasticNet=ElasticNet(alpha=0.1, l1_ratio=0.5),
    SVR=SVR(C=1.0, epsilon=0.2),
    BayesianRidge=BayesianRidge(),
)

# One bootstrap-metrics table per model, collected for the combined boxplot.
evaluation_results = []

# NOTE(review): the saved output of this cell ends in
# "TypeError: list indices must be integers or slices, not str", and no CI lines
# were printed; the failing statement cannot be identified from the notebook
# alone - re-run on a fresh kernel and confirm.
for name, model in models.items():
    # Fit on the development data (the helper name suggests it also persists the model).
    baseline = baseline_model_fit_save(model, name, X_train=X_train, y_train=y_train)

    print(f"Bootstrapping evaluation for {name} on evaluation data...")
    # NOTE(review): `model` (not the returned `baseline`) is evaluated here - confirm
    # that baseline_model_fit_save fits `model` in place.
    eval_metrics = bootstrap_evaluate(model, X_test, y_test, n_bootstraps=100)

    # Compute all three intervals first, then report them in a fixed order.
    ci_by_metric = {
        metric: compute_confidence_interval(eval_metrics[metric])
        for metric in ("RMSE", "MAE", "R2")
    }
    rmse_ci, mae_ci, r2_ci = ci_by_metric.values()
    for metric, interval in ci_by_metric.items():
        print(f"95% CI for {metric}:", interval)

    # Tag the metrics with the model label so the tables can be told apart once combined.
    eval_metrics["model"] = name
    evaluation_results.append(eval_metrics)

# Stack the per-model tables into a single frame with a fresh 0..n-1 index.
combined_eval_df = pd.concat(evaluation_results, ignore_index=True)

# Compare all models side by side and write the figure to disk.
plot_metrics_boxplot(
    combined_eval_df,
    save_path="plots/baseline_evaluation_metrics_boxplot.png",
)

Bootstrapping evaluation for ElasticNet on evaluation data...
Bootstrapping evaluation for SVR on evaluation data...
Bootstrapping evaluation for BayesianRidge on evaluation data...


TypeError: list indices must be integers or slices, not str

In [None]:
# Show the concatenated evaluation metrics for all baseline models
# (built with pd.concat from `evaluation_results` in the evaluation cell).
combined_eval_df

Unnamed: 0,RMSE,MAE,R2,model
0,5.254602,3.047898,-0.743526,ElasticNet
1,4.321801,2.779842,-0.433706,ElasticNet
2,5.045784,3.021127,-0.793157,ElasticNet
3,4.748118,2.970512,-0.298143,ElasticNet
4,4.371578,2.712898,-0.191556,ElasticNet
...,...,...,...,...
295,4.340079,2.902725,0.042512,BayesianRidge
296,3.533568,2.305625,0.102017,BayesianRidge
297,3.627750,2.530260,0.070799,BayesianRidge
298,3.541614,2.328161,0.123179,BayesianRidge


In [None]:

# Step 2: Feature selection
# NOTE(review): the validation set is passed into feature_selection - confirm it is
# only used to score the selection (fs_rmse) and not to choose the features.
fs_rmse, selected_features, fs_model = func.feature_selection(
    X_train, y_train, X_test, y_test
)
print(f"Feature selection RMSE: {fs_rmse:.4f}")
print(f"Selected {len(selected_features)} features: {selected_features.tolist()}")

# Step 3: Model tuning (restricted to the selected feature columns)
best_model, best_params = func.tune_model(X_train, y_train, selected_features)

# Evaluate final model on test set
X_test_selected = X_test[selected_features]
y_pred = best_model.predict(X_test_selected)
# RMSE is the square root of the mean squared error exposed by src.functions.
final_rmse = np.sqrt(func.mean_squared_error(y_test, y_pred))

print(f"Final tuned model RMSE: {final_rmse:.4f}")
print(f"Best parameters: {best_params}")

# Summary of all stages. `baseline_rmse` is not assigned by any cell in this
# notebook, so it is reported only when an earlier step actually produced it;
# otherwise a fresh "Restart & Run All" would stop here with a NameError.
# TODO: compute baseline_rmse explicitly in the baseline step.
summary = {}
if "baseline_rmse" in globals():
    summary["baseline_rmse"] = baseline_rmse
summary.update({
    'fs_rmse': fs_rmse,
    'final_rmse': final_rmse,
    'selected_features': selected_features,
    'best_model': best_model,
    'best_params': best_params
})
print(summary)

Feature selection RMSE: 3.9530
Selected 6 features: ['Alistipes putredinis', 'Desulfonispora thiosulfatigenes', 'Ruminococcus champanellensis', 'Sporobacter termitidis', 'Clostridium clariflavum', 'Eubacterium sulci']
Final tuned model RMSE: 3.8996
Best parameters: {'model__alpha': 1, 'model__l1_ratio': np.float64(0.1)}
{'baseline_rmse': np.float64(3.952695856129381), 'fs_rmse': np.float64(3.952984246703387), 'final_rmse': np.float64(3.899607160214836), 'selected_features': Index(['Alistipes putredinis', 'Desulfonispora thiosulfatigenes',
       'Ruminococcus champanellensis', 'Sporobacter termitidis',
       'Clostridium clariflavum', 'Eubacterium sulci'],
      dtype='object'), 'best_model': Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 ElasticNet(alpha=1, l1_ratio=np.float64(0.1),
                            random_state=42))]), 'best_params': {'model__alpha': 1, 'model__l1_ratio': np.float64(0.1)}}
