In [2]:
import numpy as np
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# --- Load the pre-split regression arrays --------------------------------
print("Loading data...")
X_train, X_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in (
        '../data/regression/X_train.npy',
        '../data/regression/X_test.npy',
        '../data/regression/y_train.npy',
        '../data/regression/y_test.npy',
    )
)

# Targets stored with more than one dimension (e.g. as a column vector) are
# flattened to 1-D; arrays that are already 1-D are left untouched.
y_train = y_train.ravel() if y_train.ndim > 1 else y_train
y_test = y_test.ravel() if y_test.ndim > 1 else y_test

print(f"Training data shape: X={X_train.shape}, y={y_train.shape}")
print(f"Test data shape: X={X_test.shape}, y={y_test.shape}")
# Columns divided by rows of X_train: a quick view of how many features there
# are per training sample.
print(f"Features/Samples ratio: {X_train.shape[1]/X_train.shape[0]:.2f}")

# --- Method 1: ridge regression -------------------------------------------
print("\n")
print("METHOD 1: RIDGE REGRESSION")

# The scaler is a pipeline step, so during the grid search it is fitted on the
# training part of each CV split only.
ridge_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
])

# Candidate L2 penalty strengths, one per decade from 1e-3 to 1e3.
param_grid_ridge = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_ridge = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid_ridge,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_ridge.fit(X_train, y_train)

print(f"Best alpha: {grid_ridge.best_params_['ridge__alpha']}")
print(f"Best CV score: {grid_ridge.best_score_:.4f}")

# best_estimator_ is the pipeline refitted on all of X_train with the winning
# alpha; score it on the training data and on the held-out test data.
best_ridge = grid_ridge.best_estimator_
train_score_ridge, test_score_ridge = (
    r2_score(y_true, best_ridge.predict(X))
    for X, y_true in ((X_train, y_train), (X_test, y_test))
)

print(f"Ridge - Train R²: {train_score_ridge:.4f}")
print(f"Ridge - Test R²: {test_score_ridge:.4f}")

# --- Method 2: lasso regression -------------------------------------------
print("\n")
print("METHOD 2: LASSO REGRESSION")

# Same layout as the ridge pipeline: standardise, then fit the L1-penalised
# model. max_iter is set to 2000 for the Lasso solver.
lasso_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lasso', Lasso(max_iter=2000)),
])

# Candidate L1 penalty strengths, one per decade from 1e-4 to 10.
param_grid_lasso = {'lasso__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]}

grid_lasso = GridSearchCV(
    estimator=lasso_pipeline,
    param_grid=param_grid_lasso,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_lasso.fit(X_train, y_train)

print(f"Best alpha: {grid_lasso.best_params_['lasso__alpha']}")
print(f"Best CV score: {grid_lasso.best_score_:.4f}")

# Score the refitted best pipeline on the training and the held-out test data.
best_lasso = grid_lasso.best_estimator_
train_score_lasso, test_score_lasso = (
    r2_score(y_true, best_lasso.predict(X))
    for X, y_true in ((X_train, y_train), (X_test, y_test))
)

# A feature counts as "selected" when the magnitude of its coefficient
# exceeds 1e-5.
lasso_coef = best_lasso.named_steps['lasso'].coef_
n_selected = (np.abs(lasso_coef) > 1e-5).sum()

print(f"Lasso - Train R²: {train_score_lasso:.4f}")
print(f"Lasso - Test R²: {test_score_lasso:.4f}")
print(f"Features selected: {n_selected}/{X_train.shape[1]}")

# --- Method 3: random forest ----------------------------------------------
print("\n")
print("METHOD 3: RANDOM FOREST")

# 2 * 3 * 2 * 2 * 2 = 48 candidate settings, each evaluated on 5 CV folds.
param_grid_rf = {
    'n_estimators': [200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [5, 10],
    'max_features': ['sqrt', 'log2']
}

# Parallelise at one level only. GridSearchCV(n_jobs=-1) already spreads the
# candidate/fold fits over every core; if each forest also asked for all cores
# (n_jobs=-1), every search worker could start its own full-size thread pool
# and oversubscribe the CPU. With random_state fixed, the fitted forest does
# not depend on n_jobs, so the scores are unaffected by this change.
rf = RandomForestRegressor(random_state=42, n_jobs=1)
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='r2', n_jobs=-1)
grid_rf.fit(X_train, y_train)

print(f"Best parameters: {grid_rf.best_params_}")
print(f"Best CV score: {grid_rf.best_score_:.4f}")

# best_estimator_ is the forest refitted on all of X_train with the winning
# parameters. The search is over, so let it use every core again when
# predicting (n_jobs only affects speed, not the predictions).
best_rf = grid_rf.best_estimator_
best_rf.set_params(n_jobs=-1)
train_score_rf = r2_score(y_train, best_rf.predict(X_train))
test_score_rf = r2_score(y_test, best_rf.predict(X_test))

print(f"Random Forest - Train R²: {train_score_rf:.4f}")
print(f"Random Forest - Test R²: {test_score_rf:.4f}")

# --- Final comparison -------------------------------------------------------
print("\n")
print("FINAL RESULTS SUMMARY")
print(f"Ridge Regression Test R²: {test_score_ridge:.4f}")
print(f"Lasso Regression Test R²: {test_score_lasso:.4f}")
print(f"Random Forest Test R²: {test_score_rf:.4f}")

# Highest held-out score among the three models (kept for reference only).
best_score = max(test_score_ridge, test_score_lasso, test_score_rf)
print(f"\nBest Test R²: {best_score:.4f}")

# The test set must not be used to choose the model: taking the maximum of
# several test scores gives an optimistically biased estimate. The model is
# therefore chosen by its cross-validated R² (computed on training data only),
# and the target is judged on that model's test score.
TARGET_R2 = 0.88
model_results = {
    "Ridge Regression": (grid_ridge.best_score_, test_score_ridge),
    "Lasso Regression": (grid_lasso.best_score_, test_score_lasso),
    "Random Forest": (grid_rf.best_score_, test_score_rf),
}
selected_model = max(model_results, key=lambda name: model_results[name][0])
selected_cv_score, selected_test_score = model_results[selected_model]

print(f"Selected by CV R²: {selected_model} "
      f"(CV {selected_cv_score:.4f}, Test {selected_test_score:.4f})")
print(f"Target R² > {TARGET_R2}: {'ON A VALIDE' if selected_test_score > TARGET_R2 else 'PAS VALIDE'}")

Loading data...
Training data shape: X=(200, 200), y=(200,)
Test data shape: X=(200, 200), y=(200,)
Features/Samples ratio: 1.00


METHOD 1: RIDGE REGRESSION
Best alpha: 10
Best CV score: 0.5702
Ridge - Train R²: 0.9897
Ridge - Test R²: 0.7153


METHOD 2: LASSO REGRESSION
Best alpha: 0.01
Best CV score: 0.9165
Lasso - Train R²: 0.9751
Lasso - Test R²: 0.9196
Features selected: 75/200


METHOD 3: RANDOM FOREST
Best parameters: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 300}
Best CV score: 0.0724
Random Forest - Train R²: 0.7128
Random Forest - Test R²: 0.1663


FINAL RESULTS SUMMARY
Ridge Regression Test R²: 0.7153
Lasso Regression Test R²: 0.9196
Random Forest Test R²: 0.1663

Best Test R²: 0.9196
Target R² > 0.88: ON A VALIDE
