In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Step 1: Generate a dataset with 7 highly correlated features
np.random.seed(42)

n_samples = 1000
X_base = np.random.rand(n_samples)

# Create 7 highly correlated features
X1 = X_base + np.random.normal(0, 0.01, n_samples)
X2 = X_base * 0.8 + np.random.normal(0, 0.01, n_samples)
X3 = X_base * 1.2 + np.random.normal(0, 0.01, n_samples)
X4 = X_base + 0.5 * X1 + np.random.normal(0, 0.01, n_samples)
X5 = X_base * 0.7 + np.random.normal(0, 0.01, n_samples)
X6 = X_base + 0.9 * X1 + np.random.normal(0, 0.01, n_samples)
X7 = X_base * 1.1 + np.random.normal(0, 0.01, n_samples)

# Combine into a dataset
X = np.column_stack([X1, X2, X3, X4, X5, X6, X7])
y = 3 * X_base + 2 * X1 + np.random.normal(0, 0.05, n_samples)  # Target variable

# Step 2: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Step 4: Train Ridge Regression using different values of alpha (regularization strength)
alphas = [1e-15, 1e-10, 1e-5, 1e-3, 0, 1, 10, 20]
best_r2 = -np.inf
best_alpha = None

for alpha in alphas:
    ridge_reg = Ridge(alpha=alpha)
    ridge_reg.fit(X_train, y_train)
    
    # Predict and calculate R2 score
    y_pred = ridge_reg.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Alpha: {alpha}, R2 Score: {r2}")
    
    if r2 > best_r2:
        best_r2 = r2
        best_alpha = alpha

# Step 5: Output the best alpha and corresponding R2 score
print(f"\nBest Alpha: {best_alpha}")
print(f"Best R2 Score: {best_r2}")


Alpha: 1e-15, R2 Score: 0.9986334852655715
Alpha: 1e-10, R2 Score: 0.9986334852655776
Alpha: 1e-05, R2 Score: 0.9986334858794664
Alpha: 0.001, R2 Score: 0.998633546191726
Alpha: 0, R2 Score: 0.9986334852655715
Alpha: 1, R2 Score: 0.9986169295857686
Alpha: 10, R2 Score: 0.9985480091267668
Alpha: 20, R2 Score: 0.9985286352639124

Best Alpha: 0.001
Best R2 Score: 0.998633546191726


In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Step 1: Generate a dataset with 7 highly correlated features
np.random.seed(42)

n_samples = 1000
X_base = np.random.rand(n_samples)

# Create 7 highly correlated features
X1 = X_base + np.random.normal(0, 0.01, n_samples)
X2 = X_base * 0.8 + np.random.normal(0, 0.01, n_samples)
X3 = X_base * 1.2 + np.random.normal(0, 0.01, n_samples)
X4 = X_base + 0.5 * X1 + np.random.normal(0, 0.01, n_samples)
X5 = X_base * 0.7 + np.random.normal(0, 0.01, n_samples)
X6 = X_base + 0.9 * X1 + np.random.normal(0, 0.01, n_samples)
X7 = X_base * 1.1 + np.random.normal(0, 0.01, n_samples)

# Combine into a dataset
X = np.column_stack([X1, X2, X3, X4, X5, X6, X7])
X

array([[0.37631713, 0.28556892, 0.45819331, ..., 0.26496892, 0.70090013,
        0.42401345],
       [0.93736086, 0.75974039, 1.13435952, ..., 0.66841565, 1.78749417,
        1.04587362],
       [0.73579592, 0.57054795, 0.86636072, ..., 0.49548904, 1.39878351,
        0.80577198],
       ...,
       [0.13644826, 0.10703993, 0.15189792, ..., 0.0922133 , 0.26286243,
        0.14031179],
       [0.94594433, 0.75957224, 1.11952894, ..., 0.66545805, 1.80982828,
        1.04298338],
       [0.43908156, 0.36159903, 0.53434623, ..., 0.31918378, 0.83953769,
        0.48120619]])