In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score,root_mean_squared_error
from sklearn.model_selection import train_test_split

Ridge with strong multicollinearity (2 features)

In [2]:
# Load the two-feature dataset and separate predictors (x1, x2) from the target.
d = pd.read_csv('ridge_correlated_150.csv')
x, y = d[['x1', 'x2']], d['y']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Candidate penalties: 50 log-spaced values from 1e-3 to 1e3.
alphas = np.logspace(-3, 3, 50)

# RidgeCV picks the best alpha from the grid by cross-validation on the training set.
ridge = RidgeCV(alphas=alphas).fit(x_train, y_train)
y_pred = ridge.predict(x_test)

# Report the chosen penalty, the fitted parameters, and the held-out metrics.
for label, value in (
    ("Best alpha:", ridge.alpha_),
    ("Coefficients:", ridge.coef_),
    ("Intercept:", ridge.intercept_),
    ("R² :", r2_score(y_test, y_pred)),
    ("RMSE :", root_mean_squared_error(y_test, y_pred)),
):
    print(label, value)


Best alpha: 0.0071968567300115215
Coefficients: [ 4.41273599 -3.41175106]
Intercept: 0.012383721734327238
R² : 0.6538727524922596
RMSE : 0.5705925764633385


Ridge with 10 moderately collinear features

In [3]:
# Load the 10-feature dataset; every column except "y" is a predictor.
a = pd.read_csv('ridge_10feat_150.csv')
X, y = a.drop(columns="y"), a["y"]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Same penalty grid as the two-feature experiment: 50 log-spaced values in [1e-3, 1e3].
alphas = np.logspace(-3, 3, 50)

# Fit ridge regression, letting RidgeCV choose alpha from the grid.
ridge = RidgeCV(alphas=alphas).fit(X_train, y_train)
y_pred = ridge.predict(X_test)

# Summarise the selected model and its performance on the held-out rows.
for label, value in (
    ("Best alpha:", ridge.alpha_),
    ("Coefficients:", ridge.coef_),
    ("Intercept:", ridge.intercept_),
    ("R² :", r2_score(y_test, y_pred)),
    ("RMSE :", root_mean_squared_error(y_test, y_pred)),
):
    print(label, value)

Best alpha: 0.655128556859551
Coefficients: [ 2.30095156 -1.28258227  0.28615561  1.55443374 -0.18857766 -0.72907171
 -0.19564402  0.05193988  0.01304106  0.05181519]
Intercept: -0.07437205435647587
R² : 0.9881868768074025
RMSE : 0.6057776327747615


Lasso on high-dimensional sparse data (30 features)


In [4]:
# Load the sparse dataset; every column except "y" is a candidate predictor.
b = pd.read_csv('lasso_sparse_150.csv')
X, y = b.drop(columns="y"), b["y"]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# LassoCV tunes alpha with 5-fold cross-validation on the training set.
lasso = LassoCV(cv=5, random_state=42).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

# Features the lasso kept: columns whose fitted coefficient is not exactly zero.
selected = X.columns[lasso.coef_ != 0].tolist()

# Report the chosen penalty, the surviving features, and the held-out metrics.
for label, value in (
    ("Best alpha:", lasso.alpha_),
    ("Selected features:", selected),
    ("Intercept:", lasso.intercept_),
    ("R² :", r2_score(y_test, y_pred)),
    ("RMSE :", root_mean_squared_error(y_test, y_pred)),
):
    print(label, value)

Best alpha: 0.06662744526920758
Selected features: ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x11', 'x12', 'x13', 'x16', 'x18', 'x21', 'x23', 'x24', 'x26', 'x28', 'x29']
Intercept: -0.07713865817077871
R² : 0.9620885670252837
RMSE : 1.186075160912456


Lasso with grouped/overlapping signals

In [5]:
# Load the grouped-signal dataset; every column except 'y' is a predictor.
c = pd.read_csv('lasso_groups_150.csv')
X, y = c.drop(columns='y'), c['y']

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# LassoCV tunes alpha with 5-fold cross-validation on the training set.
lasso = LassoCV(cv=5, random_state=42).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

# Print the full coefficient vector so zeroed-out features are visible.
for label, value in (
    ("Best alpha:", lasso.alpha_),
    ("Coefficients:", lasso.coef_),
    ("Intercept:", lasso.intercept_),
    ("R² :", r2_score(y_test, y_pred)),
    ("RMSE :", root_mean_squared_error(y_test, y_pred)),
):
    print(label, value)

Best alpha: 0.003608591334120754
Coefficients: [ 2.04028628 -0.          1.60923693  0.         -0.16289269]
Intercept: -0.06746553791233204
R² : 0.9139898313782668
RMSE : 0.9098913642117108


Quadratic model vs linear baseline

In [10]:
# Compare a straight-line fit against a degree-2 polynomial fit on the same
# single-feature dataset, using the same 80/20 train/test split for both.
df=pd.read_csv('poly_quadratic_150.csv')
X = df[["x"]]  # double brackets keep X two-dimensional (a one-column DataFrame)
y = df["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression (baseline)
lin = LinearRegression()
lin.fit(X_train, y_train)
y_pred_lin = lin.predict(X_test)
print("Linear R²:", r2_score(y_test, y_pred_lin))
print("Linear RMSE:", root_mean_squared_error(y_test, y_pred_lin))

# Polynomial (degree=2)
# include_bias=False leaves the constant term to LinearRegression's intercept,
# so the expanded design matrix holds only x and x^2.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly_train = poly.fit_transform(X_train)  # learn the expansion on training data only
X_poly_test = poly.transform(X_test)        # reuse the fitted expansion on the test data

poly_model = LinearRegression()
poly_model.fit(X_poly_train, y_train)
y_pred_poly = poly_model.predict(X_poly_test)

print("Polynomial coefficients:", poly_model.coef_)
print("Polynomial intercept:", poly_model.intercept_)
print("Polynomial R²:", r2_score(y_test, y_pred_poly))
print("Polynomial RMSE:", root_mean_squared_error(y_test, y_pred_poly))

# Predict y at x = 1.5
# `poly` was fitted on a DataFrame, so it remembers the column names. Passing a
# bare ndarray here makes scikit-learn warn "X does not have valid feature
# names"; building the query point with the training columns avoids that while
# producing the same prediction.
x_new = pd.DataFrame([[1.5]], columns=X_train.columns)
x_new_poly = poly.transform(x_new)
print("Prediction at x=1.5:", poly_model.predict(x_new_poly))

Linear R²: 0.8904817310737732
Linear RMSE: 1.7497017806063875
Polynomial coefficients: [ 3.04036622 -0.70463416]
Polynomial intercept: 1.9542801042051434
Polynomial R²: 0.975208521028767
Polynomial RMSE: 0.832476469024934
Prediction at x=1.5: [4.92940258]


