:

In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Rebuild the feature table and label vector (carried over from Deliverable 1).
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

# Hold out one fifth of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Z-score the features. The scaler learns its mean/std from the training rows only,
# and those same statistics are then applied to the held-out test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the dimensions of both scaled matrices.
for label, scaled in (("Train shape:", X_train_scaled), ("Test shape:", X_test_scaled)):
    print(label, scaled.shape)

Train shape: (455, 30)
Test shape: (114, 30)


In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on one dataset and score its predictions on another.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place, so the caller's object is trained after the call.
    X_fit, y_fit : features and target the model is fitted on.
    X_eval, y_eval : features and target used for prediction and scoring.

    Returns
    -------
    tuple
        ``(predictions, r2, mse, rmse)`` where ``rmse`` is ``sqrt(mse)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    r2 = r2_score(y_eval, predictions)
    mse = mean_squared_error(y_eval, predictions)
    return predictions, r2, mse, np.sqrt(mse)


# Linear Regression (baseline, no regularization)
lin_reg = LinearRegression()
y_pred_lin, r2_lin, mse_lin, rmse_lin = _fit_and_score(
    lin_reg, X_train_scaled, y_train, X_test_scaled, y_test
)

print(f"Linear Regression - R2: {r2_lin:.4f}, MSE: {mse_lin:.4f}, RMSE: {rmse_lin:.4f}")

# Ridge Regression (L2 regularization, alpha=1.0)
ridge = Ridge(alpha=1.0)
y_pred_ridge, r2_ridge, mse_ridge, rmse_ridge = _fit_and_score(
    ridge, X_train_scaled, y_train, X_test_scaled, y_test
)

print(f"Ridge - R2: {r2_ridge:.4f}, MSE: {mse_ridge:.4f}, RMSE: {rmse_ridge:.4f}")

# Lasso Regression (L1 regularization, alpha=0.1 for moderate sparsity)
lasso = Lasso(alpha=0.1)
y_pred_lasso, r2_lasso, mse_lasso, rmse_lasso = _fit_and_score(
    lasso, X_train_scaled, y_train, X_test_scaled, y_test
)

print(f"Lasso - R2: {r2_lasso:.4f}, MSE: {mse_lasso:.4f}, RMSE: {rmse_lasso:.4f}")

Linear Regression - R2: 0.7271, MSE: 0.0641, RMSE: 0.2532
Ridge - R2: 0.7359, MSE: 0.0621, RMSE: 0.2491
Lasso - R2: 0.6797, MSE: 0.0752, RMSE: 0.2743


In [None]:
from sklearn.model_selection import cross_val_score

# Imported here because this cell is the only one that needs it.
from sklearn.model_selection import cross_validate


def _cv_metrics(model, features, target, folds=5):
    """Cross-validate ``model`` once and return per-fold R2, MSE and RMSE.

    Both metrics are collected from a single ``cross_validate`` run, so each
    fold is fitted once instead of once per metric.

    Parameters
    ----------
    model : estimator to evaluate (``cross_validate`` clones it per fold).
    features, target : data to cross-validate on.
    folds : value passed to ``cv`` (default 5).

    Returns
    -------
    tuple of arrays
        ``(r2, mse, rmse)``, one entry per fold. ``mse`` is the negated
        ``neg_mean_squared_error`` score and ``rmse`` is its square root.
    """
    results = cross_validate(
        model, features, target, cv=folds,
        scoring=("r2", "neg_mean_squared_error"),
    )
    r2 = results["test_r2"]
    # The scorer reports negative MSE (higher is better); flip the sign back.
    mse = -results["test_neg_mean_squared_error"]
    return r2, mse, np.sqrt(mse)


# NOTE(review): X_train_scaled was standardized with statistics computed over
# all training rows, so every validation fold has already influenced the
# scaling. Cross-validating a scaler+model Pipeline on the unscaled X_train
# would remove that leak; it is left as-is here so the reported figures stay
# comparable with the recorded results.

# 5-Fold Cross-Validation for Linear Regression
cv_r2_lin, cv_mse_lin, cv_rmse_lin = _cv_metrics(lin_reg, X_train_scaled, y_train)

print(f"Linear CV - Avg R2: {cv_r2_lin.mean():.4f}, Avg MSE: {cv_mse_lin.mean():.4f}, Avg RMSE: {cv_rmse_lin.mean():.4f}")

# For Ridge
cv_r2_ridge, cv_mse_ridge, cv_rmse_ridge = _cv_metrics(ridge, X_train_scaled, y_train)

print(f"Ridge CV - Avg R2: {cv_r2_ridge.mean():.4f}, Avg MSE: {cv_mse_ridge.mean():.4f}, Avg RMSE: {cv_rmse_ridge.mean():.4f}")

# For Lasso
cv_r2_lasso, cv_mse_lasso, cv_rmse_lasso = _cv_metrics(lasso, X_train_scaled, y_train)

print(f"Lasso CV - Avg R2: {cv_r2_lasso.mean():.4f}, Avg MSE: {cv_mse_lasso.mean():.4f}, Avg RMSE: {cv_rmse_lasso.mean():.4f}")

Linear CV - Avg R2: 0.6925, Avg MSE: 0.0710, Avg RMSE: 0.2649
Ridge CV - Avg R2: 0.7217, Avg MSE: 0.0641, Avg RMSE: 0.2527
Lasso CV - Avg R2: 0.6291, Avg MSE: 0.0857, Avg RMSE: 0.2923


**Test Set Performance:**
  - Linear Regression: R² = 0.7271 (explains 72.71% of the variance in the target), MSE = 0.0641, RMSE = 0.2532.
  - Ridge: R² = 0.7359 (best on test), MSE = 0.0621, RMSE = 0.2491.
  - Lasso: R² = 0.6797, MSE = 0.0752, RMSE = 0.2743.

**Cross-Validation (5-Fold) Averages:**
  - Linear Regression: R² = 0.6925, MSE = 0.0710, RMSE = 0.2649.
  - Ridge: R² = 0.7217 (best generalization), MSE = 0.0641, RMSE = 0.2527.
  - Lasso: R² = 0.6291, MSE = 0.0857, RMSE = 0.2923.