<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-Avantika/Lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Lab Exercises: Embedded Feature-Selection Methods
# Use the Breast Cancer dataset (sklearn.datasets.load_breast_cancer) for all exercises. Split once into training and test sets:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the features as a DataFrame and the binary target as a Series so the
# column names stay available when reporting which features were selected.
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# One fixed-seed 70/30 split, reused by every exercise in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Sanity checks: the split must partition the data and keep X/y rows aligned.
assert len(X_train) + len(X_test) == len(X)
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)

# Only the final expression of a cell is rendered, so keep a single
# inspection here; look at `X.head()` or `X.columns` in their own cells.
y_train.value_counts()


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,249
0,149


# Exercise 1: L1-Regularized Logistic Regression
- Fit LogisticRegression(penalty='l1', solver='saga', C=1.0, max_iter=5000).

- Use SelectFromModel (with prefit=True) to select up to 5 features with the largest nonzero coefficients.

- Retrain a vanilla logistic model on those features.

- Report selected features and test accuracy.

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

# Exercise 1: embedded feature selection with an L1-penalised logistic model.
#
# `saga` is a stochastic solver, so the seed is pinned; without it the fitted
# coefficients (and hence the selected features) can change between runs.
# NOTE(review): the features are not standardised in this cell. scikit-learn
# documents that saga's fast convergence is only guaranteed on features with
# approximately the same scale -- confirm whether scaling should be added
# (the exercise text does not ask for it).
model_l1 = LogisticRegression(
    penalty='l1', solver='saga', C=1.0, max_iter=5000, random_state=42
)
model_l1.fit(X_train, y_train)

# prefit=True reuses the already-fitted model instead of refitting it;
# max_features=5 caps the selection at five features.
sfm_l1 = SelectFromModel(model_l1, prefit=True, max_features=5)
feat_l1 = X_train.columns[sfm_l1.get_support()]

# Retrain a default (unpenalty-tuned) logistic model on the selected columns
# only, then score it on the held-out test split.
model = LogisticRegression(max_iter=5000).fit(X_train[feat_l1], y_train)
print("L1 features:", list(feat_l1))
print("Accuracy (L1):", accuracy_score(y_test, model.predict(X_test[feat_l1])))


L1 features: ['mean perimeter', 'area error', 'worst radius', 'worst perimeter', 'worst area']
Accuracy (L1): 0.9590643274853801


# Exercise 2: L2-Regularization with Thresholding
- Fit LogisticRegression(penalty='l2', C=1.0, max_iter=5000).

- Extract absolute coefficients, select the 5 largest.

- Retrain and evaluate on those features.

In [4]:
import numpy as np

# Exercise 2: rank features by the magnitude of L2-penalised coefficients.
# NOTE(review): the features are unscaled here, so each |coefficient| depends
# on that feature's units -- confirm that ranking raw coefficients is intended.
model_l2 = LogisticRegression(penalty='l2', C=1.0, max_iter=5000).fit(
    X_train, y_train
)

# The code reads row 0 of coef_ (the target is binary) and drops the sign.
abs_coefs = np.abs(model_l2.coef_[0])
# argsort is ascending, so the last five positions hold the five largest.
idx = np.argsort(abs_coefs)[-5:]
feat_l2 = X_train.columns[idx]

# Refit a default logistic model on the five chosen columns and evaluate it
# on the held-out test split.
model = LogisticRegression(max_iter=5000)
model.fit(X_train[feat_l2], y_train)
y_pred_l2 = model.predict(X_test[feat_l2])
print("L2 features:", list(feat_l2))
print("Accuracy (L2):", accuracy_score(y_test, y_pred_l2))


L2 features: ['worst symmetry', 'worst compactness', 'mean radius', 'worst concavity', 'texture error']
Accuracy (L2): 0.9532163742690059


# Exercise 3: Elastic Net Regularization
- Standardize data with StandardScaler.

- Fit ElasticNetCV(l1_ratio=[.1, .5, .9], cv=5).

- Use SelectFromModel to pick up to 5 features with the largest nonzero coefficients.

- Retrain and evaluate.

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import numpy as np

# Exercise 3: elastic-net based feature selection on standardised features.

# Fit the scaler on the training split only, then apply it to both splits so
# no test-set statistics leak into the model.
scaler = StandardScaler().fit(X_train)
X_tr_s = scaler.transform(X_train)
X_ts_s = scaler.transform(X_test)

# ElasticNetCV is a regressor: the 0/1 labels are fitted as a numeric target
# and the model is used here only to rank features. max_iter is raised above
# the default to give coordinate descent room to converge.
enet = ElasticNetCV(l1_ratio=[.1, .5, .9], cv=5, random_state=0, max_iter=10000)
enet.fit(X_tr_s, y_train)

# y_train is a 1-D Series, so a flat coefficient vector is expected; ravel
# just normalises the shape before ranking.
abs_coefs_enet = np.abs(np.ravel(enet.coef_))

# Take the five largest magnitudes (argsort is ascending), then drop any that
# the L1 part of the penalty shrank to exactly zero, so only genuinely
# selected ("nonzero-coefficient") features are kept.
top_idx = np.argsort(abs_coefs_enet)[-5:]
idx_enet = top_idx[abs_coefs_enet[top_idx] > 0]
if idx_enet.size == 0:
    raise ValueError(
        "ElasticNetCV shrank every coefficient to zero; no features to select."
    )

# The scaled data are indexed by column position.
X_tr_selected_enet = X_tr_s[:, idx_enet]
X_ts_selected_enet = X_ts_s[:, idx_enet]

# Map the positions back to feature names via the original DataFrame columns.
feat_enet = X_train.columns[idx_enet]

# Retrain a default logistic model on the selected (scaled) columns and score
# it on the held-out test split.
model = LogisticRegression(max_iter=5000).fit(X_tr_selected_enet, y_train)
print("Elastic Net features:", list(feat_enet))
print("Accuracy (ElasticNet):", accuracy_score(y_test, model.predict(X_ts_selected_enet)))

Elastic Net features: ['radius error', 'mean compactness', 'mean concave points', 'worst area', 'worst radius']
Accuracy (ElasticNet): 0.9415204678362573
