In [None]:

# Lab exercises on embedded feature-selection methods.
# Every exercise uses the Breast Cancer dataset
# (sklearn.datasets.load_breast_cancer) with one shared train/test split.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Feature table and target, loaded in their pandas form.
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Display the names of the available features.
X.columns


Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Embedded selection with an L1-penalised logistic regression.
#
# The selector model is fitted on standardised features (scaler statistics come
# from the training split only). The raw columns live on very different scales,
# so without this step the coefficient magnitudes mostly reflect each feature's
# units rather than its relevance, and the 'saga' solver converges poorly.
l1_scaler = StandardScaler().fit(X_train)
model_l1 = LogisticRegression(penalty='l1', solver='saga', C=1.0, max_iter=5000)
model_l1.fit(l1_scaler.transform(X_train), y_train)

# Keep at most 5 features, ranked by the magnitude of their L1 coefficients.
# prefit=True reuses the model fitted above instead of refitting it.
sfm_l1 = SelectFromModel(model_l1, prefit=True, max_features=5)
feat_l1 = X_train.columns[sfm_l1.get_support()]

# Retrain a plain logistic regression on the selected columns and score it on
# the held-out test split.
model = LogisticRegression(max_iter=5000).fit(X_train[feat_l1], y_train)
print("L1 features:", list(feat_l1))
print("Accuracy (L1):", accuracy_score(y_test, model.predict(X_test[feat_l1])))


L1 features: ['mean perimeter', 'area error', 'worst radius', 'worst perimeter', 'worst area']
Accuracy (L1): 0.9590643274853801


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Feature ranking with an L2-penalised logistic regression.
#
# Coefficients are only comparable across features when the inputs share a
# scale, so the ranking model is fitted on standardised training data (scaler
# statistics come from the training split only). On raw data, features with a
# small numeric range receive large coefficients regardless of their relevance.
l2_scaler = StandardScaler().fit(X_train)
model_l2 = LogisticRegression(penalty='l2', C=1.0, max_iter=5000)
model_l2.fit(l2_scaler.transform(X_train), y_train)

# Indices of the 5 largest absolute coefficients (argsort is ascending, so the
# strongest feature is listed last).
abs_coefs = np.abs(model_l2.coef_)[0]
idx = np.argsort(abs_coefs)[-5:]
feat_l2 = X_train.columns[idx]

# Retrain a plain logistic regression on the selected columns and score it on
# the held-out test split.
model = LogisticRegression(max_iter=5000).fit(X_train[feat_l2], y_train)
print("L2 features:", list(feat_l2))
print("Accuracy (L2):", accuracy_score(y_test, model.predict(X_test[feat_l2])))

L2 features: ['worst symmetry', 'worst compactness', 'mean radius', 'worst concavity', 'texture error']
Accuracy (L2): 0.9532163742690059


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
204,12.470,18.60,81.09,481.9,0.09965,0.10580,0.08005,0.03821,0.1925,0.06373,...,14.97,24.64,96.05,677.9,0.1426,0.23780,0.2671,0.10150,0.3014,0.08750
70,18.940,21.31,123.60,1130.0,0.09009,0.10290,0.10800,0.07951,0.1582,0.05461,...,24.86,26.58,165.90,1866.0,0.1193,0.23360,0.2687,0.17890,0.2551,0.06589
131,15.460,19.48,101.70,748.9,0.10920,0.12230,0.14660,0.08087,0.1931,0.05796,...,19.26,26.00,124.90,1156.0,0.1546,0.23940,0.3791,0.15140,0.2837,0.08019
431,12.400,17.68,81.47,467.8,0.10540,0.13160,0.07741,0.02799,0.1811,0.07102,...,12.88,22.91,89.61,515.8,0.1450,0.26290,0.2403,0.07370,0.2556,0.09359
540,11.540,14.44,74.65,402.9,0.09984,0.11200,0.06737,0.02594,0.1818,0.06782,...,12.26,19.68,78.78,457.8,0.1345,0.21180,0.1797,0.06918,0.2329,0.08134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,12.780,16.49,81.37,502.5,0.09831,0.05234,0.03653,0.02864,0.1590,0.05653,...,13.46,19.76,85.67,554.9,0.1296,0.07061,0.1039,0.05882,0.2383,0.06410
542,14.740,25.42,94.70,668.6,0.08275,0.07214,0.04105,0.03027,0.1840,0.05680,...,16.51,32.29,107.40,826.4,0.1060,0.13760,0.1611,0.10950,0.2722,0.06956
176,9.904,18.06,64.60,302.4,0.09699,0.12940,0.13070,0.03716,0.1669,0.08116,...,11.26,24.39,73.07,390.2,0.1301,0.29500,0.3486,0.09910,0.2614,0.11620
501,13.820,24.49,92.33,595.9,0.11620,0.16810,0.13570,0.06759,0.2275,0.07237,...,16.01,32.94,106.00,788.0,0.1794,0.39660,0.3381,0.15210,0.3651,0.11830


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV, LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1 - standardise: the scaler learns its statistics from the training
# split and is then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_tr_s, X_ts_s = scaler.transform(X_train), scaler.transform(X_test)

# Step 2 - cross-validated elastic net over three L1/L2 mixing ratios, with a
# raised iteration cap.
enet_settings = dict(
    l1_ratio=[.1, .5, .9],
    cv=5,
    max_iter=10000,
    tol=1e-4,
    random_state=0,
)
enet = ElasticNetCV(**enet_settings).fit(X_tr_s, y_train)

# Step 3 - among the coefficients that are not (numerically) zero, take the
# five with the largest absolute value, strongest first.
coefs = enet.coef_  # one coefficient per feature
coef_magnitudes = np.abs(coefs)
nonzero_idxs = np.flatnonzero(coef_magnitudes > 1e-8)
descending_order = np.argsort(-coef_magnitudes[nonzero_idxs])
sorted_nonzero = nonzero_idxs[descending_order]
top5_idxs = sorted_nonzero[:5]

feat_enet = X_train.columns[top5_idxs]

# Step 4 - refit a logistic regression using only the selected (scaled) columns.
train_subset = X_tr_s[:, top5_idxs]
model = LogisticRegression(max_iter=5000).fit(train_subset, y_train)

# Step 5 - score on the held-out test split.
test_predictions = model.predict(X_ts_s[:, top5_idxs])
accuracy = accuracy_score(y_test, test_predictions)

print("Elastic Net features:", list(feat_enet))
print("Accuracy (ElasticNet):", accuracy)


Elastic Net features: ['worst radius', 'worst area', 'mean concave points', 'mean compactness', 'radius error']
Accuracy (ElasticNet): 0.9415204678362573


LAB-2

In [None]:
# Lab exercises on embedded feature-selection methods.
# The Breast Cancer dataset (sklearn.datasets.load_breast_cancer) is used
# throughout; it is split a single time into training and test sets.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Feature table and target, loaded in their pandas form.
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# 70/30 train/test partition with a fixed seed so the split is repeatable.
partition = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = partition

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Random forest whose feature importances drive the selection.
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# threshold=-inf switches off the importance cut-off, so max_features alone
# decides the selection: exactly the 5 most important features are kept.
sfm_rf = SelectFromModel(rf, threshold=-np.inf, max_features=5, prefit=True)
rf_mask = sfm_rf.get_support()
feat_rf = X_train.columns[rf_mask]

# Refit a logistic regression on the selected columns and score it on the
# held-out test split.
model = LogisticRegression(max_iter=5000)
model.fit(X_train[feat_rf], y_train)
rf_test_pred = model.predict(X_test[feat_rf])

print("RF features:", list(feat_rf))
print("Accuracy (RF):", accuracy_score(y_test, rf_test_pred))

RF features: ['mean concavity', 'mean concave points', 'worst radius', 'worst perimeter', 'worst concave points']
Accuracy (RF): 0.9532163742690059


In [None]:
import xgboost as xgb

# Gradient-boosted trees whose feature importances drive the selection.
xgb_settings = dict(n_estimators=100, eval_metric='logloss', random_state=0)
xgb_clf = xgb.XGBClassifier(**xgb_settings)
xgb_clf.fit(X_train, y_train)

# threshold=-inf switches off the importance cut-off, so max_features alone
# decides the selection: exactly the 5 most important features are kept.
sfm_xgb = SelectFromModel(xgb_clf, threshold=-np.inf, max_features=5, prefit=True)
xgb_mask = sfm_xgb.get_support()
feat_xgb = X_train.columns[xgb_mask]

# Refit a logistic regression on the selected columns and score it on the
# held-out test split.
model = LogisticRegression(max_iter=5000)
model.fit(X_train[feat_xgb], y_train)
xgb_test_pred = model.predict(X_test[feat_xgb])

print("XGB features:", list(feat_xgb))
print("Accuracy (XGB):", accuracy_score(y_test, xgb_test_pred))



XGB features: ['mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concave points']
Accuracy (XGB): 0.9590643274853801


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Stability selection: fit an L1-penalised logistic regression on many
# bootstrap samples and keep the features that survive most often.

# Parameters for stability selection
n_bootstraps    = 100
sample_fraction = 0.75
C               = 1.0

# Solver settings: generous iteration cap, relaxed tolerance
max_iter = 20000
tol      = 1e-3

n_features         = X_train.shape[1]
selection_counts   = np.zeros(n_features, dtype=int)
coef_magnitude_sum = np.zeros(n_features)  # accumulates |coef| for tie-breaking

for i in range(n_bootstraps):
    # a) bootstrap sample (row positions drawn with replacement, seeded per round)
    idx  = resample(
        np.arange(X_train.shape[0]),
        replace=True,
        n_samples=int(sample_fraction * X_train.shape[0]),
        random_state=i
    )
    # Standardise within the bootstrap sample. On raw features the L1 penalty
    # acts unevenly across columns of different scale and 'saga' converges
    # poorly, so almost no coefficient reaches zero and every feature looks
    # equally "stable".
    X_bs = StandardScaler().fit_transform(X_train.values[idx])
    y_bs = y_train.values[idx]

    # b) fit L1-penalised logistic regression
    lr = LogisticRegression(
        penalty='l1',
        solver='saga',
        C=C,
        max_iter=max_iter,
        tol=tol,
        random_state=0
    )
    lr.fit(X_bs, y_bs)

    # c) tally nonzero coefficients and accumulate their magnitudes
    abs_coef = np.abs(lr.coef_)[0]
    nonzero  = abs_coef > 1e-8
    selection_counts   += nonzero.astype(int)
    coef_magnitude_sum += abs_coef

# Compute selection frequencies and pick the 5 most stable features.
# Ties in frequency are broken by the mean absolute coefficient instead of by
# column position. np.lexsort treats the LAST key as the primary sort key.
selection_freq = selection_counts / n_bootstraps
mean_abs_coef  = coef_magnitude_sum / n_bootstraps
top5_idx       = np.lexsort((-mean_abs_coef, -selection_freq))[:5]
feat_rl        = X_train.columns[top5_idx]

# Retrain on the stable features and score on the held-out test split
final_model = LogisticRegression(max_iter=5000).fit(X_train[feat_rl], y_train)
accuracy    = accuracy_score(y_test, final_model.predict(X_test[feat_rl]))

print("Stability features:", list(feat_rl))
print("Accuracy (Stability):", accuracy)


Stability features: ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean concavity']
Accuracy (Stability): 0.935672514619883


In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree whose feature importances drive the selection.
dt = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)

# argsort is ascending, so the last five positions hold the five largest
# importances (the most important feature comes last).
importance_order = np.argsort(dt.feature_importances_)
idx_dt = importance_order[-5:]
feat_dt = X_train.columns[idx_dt]

# Refit a logistic regression on the selected columns and score it on the
# held-out test split.
model = LogisticRegression(max_iter=5000)
model.fit(X_train[feat_dt], y_train)
dt_test_pred = model.predict(X_test[feat_dt])

print("DT features:", list(feat_dt))
print("Accuracy (DT):", accuracy_score(y_test, dt_test_pred))


DT features: ['worst perimeter', 'worst area', 'worst radius', 'worst texture', 'mean concave points']
Accuracy (DT): 0.9707602339181286
