In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import StandardScaler

# 1. Columns that are never candidate features: identifiers, clinical
#    covariates and the outcome itself ('fev1.fvc.ratio').
exclude_cols = ['title', 'fev1.fvc.ratio', 'gender', 'age', 'disease', 'smoking.status',
                'race', 'percent.emphysema', 'pack.years', 'sample', 'disease_group']

# NOTE(review): this cell previously read column names and variances from
# `l2p_merged` but built X and y from `p2l_merged`. Feature names taken from
# one frame were then used to index another, which fails with a NameError if
# only one of them exists and with a KeyError if their columns differ. A single
# frame is used throughout now; confirm `p2l_merged` is the intended name.

# 2. Identify candidate feature columns (gene columns): everything that is
#    not explicitly excluded above.
feature_cols = [col for col in p2l_merged.columns if col not in exclude_cols]

# 3. Compute variance for each candidate feature.
#    numeric_only=True skips any non-numeric column that slipped through the
#    exclude list instead of raising on recent pandas versions.
feature_variances = p2l_merged[feature_cols].var(axis=0, numeric_only=True)

# 4. Select top 1% most variable columns (always at least one feature).
top_k = max(1, int(0.01 * len(feature_cols)))
top_features = feature_variances.sort_values(ascending=False).index[:top_k].tolist()

# 5. Combine selected gene features with clinical covariates that are always
#    included (the outcome/target column is deliberately not part of this list).
always_include = ['gender', 'age', 'race']
all_features = always_include + top_features

# Rows without a measured outcome cannot be used for fitting or scoring.
p2l_merged_clean = p2l_merged.dropna(subset=['fev1.fvc.ratio'])

# 6. Prepare X, y
X = p2l_merged_clean[all_features]
y = p2l_merged_clean['fev1.fvc.ratio']



# 7. One-hot encode categorical columns; drop_first=True drops one level per
#    categorical variable so the dummy columns are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

# 8. Train/Test split BEFORE scaling.
#    Fitting the scaler on the full dataset would let the test rows influence
#    the means/standard deviations applied to the training rows (data
#    leakage), which makes the test R2 optimistic. The split itself depends
#    only on the number of rows and random_state, so it selects the same rows
#    as splitting the scaled matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 9. Standardize features using statistics learned from the training rows only,
#    then apply that same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix on the training scale, kept under its original name in case
# later cells refer to it.
X_scaled = scaler.transform(X)

# 10. Fit Elastic Net with cross-validated regularization
enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],  # Try several l1/l2 mixes
                    n_alphas=100,
                    cv=5,
                    max_iter=5000,
                    random_state=42)
enet.fit(X_train, y_train)

# Column labels of the design matrix; enet.coef_ has one entry per column,
# in this same order.
feature_names = X.columns

# Pair every feature name with its fitted coefficient.
coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': enet.coef_})

# Features whose coefficient was not shrunk to exactly zero by the Elastic Net.
is_selected = coef_df['coefficient'].ne(0)
nonzero_coef = coef_df.loc[is_selected]

# Order the surviving features by coefficient magnitude, largest first.
magnitude = nonzero_coef['coefficient'].abs()
importance_order = magnitude.sort_values(ascending=False).index
ranked = nonzero_coef.reindex(importance_order)

print(ranked.head(20))

# 11. Evaluate performance on the held-out rows (R2) and report the
#     hyper-parameters chosen by cross-validation.
r2 = enet.score(X_test, y_test)
print(f"Test R2: {r2:.3f}")
print("Best l1_ratio:", enet.l1_ratio_)
print("Best alpha:", enet.alpha_)