<a href="https://colab.research.google.com/github/arijit190805/DS-EXP/blob/main/DS_EXP_8_(US).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sample Data


In [None]:
import pandas as pd
from sklearn.datasets import make_classification

# Synthetic classification data: 100 samples, 20 features
# (10 informative, 5 redundant), seeded for repeatability.
X, y = make_classification(
    n_samples=100,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame with columns feature_0 ... feature_19
# and attach the labels as the final 'target' column.
df = pd.DataFrame(
    X,
    columns=['feature_' + str(idx) for idx in range(20)],
).assign(target=y)


Wrapper Methods (Backward Elimination)

In [None]:
import statsmodels.api as sm

# Function for backward elimination
def backward_elimination(X, y, significance_level=0.05):
    """Select features by backward elimination on OLS p-values.

    An intercept column is added to ``X``, an ordinary-least-squares model
    is fitted, and the predictor with the largest p-value is removed. This
    repeats until every remaining predictor has a p-value below
    ``significance_level``.

    The intercept column (``'const'``) is never a removal candidate:
    dropping it would silently turn the model into a regression through
    the origin and distort the p-values of the remaining predictors.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one column per feature.
    y : array-like
        Response values, one per row of ``X``.
    significance_level : float, default 0.05
        A predictor is dropped while its p-value is >= this threshold.

    Returns
    -------
    pandas.DataFrame
        The design matrix (including the intercept column added by
        ``sm.add_constant``) restricted to the surviving predictors.
    """
    X = sm.add_constant(X)  # Add intercept

    # Guard on the column count so OLS is never fitted on an empty design
    # matrix (possible only when add_constant did not add a 'const' column).
    while X.shape[1] > 0:
        model = sm.OLS(y, X).fit()

        # Only real predictors compete for removal; errors='ignore' covers
        # the case where add_constant skipped adding 'const'.
        candidate_p_values = model.pvalues.drop(labels='const', errors='ignore')

        # max() of an empty or all-NaN Series is NaN and NaN >= level is
        # False, so this also stops once no predictors are left to drop.
        if not candidate_p_values.max() >= significance_level:
            break

        # Drop the least significant predictor and refit on the next pass.
        X = X.drop(columns=candidate_p_values.idxmax())

    return X

# Run backward elimination with every non-label column as a candidate
# predictor and the 'target' column as the response.
X_selected = backward_elimination(
    X=df.drop(columns=['target']),
    y=df['target'],
)
print(X_selected.columns)


Index(['const', 'feature_0', 'feature_2', 'feature_5', 'feature_6',
       'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_15'],
      dtype='object')


Embedded Methods (Using Decision Trees)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the full dataset (fixed random_state for repeatability)
tree_model = DecisionTreeClassifier(random_state=42).fit(X, y)

# Importance score of each feature as learned by the fitted tree
importances = tree_model.feature_importances_

# Pair each score with its feature name (every df column except the trailing
# 'target') and show the table ordered from most to least important.
feature_importance_df = pd.DataFrame(
    {
        'Feature': df.columns[:-1],
        'Importance': importances,
    }
)
print(feature_importance_df.sort_values('Importance', ascending=False))


       Feature  Importance
4    feature_4    0.385789
2    feature_2    0.136133
0    feature_0    0.110008
19  feature_19    0.066443
13  feature_13    0.060000
6    feature_6    0.059524
12  feature_12    0.057143
7    feature_7    0.051818
10  feature_10    0.038857
8    feature_8    0.034286
3    feature_3    0.000000
5    feature_5    0.000000
9    feature_9    0.000000
1    feature_1    0.000000
11  feature_11    0.000000
14  feature_14    0.000000
15  feature_15    0.000000
16  feature_16    0.000000
17  feature_17    0.000000
18  feature_18    0.000000


Lasso Regularization

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test split (seeded)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only,
# then apply that same scaling to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit an L1-penalised linear model on the scaled training data
lasso = Lasso(alpha=0.1).fit(X_train_scaled, y_train)

# Label each coefficient with its feature name and show only the
# features whose coefficient is non-zero.
selected_features = pd.Series(lasso.coef_, index=df.columns[:-1])
print(selected_features.loc[selected_features.ne(0)])


feature_0    -0.114809
feature_1    -0.010935
feature_2     0.004274
feature_4    -0.070635
feature_7     0.003396
feature_10    0.037549
dtype: float64
