In [59]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from scipy import stats

**Filter Method**

The filter method for feature selection involves ranking features based on statistical measures and selecting the top-ranked features.

1. Correlation Coefficient
2. Hypothesis test
3. Variance Threshold

1. Correlation Coefficent

In [60]:
feature1 = np.random.normal(100, 10, 50)
feature2 = np.random.normal(150, 20, 50) + feature1
feature3 = 0.1 * feature2 * np.random.normal(200, 30, 50) + feature1
data = pd.DataFrame(
    {'feature1': feature1, 'feature2': feature2, 'feature3': feature3})

In [61]:
data.head()

Unnamed: 0,feature1,feature2,feature3
0,104.170111,263.327813,5021.738522
1,108.324619,224.638871,4544.38179
2,97.066009,253.602913,6110.256907
3,99.701614,248.079235,3999.646946
4,100.951258,260.307153,5571.063464


In [62]:
data.corr()

Unnamed: 0,feature1,feature2,feature3
feature1,1.0,0.433909,0.306731
feature2,0.433909,1.0,0.67456
feature3,0.306731,0.67456,1.0


In [63]:
selected_cols = data.columns[data.corr()['feature3'] > 0.5]

In [64]:
data[selected_cols].head()

Unnamed: 0,feature2,feature3
0,263.327813,5021.738522
1,224.638871,4544.38179
2,253.602913,6110.256907
3,248.079235,3999.646946
4,260.307153,5571.063464


2. Hypothesis test

In [114]:
np.random.seed(42)
data = {
    'feature1': np.random.choice([1, 2, 3], size=12),
    'feature2': np.random.choice([10, 20, 30], size=12),
    'feature3': np.random.choice([200, 500], size=12),
    'target': np.random.choice(['A', 'B'], size=12)
}
df = pd.DataFrame(data)

In [115]:
df

Unnamed: 0,feature1,feature2,feature3,target
0,3,10,200,A
1,1,30,500,B
2,3,20,200,B
3,3,10,200,A
4,1,20,200,B
5,1,20,200,A
6,3,20,200,B
7,2,20,500,A
8,3,10,500,B
9,3,10,500,B


In [116]:
X = df.drop('target', axis=1)
y = df['target']

SelectkBest: Select the k best features based on the statistical measure. <br> more info: https://medium.com/@Kavya2099/optimizing-performance-selectkbest-for-efficient-feature-selection-in-machine-learning-3b635905ed48

In [68]:
chi_selector = SelectKBest(score_func=chi2, k=1)
kbest = chi_selector.fit(X, y)

In [69]:
kbest.scores_

array([0.03448276, 0.47619048, 0.        ])

In [70]:
kbest.pvalues_

array([0.85268368, 0.49015296, 1.        ])

In [71]:
X.columns[kbest.get_support()]

Index(['feature2'], dtype='object')

In [72]:
print("Selected Features:\n", X.columns[chi_selector.get_support()])

Selected Features:
 Index(['feature2'], dtype='object')


In [73]:
chi_selector

In [74]:
df

Unnamed: 0,feature1,feature2,feature3,target
0,3,10,200,A
1,1,30,500,B
2,3,20,200,B
3,3,10,200,A
4,1,20,200,B
5,1,20,200,A
6,3,20,200,B
7,2,20,500,A
8,3,10,500,B
9,3,10,500,B


Scratch Implementation

In [75]:
chi_data = pd.crosstab(index=df['feature1'], columns=df['target'])
chi_data

target,A,B
feature1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,1,0
3,4,4


Function to calculate p value

In [76]:
def calculate_p(chi_value, df):
    return (1 - stats.chi2.cdf(chi_value, df))

Function for calculating chi-square value

In [77]:
def find_chivalue(X, y):
    '''Function that takes frequency dataframe as input and calculate chisquare value and return it'''
    chi_values = []
    p_values = []
    for col in X:
        chi_data = pd.crosstab(index=X[col], columns=y)
        data = chi_data.values  # convert dataframe into array for better calculation
        # reshape for dot multiplication
        row_sum = data.sum(axis=1).reshape(-1, 1)
        col_sum = data.sum(axis=0)
        # total sum.
        grand_total = row_sum.sum()
        # calculate expected frequency(rowij * colji / grand_total)
        exp = (row_sum * col_sum) / grand_total
        # calculate chi value (sum(observed - expected)**2 / expected)
        chi_value = sum(sum((data - exp) ** 2 / exp))
        df = (chi_data.shape[0] * chi_data.shape[1]) - 1
        p_values.append(calculate_p(chi_value, df))
        chi_values.append((col, chi_value))
    return chi_values, p_values

Function to select best feature based on chi-square value

In [78]:
def best_feature(chi_values, k):
    '''Funcion to find best k features based on chi square value
        Takes chi values: List, Contains features and  chi square value as tuples
        k: int, No of features to select
        return best features with its chi-square value'''
    # Sort in descending order chi values based on chi-square
    sorted_chivalues = sorted(chi_values, key=lambda x: x[1], reverse=True)
    return sorted_chivalues[:k]

In [79]:
chi_values, p_values = find_chivalue(X, y)
print(p_values)
best_feature(chi_values, 2)

[0.9314646171334656, 0.9502405577506894, 1.0]


[('feature1', 1.3333333333333333), ('feature2', 1.1428571428571428)]

In [80]:
import numpy as np
from scipy.stats import chi2_contingency


# Perform Chi-Square test with correction
chi2, p, dof, expected = chi2_contingency(chi_data)

print("Chi-Square Statistic:", chi2)
print("P-Value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

Chi-Square Statistic: 1.3333333333333333
P-Value: 0.5134171190325922
Degrees of Freedom: 2
Expected Frequencies:
 [[1.5 1.5]
 [0.5 0.5]
 [4.  4. ]]


In [81]:
chi_data

target,A,B
feature1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,1,0
3,4,4


3. Variance Threshold

This method calculate varaince of each feature independently and remove the feature with less variance(because it dosenot have high impact on target value).

It have disadvantage like sometimes it doesnot work well with categorical value and discreate value.Also it calculate importance based on that feature only dont calculate combined effect or its effect with target value. Also try to avoid it with skewed data, because it have less variance but crucial.

In [82]:
df

Unnamed: 0,feature1,feature2,feature3,target
0,3,10,200,A
1,1,30,500,B
2,3,20,200,B
3,3,10,200,A
4,1,20,200,B
5,1,20,200,A
6,3,20,200,B
7,2,20,500,A
8,3,10,500,B
9,3,10,500,B


In [128]:
class VarianceThreshold:
    def __init__(self, threshold) -> None:
        '''Take one argument threshold: float'''
        self.threshold = threshold

    def __calculate_variance(self):
        '''Calculate Variance for input feature'''
        if not hasattr(self.X, 'columns') or not hasattr(self.X, "__getitem__"):
            raise TypeError("X must be a Dataframe")
        return self.X.var(axis=0)

    def __variance_threshold(self):
        '''Select features based on threshold'''
        return [var for var in self.__calculate_variance().items() if var[1] > self.threshold]

    def fit(self, X):
        '''This function takes features as arument and call calculate
        variance function and variance threshold to find best features
        and retunn it as tuple'''
        if not hasattr(X, "columns") or not hasattr(X, "__getitem__"):
            raise ValueError(
                "Input X must be a DataFrame-like object with columns.")
        self.X = X
        selected_features = self.__variance_threshold()
        return selected_features

In [129]:
var_treshold = VarianceThreshold(1)
var_treshold.fit(X)

[('feature2', 38.63636363636363), ('feature3', 24545.454545454544)]

In [None]:
var_treshold = VarianceThreshold(1)
var_treshold.fit(X)

[('feature2', 38.63636363636363), ('feature3', 24545.454545454544)]

**Wrapper Method**

Evaluate on a one specific machine learning algorithm to find optimal features.

1. Forward Selection
2. Backward Selection
3. RFE

Forward Selection

In [85]:
from sklearn.datasets import load_diabetes
data = load_diabetes()

In [86]:
data

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [87]:
data.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [88]:
features = pd.DataFrame(data['data'],columns=data['feature_names'])
target = pd.Series(data['target'],name='diabetes')

In [89]:
features.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [90]:
target.head()

0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: diabetes, dtype: float64

Function to find best features using forward feature selection

Steps:<br>
1. initialize empty feature set and a model
2. select one feature and train model
3. select the feature that give high performance
4. repeat the step 2 and 3 until the criteria met(max no of featuers we want, adding new features does not increase performance)
5. return the selected features

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

Function for forward feature selection

In [None]:
def forward_feature_selection(model, X, y):
    '''Function for feature selection
        Input:
        model: Machine Learning Model
        X: DataFrame
        y: Series
        Return:
        remaining_features  name: List
    '''
    # Assign columns name into features
    remaining_features = list(X.columns)
    # current selected feature
    selected_feature = []
    # To store highest score in the previous iteration of while loop
    high_score = 0
    while len(remaining_features) > 0:
        current_features = list(selected_feature)
        scores = []
        # Iterate through all the columns in the remaining_features  for finding the best one
        for feature in remaining_features:
            current_features.append(feature)
            cv = cross_val_score(estimator=model, X=X[current_features], y=y,scoring='r2')
            scores.append((feature, cv.mean()))
            current_features.pop()
        # find the best score given feature
        best_feature = max(scores,key=lambda x:x[1])
        print(best_feature)
        # if high score is have an increase(More than 0.005) then select that featuer
        if high_score + 0.005 < best_feature[1]:
            selected_feature.append(best_feature[0])
            remaining_features.remove(best_feature[0])
            high_score = best_feature[1]
        # Otherwise exist from the while loop
        else:
            break
    return selected_feature

In [93]:
rf = RandomForestRegressor()

In [94]:
selected_features = forward_feature_selection(rf,features,target)
selected_features

('bmi', 0.1496697875122412)
('s5', 0.32367355085831695)
('bp', 0.35460168754172605)
('s6', 0.39139029755561106)
('age', 0.39993861084685284)
('s2', 0.3964362674802423)


['bmi', 's5', 'bp', 's6', 'age']

In [95]:
rf.fit(features,target)
rf.score(features,target)

0.9184995450002115

In [96]:
rf = RandomForestRegressor()
rf.fit(features[selected_features],target)
rf.score(features[selected_features],target)

0.9174561078849018

Function for backward feature selection

In [97]:
def backward_feature_elimination(model, X, y, tolerance=0.005):
    '''Function for select features by backward elimination
        Input:
        model: scikit-learn compatible estimator
        X: Dataframe
        y: Series
        tolerance (float): Minimum score improvement to remove a feature (default=0.005)
        Output:
        List of selected feature names.
        '''
    selected_features = list(X.columns)
    cv = cross_val_score(estimator=model, X=X, y=y, cv=5)
    high_score = cv.mean()
    while len(selected_features) >= 1:
        scores = []
        current_features = selected_features.copy()
        for feature in selected_features:
            current_features.remove(feature)
            cv = cross_val_score(
                estimator=model, X=X[current_features], y=y, cv=5)
            scores.append((feature, cv.mean()))
            current_features.append(feature)
        worst_feature, worst_score = max(scores, key=lambda x: x[1])
        if high_score + tolerance < worst_score:
            selected_features.remove(worst_feature)
            high_score = worst_score
        else:
            break
    return selected_features

In [98]:
selected_features = backward_feature_elimination(rf,features,target)
selected_features

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's4', 's5', 's6']

In [99]:
rf.fit(features[selected_features],target)
rf.score(features[selected_features],target)

0.9252471365423877

RFE(Recursive Feature Elimination)

Select features recursively using model feature importance(coeff or feature importance)

In [100]:
features.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

In [101]:
def recursive_feature_elimination(model, X, y, n):
    '''Function to select best features using model feature importance
    Args:
        Input:
        model: sckit-learn compatible estimator
        X: Dataframe
        y: Series
        n: no of features want to select
        Ouput:
        List of selected features
    '''
    selected_features = list(X.columns)
    while n < len(selected_features):
        model.fit(X[selected_features], y)
        feature_imp = model.feature_importances_
        worst_feature = selected_features[feature_imp.argmin()]
        selected_features.remove(worst_feature)
        print(f"Removed feature: {worst_feature}")
    return selected_features

In [102]:
recursive_feature_elimination(XGBRegressor(),features,target,5)

Removed feature: age
Removed feature: sex
Removed feature: s4
Removed feature: s1
Removed feature: s2


['bmi', 'bp', 's3', 's5', 's6']

In [103]:
recursive_feature_elimination(RandomForestRegressor(),features,target,5)

Removed feature: sex
Removed feature: s4
Removed feature: s1
Removed feature: s3
Removed feature: age


['bmi', 'bp', 's2', 's5', 's6']

**Embedded**

 Embedded methods strike a balance between the advantages of filter and wrapper methods. They depend on algorithms that naturally incorporate feature selection during training.

1. Lasso Regression(l1 regularization)
2. Ridge Regression(l2 regularization)
3. Elastic Net

Lasso Regression

In [104]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet

In [105]:
lasso = Lasso(alpha=0.2)
lasso.fit(features,target)

important_features = [i for i, coef in enumerate(lasso.coef_) if coef !=  0.00000000e+00 and coef!=0]
print("Selected Features:", important_features)

Selected Features: [1, 2, 3, 6, 8, 9]


In [106]:
lasso.coef_

array([ 0.00000000e+00, -7.56121328e+01,  5.11404133e+02,  2.34508645e+02,
       -0.00000000e+00, -0.00000000e+00, -1.70214828e+02,  0.00000000e+00,
        4.50678492e+02,  2.24851976e-01])

Ridge

In [107]:
ridge = Ridge(alpha=0.1)
ridge.fit(features,target)
important_features = [i for i,coef in enumerate(ridge.coef_) if coef != 0]
print("Selected Features:", important_features)

Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [108]:
ridge.coef_

array([   1.30870543, -207.19241786,  489.69517109,  301.76405786,
        -83.46603399,  -70.8268319 , -188.67889782,  115.7121356 ,
        443.81291747,   86.7493154 ])

Elastic Net

In [109]:
elasticnet = ElasticNet(alpha=0.1)
elasticnet.fit(features,target)
important_featuress = [i for i,coef in enumerate(elasticnet.coef_) if coef!=0]
print("Selected Features:", important_features)

Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [110]:
elasticnet.coef_

array([ 10.28633208,   0.28598338,  37.46465473,  27.54476518,
        11.10885591,   8.3558916 , -24.12080871,  25.50549197,
        35.46569979,  22.89498509])

Hybrid Feature Selection

Hybrid methods combine the strengths of filter, wrapper, and embedded methods, leveraging their complementary advantages to achieve better results.

1. Filter Methods:<br>
Efficient but do not consider feature interactions. <br>
E.g., Chi-Square, Mutual Information.<br>
2. Wrapper Methods:<br>
Accurate but computationally expensive.<br>
E.g., Forward Feature Selection, Recursive Feature Elimination.<br>
3. Embedded Methods:<br>
Model-specific and less flexible.<br>
E.g., LASSO, Decision Trees.<br>

In [111]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

# Step 1: Generate synthetic dataset
# 1000 samples, 50 features, 10 informative features
X, y = make_classification(
    n_samples=1000, n_features=50, n_informative=10, n_redundant=5, random_state=42
)
data = pd.DataFrame(X, columns=[f"Feature_{i}" for i in range(1, 51)])

# Step 2: Filter Method - Remove low variance features
print("Step 2: Applying Filter Method (Variance Threshold)...")
var_thresh = VarianceThreshold(threshold=0.01)  # Low variance threshold
X_filtered = var_thresh.fit_transform(X)

print(f"Shape after variance thresholding: {X_filtered.shape}")

# Step 3: Filter Method - Select features based on Mutual Information
print("\nStep 3: Selecting Top Features Based on Mutual Information...")
mi_scores = mutual_info_classif(X_filtered, y)
mi_threshold = 0.02
selected_features = [
    i for i, score in enumerate(mi_scores) if score > mi_threshold
]
X_mi_selected = X_filtered[:, selected_features]

print(f"Shape after mutual information filtering: {X_mi_selected.shape}")

# Step 4: Wrapper Method - Recursive Feature Elimination (RFE)
print("\nStep 4: Applying Wrapper Method (RFE)...")
model = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(estimator=model, n_features_to_select=10)  # Select top 10 features
X_rfe_selected = rfe.fit_transform(X_mi_selected, y)

print(f"Shape after RFE: {X_rfe_selected.shape}")

# Step 5: Embedded Method - Feature Importance from Random Forest
print("\nStep 5: Evaluating with Embedded Method (Random Forest)...")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_rfe_selected, y)
feature_importances = rf_model.feature_importances_

# Select top 5 most important features
important_indices = sorted(
    range(len(feature_importances)),
    key=lambda i: feature_importances[i],
    reverse=True
)[:5]
X_final = X_rfe_selected[:, important_indices]

print(f"Final selected features shape: {X_final.shape}")

# Step 6: Model Validation
print("\nStep 6: Validating the Final Features...")
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)
final_model = LogisticRegression(max_iter=1000, random_state=42)
scores = cross_val_score(final_model, X_train, y_train, cv=5)

print(f"Cross-validated accuracy: {scores.mean():.2f}")


Step 2: Applying Filter Method (Variance Threshold)...
Shape after variance thresholding: (1000, 50)

Step 3: Selecting Top Features Based on Mutual Information...
Shape after mutual information filtering: (1000, 11)

Step 4: Applying Wrapper Method (RFE)...
Shape after RFE: (1000, 10)

Step 5: Evaluating with Embedded Method (Random Forest)...
Final selected features shape: (1000, 5)

Step 6: Validating the Final Features...
Cross-validated accuracy: 0.76
