In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from scipy import stats

**Filter Method**

The filter method for feature selection involves ranking features based on statistical measures and selecting the top-ranked features.

1. Correlation Coefficient
2. Hypothesis test
3. Variance Threshold

1. Correlation Coefficent

In [14]:
feature1 = np.random.normal(100, 10, 50)
feature2 = np.random.normal(150, 20, 50) + feature1
feature3 = 0.1 * feature2 * np.random.normal(200, 30, 50) + feature1
data = pd.DataFrame(
    {'feature1': feature1, 'feature2': feature2, 'feature3': feature3})

In [15]:
data.head()

Unnamed: 0,feature1,feature2,feature3
0,87.805255,210.52743,3825.86493
1,102.270155,235.276762,3965.254338
2,106.554736,264.974106,4332.604644
3,118.053356,251.047542,4961.067453
4,104.987176,251.550633,5358.633668


In [16]:
data.corr()

Unnamed: 0,feature1,feature2,feature3
feature1,1.0,0.454186,0.174196
feature2,0.454186,1.0,0.4335
feature3,0.174196,0.4335,1.0


In [17]:
selected_cols = data.columns[data.corr()['feature3'] > 0.5]

In [18]:
data[selected_cols].head()

Unnamed: 0,feature3
0,3825.86493
1,3965.254338
2,4332.604644
3,4961.067453
4,5358.633668


2. Hypothesis test

In [19]:
np.random.seed(42)
data = {
    'feature1': np.random.choice([1, 2, 3], size=12),
    'feature2': np.random.choice([10, 20, 30], size=12),
    'feature3': np.random.choice([200, 500], size=12),
    'target': np.random.choice(['A', 'B'], size=12)
}
df = pd.DataFrame(data)

In [20]:
df

Unnamed: 0,feature1,feature2,feature3,target
0,3,10,200,A
1,1,30,500,B
2,3,20,200,B
3,3,10,200,A
4,1,20,200,B
5,1,20,200,A
6,3,20,200,B
7,2,20,500,A
8,3,10,500,B
9,3,10,500,B


In [21]:
X = df.drop('target', axis=1)
y = df['target']

In [22]:
chi_selector = SelectKBest(score_func=chi2, k=1)
kbest = chi_selector.fit(X, y)

In [23]:
kbest.scores_

array([0.03448276, 0.47619048, 0.        ])

In [24]:
kbest.pvalues_

array([0.85268368, 0.49015296, 1.        ])

In [25]:
X.columns[kbest.get_support()]

Index(['feature2'], dtype='object')

In [26]:
print("Selected Features:\n", X.columns[chi_selector.get_support()])

Selected Features:
 Index(['feature2'], dtype='object')


In [27]:
chi_selector

In [28]:
df

Unnamed: 0,feature1,feature2,feature3,target
0,3,10,200,A
1,1,30,500,B
2,3,20,200,B
3,3,10,200,A
4,1,20,200,B
5,1,20,200,A
6,3,20,200,B
7,2,20,500,A
8,3,10,500,B
9,3,10,500,B


Scratch Implementation

In [29]:
chi_data = pd.crosstab(index=df['feature1'], columns=df['target'])
chi_data

target,A,B
feature1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,1,0
3,4,4


Function to calculate p value

In [30]:
def calculate_p(chi_value, df):
    return (1 - stats.chi2.cdf(chi_value, df))

Function for calculating chi-square value

In [31]:
def find_chivalue(X, y):
    '''Function that takes frequency dataframe as input and calculate chisquare value and return it'''
    chi_values = []
    p_values = []
    for col in X:
        chi_data = pd.crosstab(index=X[col], columns=y)
        data = chi_data.values  # convert dataframe into array for better calculation
        # reshape for dot multiplication
        row_sum = data.sum(axis=1).reshape(-1, 1)
        col_sum = data.sum(axis=0)
        # total sum.
        grand_total = row_sum.sum()
        # calculate expected frequency(rowij * colji / grand_total)
        exp = (row_sum * col_sum) / grand_total
        # calculate chi value (sum(observed - expected)**2 / expected)
        chi_value = sum(sum((data - exp) ** 2 / exp))
        df = (chi_data.shape[0] * chi_data.shape[1]) - 1
        p_values.append(calculate_p(chi_value, df))
        chi_values.append((col, chi_value))
    return chi_values, p_values

Function to select best feature based on chi-square value

In [32]:
def best_feature(chi_values, k):
    '''Funcion to find best k features based on chi square value
        Takes chi values: List, Contains features and  chi square value as tuples
        k: int, No of features to select
        return best features with its chi-square value'''
    # Sort in descending order chi values based on chi-square
    sorted_chivalues = sorted(chi_values, key=lambda x: x[1], reverse=True)
    return sorted_chivalues[:k]

In [33]:
chi_values, p_values = find_chivalue(X, y)
print(p_values)
best_feature(chi_values, 2)

[0.9314646171334656, 0.9502405577506894, 1.0]


[('feature1', 1.3333333333333333), ('feature2', 1.1428571428571428)]

In [34]:
import numpy as np
from scipy.stats import chi2_contingency


# Perform Chi-Square test with correction
chi2, p, dof, expected = chi2_contingency(chi_data)

print("Chi-Square Statistic:", chi2)
print("P-Value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

Chi-Square Statistic: 1.3333333333333333
P-Value: 0.5134171190325922
Degrees of Freedom: 2
Expected Frequencies:
 [[1.5 1.5]
 [0.5 0.5]
 [4.  4. ]]


In [35]:
chi_data

target,A,B
feature1,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,1,0
3,4,4


3. Variance Threshold

This method calculate varaince of each feature independently and remove the feature with less variance(because it dosenot have high impact on target value).

It have disadvantage like sometimes it doesnot work well with categorical value and discreate value.Also it calculate importance based on that feature only dont calculate combined effect or its effect with target value. Also try to avoid it with skewed data, because it have less variance but crucial.

In [36]:
df

Unnamed: 0,feature1,feature2,feature3,target
0,3,10,200,A
1,1,30,500,B
2,3,20,200,B
3,3,10,200,A
4,1,20,200,B
5,1,20,200,A
6,3,20,200,B
7,2,20,500,A
8,3,10,500,B
9,3,10,500,B


In [37]:
class VarianceThreshold:
    def __init__(self, threshold) -> None:
        '''Take one argument threshold: float'''
        self.threshold = threshold

    def __calculate_variance(self, X):
        '''Calculate Variance for input feature'''
        if not hasattr(X, 'columns') or not hasattr(X, "__getitem__"):
            raise TypeError("X must be a Dataframe")
        return X.var(axis=0)

    def __variance_threshold(self, variances):
        '''Select features based on threshold'''
        return [var for var in variances.items() if var[1] > self.threshold]

    def fit(self, X):
        '''This function takes features as arument and call calculate
        variance function and variance threshold to find best features
        and retunn it as tuple'''
        if not hasattr(X, "columns") or not hasattr(X, "__getitem__"):
            raise ValueError(
                "Input X must be a DataFrame-like object with columns.")
        self.X = X
        variances = self.__calculate_variance(X)
        selected_features = self.__variance_threshold(variances)
        return selected_features

In [38]:
var_treshold = VarianceThreshold(1)
var_treshold.fit(X)

[('feature2', 38.63636363636363), ('feature3', 24545.454545454544)]

**Wrapper Method**

Evaluate on a one specific machine learning algorithm to find optimal features.

1. Forward Selection
2. Backward Selection
3. RFE

Forward Selection

In [39]:
from sklearn.datasets import load_diabetes
data = load_diabetes()

In [40]:
data

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [41]:
data.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [42]:
features = pd.DataFrame(data['data'],columns=data['feature_names'])
target = pd.Series(data['target'],name='diabetes')

In [43]:
features.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [44]:
target.head()

0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: diabetes, dtype: float64

Function to find best features using forward feature selection

Steps:<br>
1. initialize empty feature set and a model
2. select one feature and train model
3. select the feature that give high performance
4. repeat the step 2 and 3 until the criteria met(max no of featuers we want, adding new features does not increase performance)
5. return the selected features

In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

Function for forward feature selection

In [None]:
def forward_feature_selection(X, y):
    model = LinearRegression()
    features = list(X.columns)
    selected_feature = []
    high_score = 0
    while len(features) > 0:
        train_features = list(selected_feature)
        scores = []
        for feature in features:
            train_features.append(feature)
            cv = cross_val_score(estimator=model, X=X[train_features], y=y)
            scores.append((feature, cv.mean()))
            train_features.pop()
        best_feature = sorted(scores, key=lambda x: x[1], reverse=True)[0]
        if high_score + 0.005 < best_feature[1]:
            selected_feature.append(best_feature[0])
            features.remove(best_feature[0])
            high_score = best_feature[1]
        else:
            break
    return selected_feature

In [108]:
forward_feature_selection(features,target)

['bmi', 's5', 'bp', 's3', 'sex']

In [85]:
features.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')