In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets , linear_model
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectPercentile
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
warnings.filterwarnings(action="ignore", module="scipy",
message="^internal gelsd")

# **Thresholding Numerical Feature Variance**

In [None]:
# Load the iris dataset and split it into a feature matrix and a target vector
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Selector that keeps only the features whose variance exceeds 0.5
thresholder = VarianceThreshold(threshold=0.5)

# Fit the selector and keep just the high-variance columns
features_high_variance = thresholder.fit_transform(features)

# Preview the first three rows of the reduced matrix
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [None]:
# Fit the thresholder on the feature matrix, then show the per-feature variances
thresholder.fit(features)
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

**Variance Thresholding (VT)**
$$
\text{Var}(X_j) = \frac{1}{n} \sum_{i=1}^{n} (x_{ij} - \bar{x}_j)^2
$$

Where:

- **X<sub>j</sub>**: the j-th feature (i.e., a column in your dataset)
- **x<sub>ij</sub>**: the value of the j-th feature for the i-th sample (i.e., a single cell in the column)
- **x̄<sub>j</sub>**: the mean value of the j-th feature (i.e., average of that column)
- **n**: the total number of samples (i.e., number of rows)



**If the features have been standardized (to mean zero and unit variance), variance
thresholding will not work: every feature then has a variance of 1, so no threshold can tell them apart.**

# **Thresholding Binary Feature Variance**

In [None]:
# Toy binary matrix: columns 0 and 1 take the same value in 4 of 5 rows,
# column 2 is more evenly mixed
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# Threshold set to the Bernoulli variance p * (1 - p) evaluated at p = 0.75
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

For binary features (values are 0 or 1), the variance is calculated as:

$$
\text{Var}(X_j) = p_j(1 - p_j)
$$

Where:

- **p<sub>j</sub>** is the proportion of ones (1s) in feature **X<sub>j</sub>**

A feature is kept if:

$$
\text{Var}(X_j) = p_j(1 - p_j) > \text{threshold}
$$


# **Handling Highly Correlated Features**

In [None]:
# Toy feature matrix in which columns 0 and 1 are almost identical
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])

# Convert feature matrix into DataFrame
dataframe = pd.DataFrame(features)

# Absolute pairwise correlations between the columns
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle so every pair of columns is examined once
# (the diagonal and the mirrored lower triangle become NaN)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is redundant when its correlation with an earlier column exceeds 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Feature set with the redundant columns removed (the original frame is untouched)
dataframe_reduced = dataframe.drop(columns=to_drop)

# View the correlation matrix
corr_matrix

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,0.034503
2,0.0,0.034503,1.0


# **Removing Irrelevant Features for Classification**

In [None]:
# Load the iris data fresh so this cell does not depend on earlier reassignments
iris = datasets.load_iris()
target = iris.target

# Treat the measurements as categorical by casting them to integers
features = iris.data.astype(int)

# Keep the two features with the highest chi-squared statistics
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

# Report the feature count before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


**If the features are quantitative, compute the ANOVA F-value between each feature
and the target vector**

In [None]:
# ANOVA F-values are intended for quantitative features, so score the original
# continuous iris measurements rather than the integer-truncated copy that the
# chi-squared example leaves in `features`.
features_quantitative = iris.data

# Select two features with highest F-values
fvalue_selector = SelectKBest(f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features_quantitative, target)

# Show results
print("Original number of features:", features_quantitative.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


**Instead of selecting a specific number of features, we can also use `SelectPercentile` to select the top n percent of features:**

In [None]:
# ANOVA F-values are intended for quantitative features, so score the original
# continuous iris measurements rather than the integer-truncated copy that the
# chi-squared example leaves in `features`.
features_quantitative = iris.data

# Select top 75% of features with highest F-values
fvalue_selector = SelectPercentile(f_classif, percentile=75)
# The name `features_kbest` is kept for compatibility; it holds the percentile selection
features_kbest = fvalue_selector.fit_transform(features_quantitative, target)

# Show results
print("Original number of features:", features_quantitative.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


The Chi-square statistic is calculated as:

$$
\chi^2 = \sum_{i=1}^{n} \frac{(O_i - E_i)^2}{E_i}
$$

Where:

- **O<sub>i</sub>**: Observed frequency
- **E<sub>i</sub>**: Expected frequency


# **Recursively Eliminating Features**

In [None]:
# Synthetic regression problem: 10,000 samples and 100 features, of which only
# 2 are informative; only the feature matrix and target vector are returned
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Estimator that RFECV refits while it eliminates features
lr = linear_model.LinearRegression()

# Remove one feature per step, scoring each candidate subset by negative MSE
rfecv = RFECV(estimator=lr, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)

# Show the feature matrix reduced to the selected columns
rfecv.transform(features)

array([[ 0.00850799,  0.7031277 ],
       [-1.07500204,  2.56148527],
       [ 1.37940721, -1.77039484],
       ...,
       [-0.80331656, -1.60648007],
       [ 0.39508844, -1.34564911],
       [-0.55383035,  0.82880112]])

In [None]:
# Number of features the fitted RFECV selector kept
rfecv.n_features_

np.int64(2)

In [None]:
# Boolean mask over the feature columns: True marks a selected feature
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [None]:
# Rank of every feature, from best (1) to worst; the selected features share rank 1
rfecv.ranking_

array([22, 16, 91, 12, 52,  1, 96, 85, 37, 56, 82, 25, 44, 86, 51, 80,  4,
       95, 63, 84, 60,  3, 26, 89, 70, 61, 77, 88, 98, 19, 50, 39, 55, 53,
       92, 47, 99, 45, 28,  1, 41, 81, 90, 78, 74, 73, 64, 57, 93, 15, 72,
       32, 97, 48, 49, 66, 33, 43, 34, 59, 31, 68, 42, 14, 13, 46, 10, 65,
       36,  6, 58, 79, 20, 87,  2,  7, 23, 35, 38, 76,  8, 75, 17, 62, 67,
       69, 27, 71, 54, 29, 40,  9, 24, 18, 30,  5, 83, 94, 11, 21])

# **Quick Guide**

| Method                                  | Supervised? | Best For                                         | Use When                                                                  |
| --------------------------------------- | ----------- | ------------------------------------------------ | ------------------------------------------------------------------------- |
| **Variance Threshold**                  | ❌ No        | Removing features with little/no variation       | Your features are mostly numeric and you want to remove "useless" columns |
| **Binary Variance Threshold**           | ❌ No        | Removing binary features with low variability    | Your features are mostly 0s and 1s, and some rarely change                |
| **Correlation Thresholding**            | ❌ No        | Removing duplicated info due to high correlation | You want to remove redundant columns that provide the same information    |
| **SelectKBest (ANOVA, Chi2)**           | ✅ Yes       | Picking top features for classification tasks    | You want to rank features based on importance to the target class         |
| **RFE (Recursive Feature Elimination)** | ✅ Yes       | Model-driven, best subset selection              | You want the most important features selected by a specific model         |
