In [246]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [247]:
# Path built from the current working directory plus a relative suffix; the
# ".." components are kept as-is (no normalisation is applied).
# NOTE(review): presumably the output folder for this notebook - it is not
# used in the cells visible here, confirm against the rest of the notebook.
out_dir = os.path.join(
    os.getcwd(),
    "../../out/feature_selection",
)

In [248]:
# Synthetic regression problem with as many features as samples (100 each),
# of which only 5 are informative. The fixed random_state keeps the generated
# data identical across runs.
# `make_regression` is imported in the notebook's top import cell.
X, y = make_regression(
    n_samples=100,
    n_features=100,
    n_informative=5,
    n_targets=1,
    noise=0.1,
    random_state=42,
)

In [249]:
# Tabular view of the generated data: one column per feature (x0, x1, ...)
# plus the regression target. The number of feature columns is taken from X
# itself so this cell stays correct if the generated feature count changes.
df = pd.DataFrame(
    X,
    columns=[f"x{i}" for i in range(X.shape[1])],
).assign(target=y)

In [250]:

# Baseline: ordinary least squares on every feature, evaluated with 10-fold CV.
model = LinearRegression()

# The scorer reports negated RMSE, so the sign is flipped when printing to
# show a positive RMSE (lower is better).
mean_neg_rmse = cross_val_score(
    model, X, y, cv=10, scoring='neg_root_mean_squared_error'
).mean()
print(f"Score: {-mean_neg_rmse:.2f}")

Score: 17.60


In [251]:

# Recursive feature elimination keeps 5 features, then a fresh linear
# regression is fitted on the survivors. Both steps live in one Pipeline that
# is handed to cross_val_score as a single estimator.
feature_selector = RFE(LinearRegression(), n_features_to_select=5)
model = Pipeline([
    ('feature_selection', feature_selector),
    ('regression', LinearRegression()),
])

# The scorer reports negated RMSE, so the sign is flipped when printing.
mean_neg_rmse = cross_val_score(
    model, X, y, cv=10, scoring='neg_root_mean_squared_error'
).mean()
print(f"Score: {-mean_neg_rmse:.2f}")

Score: 0.09


## Inherent Feature Selection Property of Random Forest

Note: We use classification here, as the generated regression data is linear and therefore difficult for tree-based models.

In [259]:
def get_score_for_model(model):
    """Return the mean 10-fold CV accuracy of ``model`` as a two-decimal string.

    The notebook-level ``X`` and ``y`` are looked up at call time, so the score
    is always computed on the dataset most recently generated by the loop
    below.
    """
    mean_accuracy = cross_val_score(model, X, y, cv=10, scoring='accuracy').mean()
    # Fixed-point formatting so 0.9 prints as "0.90" (a bare ".2" precision
    # means two significant digits, not two decimals).
    return f"{mean_accuracy:.2f}"


# Compare a random forest, plain logistic regression, and logistic regression
# behind RFE across a grid of dataset sizes and feature counts.
for n_samples in [50, 100, 1000]:
    for n_features in [10, 100, 500]:
        X, y = make_classification(
            n_samples=n_samples,
            n_features=n_features,
            n_informative=5,
            n_classes=2,
            n_clusters_per_class=1,
            random_state=42,
        )
        print("******")
        print(f"{n_features=}, {n_samples=}")
        # Seed the forest so its score is reproducible between runs.
        print(f"RandomForestClassifier: {get_score_for_model(RandomForestClassifier(random_state=42))}")
        print(f"LogisticRegression: {get_score_for_model(LogisticRegression())}")
        # RFE is asked to keep 10 features; when n_features is 10 that is every
        # feature, so no selection actually happens for that configuration.
        fs_lr = Pipeline([
            ('feature_selection', RFE(LogisticRegression(), n_features_to_select=10)),
            ('regression', LogisticRegression())
        ])
        print(f"Feature Selection+LogisticRegression: {get_score_for_model(fs_lr)}")
        print("******")

******
n_features=10, n_samples=50
RandomForestClassifier: 0.82
LogisticRegression: 0.86
Feature Selection+LogisticRegression: 0.86
******
******
n_features=100, n_samples=50
RandomForestClassifier: 0.68
LogisticRegression: 0.62
Feature Selection+LogisticRegression: 0.78
******
******
n_features=500, n_samples=50
RandomForestClassifier: 0.78
LogisticRegression: 0.68
Feature Selection+LogisticRegression: 0.82
******
******
n_features=10, n_samples=100
RandomForestClassifier: 0.97
LogisticRegression: 0.97
Feature Selection+LogisticRegression: 0.97
******
******
n_features=100, n_samples=100
RandomForestClassifier: 0.96
LogisticRegression: 0.89
Feature Selection+LogisticRegression: 0.95
******
******
n_features=500, n_samples=100
RandomForestClassifier: 0.94
LogisticRegression: 0.87
Feature Selection+LogisticRegression: 0.9
******
******
n_features=10, n_samples=1000
RandomForestClassifier: 0.98
LogisticRegression: 0.98
Feature Selection+LogisticRegression: 0.98
******
******
n_features=1