# Hierarchical selector
Example of hierarchical cluster-based feature selection on a synthetic dataset.

In [7]:
import sys
sys.path.append("..")
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

from src.hierarchical_cluster_selector import HierachicalClusterSelector


In [13]:
# Single seed shared by data generation, the split, and the selector below.
SEED = 98234

# Synthetic binary classification problem: 500 samples x 20 features, of which
# 2 are informative and 4 are redundant. The redundant columns are what make
# the feature set correlated.
X, y = make_classification(
    random_state=SEED,
    n_classes=2,
    n_samples=500,
    n_features=20,
    n_redundant=4,
    n_informative=2,
)

# Hold out 33% of the rows for evaluation; seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=SEED,
    test_size=0.33,
)

In [14]:
# Baseline: logistic regression with default settings, trained on all features.
# `fit` returns the estimator itself, so construction and training are chained.
lr = LogisticRegression().fit(X_train, y_train)

# Score of the baseline on the held-out split (displayed as the cell output).
lr.score(X_test, y_test)

0.9090909090909091

In [15]:
# Same classifier as the baseline, but preceded by the hierarchical cluster
# feature selector with a fixed k=4 (seeded with SEED for repeatability).
clf = Pipeline(
    steps=[
        ("selector", HierachicalClusterSelector(random_state=SEED, k=4)),
        ("lr", LogisticRegression()),
    ]
)

# NOTE(review): the recorded held-out score for this fixed setting (~0.50) is
# far below the all-features baseline (~0.91); k and criterion are tuned by
# the grid search further down -- confirm this drop is expected.
clf.fit(X_train, y_train).score(X_test, y_test)

0.49696969696969695

In [16]:
# Boolean mask of the features kept by the fitted selector step.
# The step is looked up by its pipeline name rather than by position.
clf.named_steps['selector'].get_support()

array([ True,  True, False, False, False,  True, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False])

In [18]:
# Grid search over the selector's hyper-parameters: the number k passed to the
# selector and its `criterion`. Parameters are addressed as `<step>__<param>`
# on the pipeline. Cross-validation and scoring use GridSearchCV's defaults.
g = GridSearchCV(
    estimator=clf,
    param_grid={
        'selector__k': [3, 4, 5, 6],
        'selector__criterion': ['ward', 'single'],
    },
)
g.fit(X_train, y_train)

# Only the last expression of a cell is displayed, so a bare
# `g.best_estimator_` here would be evaluated and thrown away.
# Print the winning parameter combination explicitly instead.
print("Best parameters:", g.best_params_)

# Held-out score of the fitted search object (displayed as the cell output).
g.score(X_test, y_test)

0.9272727272727272