In [1]:
import random

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_classif, f_regression
from sklearn.preprocessing import StandardScaler

from pipecaster.pipeline import Pipeline
from pipecaster.input_selection import SelectKBestInputs
from pipecaster import synthetic_data

In [2]:
def _select_synthetic_regression(input_selector, n_informative_Xs=5, n_weak_Xs=0, n_random_Xs=0,
                                 weak_noise_sd=None, verbose=1, seed=None, **sklearn_params):
    """
    Run an input selector on synthetic multi-input regression data and tally its choices.

    Synthetic inputs are generated with
    ``synthetic_data.make_multi_input_regression``, passed through a two-layer
    Pipeline (StandardScaler in the first layer, ``input_selector`` in the
    second), and each input is counted as selected when its entry in the
    ``fit_transform`` output is not None.

    Parameters
    ----------
    input_selector :
        Object assigned to the full slice of the second pipeline layer.
    n_informative_Xs, n_weak_Xs, n_random_Xs : int
        Number of informative, weak, and random inputs to request.
    weak_noise_sd :
        Forwarded positionally to ``make_multi_input_regression``.
    verbose : int
        A three-line summary is printed when this is greater than 0.
    seed :
        Forwarded positionally to ``make_multi_input_regression``.
    **sklearn_params :
        Extra keyword arguments forwarded to ``make_multi_input_regression``.

    Returns
    -------
    tuple of int
        ``(n_informative_hits, n_random_hits, n_weak_hits)``: informative
        inputs that were selected, random inputs that were NOT selected, and
        weak inputs that were selected.
    """
    total_inputs = n_informative_Xs + n_weak_Xs + n_random_Xs
    Xs, y, X_types = synthetic_data.make_multi_input_regression(
        n_informative_Xs, n_weak_Xs, n_random_Xs, weak_noise_sd, seed, **sklearn_params)

    # Layers are requested and filled one at a time, in the original order.
    pipeline = Pipeline(n_inputs=total_inputs)
    scaling_layer = pipeline.get_next_layer()
    scaling_layer[:] = StandardScaler()
    selection_layer = pipeline.get_next_layer()
    selection_layer[:] = input_selector
    transformed = pipeline.fit_transform(Xs, y)

    # Pair each input's "was it kept?" flag with its ground-truth type.
    kept_flags = [X is not None for X in transformed]
    outcomes = list(zip(kept_flags, X_types))

    n_informative_hits = sum(1 for kept, kind in outcomes if kept and kind == 'informative')
    # A "hit" for a random input means it was correctly dropped.
    n_random_hits = sum(1 for kept, kind in outcomes if not kept and kind == 'random')
    n_weak_hits = sum(1 for kept, kind in outcomes if kept and kind == 'weak')

    if verbose > 0:
        n_random_total = total_inputs - n_informative_Xs - n_weak_Xs
        print('InputSelector selected {} out of {} informative inputs'
              .format(n_informative_hits, n_informative_Xs))
        print('InputSelector filtered out {} out of {} random inputs'
              .format(n_random_hits, n_random_total))
        print('InputSelector selected out {} out of {} weakly informative inputs'
              .format(n_weak_hits, n_weak_Xs))

    return n_informative_hits, n_random_hits, n_weak_hits

In [88]:
# NOTE(review): this cell redefines _select_synthetic_regression, which an
# earlier cell of this notebook already defines with the same body; the
# definition executed last is the one in effect.
def _select_synthetic_regression(input_selector, n_informative_Xs=5, n_weak_Xs=0, n_random_Xs=0,
                                 weak_noise_sd=None, verbose=1, seed=None, **sklearn_params):
    """
    Fit an input selector on synthetic regression inputs and count what it kept.

    Data comes from ``synthetic_data.make_multi_input_regression``.  The inputs
    go through a Pipeline whose first layer is a StandardScaler and whose
    second layer is ``input_selector``; an input is treated as selected when
    the corresponding ``fit_transform`` output is not None.

    Parameters
    ----------
    input_selector :
        Object assigned to the full slice of the second pipeline layer.
    n_informative_Xs, n_weak_Xs, n_random_Xs : int
        Number of informative, weak, and random inputs to request.
    weak_noise_sd, seed :
        Forwarded positionally to ``make_multi_input_regression``.
    verbose : int
        When greater than 0, a three-line summary is printed.
    **sklearn_params :
        Extra keyword arguments forwarded to ``make_multi_input_regression``.

    Returns
    -------
    tuple of int
        ``(n_informative_hits, n_random_hits, n_weak_hits)`` -- selected
        informative inputs, rejected random inputs, selected weak inputs.
    """
    n_inputs_total = n_informative_Xs + n_weak_Xs + n_random_Xs
    Xs, y, X_types = synthetic_data.make_multi_input_regression(
        n_informative_Xs,
        n_weak_Xs,
        n_random_Xs,
        weak_noise_sd,
        seed,
        **sklearn_params)

    rgr_pipeline = Pipeline(n_inputs=n_inputs_total)
    scaler_layer = rgr_pipeline.get_next_layer()
    scaler_layer[:] = StandardScaler()
    selector_layer = rgr_pipeline.get_next_layer()
    selector_layer[:] = input_selector
    Xs_out = rgr_pipeline.fit_transform(Xs, y)

    n_informative_hits = 0
    n_random_hits = 0
    n_weak_hits = 0
    for X_out, X_type in zip(Xs_out, X_types):
        if X_out is not None:
            # Kept inputs only score when they carry signal.
            if X_type == 'informative':
                n_informative_hits += 1
            elif X_type == 'weak':
                n_weak_hits += 1
        elif X_type == 'random':
            # Dropped random inputs count as correct rejections.
            n_random_hits += 1

    if verbose > 0:
        summary = (
            ('InputSelector selected {} out of {} informative inputs',
             n_informative_hits, n_informative_Xs),
            ('InputSelector filtered out {} out of {} random inputs',
             n_random_hits, n_inputs_total - n_informative_Xs - n_weak_Xs),
            ('InputSelector selected out {} out of {} weakly informative inputs',
             n_weak_hits, n_weak_Xs),
        )
        for template, hits, total in summary:
            print(template.format(hits, total))

    return n_informative_hits, n_random_hits, n_weak_hits

def _test_weak_strong_rgr_input_discrimination(input_selector, n_weak=5, n_strong=5,
                                               weak_noise_sd=0.25, seed=None, **sklearn_params):
    """
    Check that a selector keeps only the strong inputs and rejects weak and random ones.

    As many random inputs as weak plus strong inputs are added, and the
    selection is evaluated with ``_select_synthetic_regression``.

    Parameters
    ----------
    input_selector :
        Selector under test, forwarded to ``_select_synthetic_regression``.
    n_weak, n_strong : int
        Number of weak and strong (informative) inputs.
    weak_noise_sd, seed, **sklearn_params :
        Forwarded to ``_select_synthetic_regression``.

    Returns
    -------
    bool
        True only if every strong input was selected, no weak input was
        selected, and every random input was filtered out.
    """
    n_random = n_weak + n_strong
    total_inputs = n_weak + n_strong + n_random
    informative_hits, random_hits, weak_hits = _select_synthetic_regression(
        input_selector,
        n_informative_Xs=n_strong,
        n_weak_Xs=n_weak,
        n_random_Xs=n_random,
        weak_noise_sd=weak_noise_sd,
        seed=seed,
        **sklearn_params)

    expected_random_hits = total_inputs - n_weak - n_strong
    any_mismatch = (informative_hits != n_strong
                    or weak_hits != 0
                    or random_hits != expected_random_hits)
    return not any_mismatch

def _test_weak_rgr_input_detection(input_selector, n_weak=5, n_strong=5,
                                   weak_noise_sd=0.25, seed=None, **sklearn_params):
    """
    Check that a selector keeps both weak and strong inputs while rejecting random ones.

    As many random inputs as weak plus strong inputs are added, and the
    selection is evaluated with ``_select_synthetic_regression``.

    Parameters
    ----------
    input_selector :
        Selector under test, forwarded to ``_select_synthetic_regression``.
    n_weak, n_strong : int
        Number of weak and strong (informative) inputs.
    weak_noise_sd, seed, **sklearn_params :
        Forwarded to ``_select_synthetic_regression``.

    Returns
    -------
    bool
        True only if every strong input and every weak input was selected
        and every random input was filtered out.
    """
    n_random = n_weak + n_strong
    total_inputs = n_weak + n_strong + n_random
    strong_selected, random_rejected, weak_selected = _select_synthetic_regression(
        input_selector,
        n_informative_Xs=n_strong,
        n_weak_Xs=n_weak,
        n_random_Xs=n_random,
        weak_noise_sd=weak_noise_sd,
        seed=seed,
        **sklearn_params)

    # Any single mismatch fails the check.
    if strong_selected != n_strong:
        return False
    if weak_selected != n_weak:
        return False
    if random_rejected != (total_inputs - n_weak - n_strong):
        return False
    return True

def test_SelectKBestInputs_weak_strong_rgr_input_discrimination(seed=42):
    """
    Assert that SelectKBestInputs picks the strong regression inputs over weak ones.

    Uses k = 5 strong and 5 weak inputs (weak_noise_sd=10), scores inputs
    with ``f_regression`` aggregated by ``np.mean``, and keeps the k best.

    Parameters
    ----------
    seed :
        Forwarded to ``_test_weak_strong_rgr_input_discrimination``.

    Raises
    ------
    AssertionError
        If the helper reports that selection did not match expectations.
    """
    # Original run note:
    # python3 test_input_selectors.py TestInputSelectors.test_SelectKBestInputs_weak_strong_rgr_input_discrimination
    k = 5
    sklearn_params = dict(n_targets=1, n_samples=2000, n_features=30, n_informative=20)

    selector = SelectKBestInputs(score_func=f_regression, aggregator=np.mean, k=k)
    passed = _test_weak_strong_rgr_input_discrimination(
        selector,
        n_weak=k,
        n_strong=k,
        weak_noise_sd=10,
        seed=seed,
        **sklearn_params)
    assert passed, 'SelectKBestInputs failed to discriminate between weak & strong regression input matrices'
    
def test_SelectKBestInputs_weak_cls_input_detection(seed=42):
    """
    Assert that SelectKBestInputs detects weak inputs alongside strong ones.

    With k = 10, half of the k inputs are weak (weak_noise_sd=0.5) and the
    rest are strong; inputs are scored with ``f_regression`` aggregated by
    ``np.mean`` and the k best are kept.

    NOTE(review): despite "cls" in the name, this test uses ``f_regression``
    and the regression helper ``_test_weak_rgr_input_detection``.  The name is
    kept so existing callers keep working.

    Parameters
    ----------
    seed :
        Forwarded to ``_test_weak_rgr_input_detection``.

    Raises
    ------
    AssertionError
        If the helper reports that selection did not match expectations.
    """
    k = 10
    sklearn_params = {'n_targets': 1,
                      'n_samples': 2000,
                      'n_features': 30,
                      'n_informative': 20}

    # Floor division keeps the split in exact integer arithmetic.
    n_weak = k // 2
    n_strong = k - n_weak

    input_selector = SelectKBestInputs(score_func=f_regression, aggregator=np.mean, k=k)
    passed = _test_weak_rgr_input_detection(input_selector, n_weak=n_weak,
                                            n_strong=n_strong, weak_noise_sd=0.5,
                                            seed=seed, **sklearn_params)
    assert passed, 'SelectKBestInputs failed to detect all weak input matrices'

In [96]:
# Run the weak-input detection test with seed=None instead of its default seed=42.
# NOTE(review): presumably this makes the synthetic data unseeded, so the printed
# counts may vary between runs -- confirm how synthetic_data handles seed=None.
test_SelectKBestInputs_weak_cls_input_detection(seed=None)

InputSelector selected 5 out of 5 informative inputs
InputSelector filtered out 10 out of 10 random inputs
InputSelector selected out 5 out of 5 weakly informative inputs
