# Matrix Selection Test

In [97]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn.svm import SVC

In [2]:
ray.shutdown()

In [26]:
import importlib
importlib.reload(pipecaster.model_selection)

<module 'pipecaster.model_selection' from '/Users/john/trading/src/pipecaster/pipecaster/model_selection.py'>

In [1]:
import timeit
import multiprocessing
import ray
import numpy as np

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import sklearn.model_selection
import pipecaster.model_selection


# Reference estimator; the fixed random_state keeps every cross-validation run below comparable.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Synthetic classification data, seeded so that a fresh Restart & Run All sees the same X and y.
X, y = make_classification(n_samples=500,
                           n_features=400,
                           n_informative=200,
                           n_redundant=5,
                           n_repeated=5,
                           class_sep=1,
                           random_state=42)

# Baseline predictions from scikit-learn's cross_val_predict.
sklearn_predictions = sklearn.model_selection.cross_val_predict(clf, X, y, cv=5, n_jobs=1)
sklearn_auc = roc_auc_score(y, sklearn_predictions)

# Serial pipecaster run. The AUC is computed from pipecaster's own predictions so that
# the equality check against sklearn_auc actually exercises pipecaster's output.
pipecaster_predictions = pipecaster.model_selection.cross_val_predict(clf, X, y, cv=5, n_jobs=1)
pipecaster_auc = roc_auc_score(y, pipecaster_predictions)

assert np.array_equal(sklearn_predictions, pipecaster_predictions)
assert sklearn_auc == pipecaster_auc
assert pipecaster_auc > 0.5

# The serial-vs-parallel comparison is only attempted when more than one CPU is reported.
n_cpus = multiprocessing.cpu_count()
if n_cpus > 1:
    # Start Ray only when this process has not already initialised it.
    if not ray.is_initialized():
        ray.init()

    SETUP_CODE = 'import pipecaster.model_selection'

    # Time 10 serial runs (n_jobs = 1).
    TEST_CODE = 'pipecaster.model_selection.cross_val_predict(clf, X, y, cv = 5, n_jobs = 1)'
    t_serial = timeit.timeit(setup=SETUP_CODE,
                             stmt=TEST_CODE,
                             globals=locals(),
                             number=10)
    print(t_serial)

    # Time 10 parallel runs with one job per reported CPU.
    TEST_CODE = ('pipecaster.model_selection.cross_val_predict(clf, X, y, cv = 5, n_jobs = {})'
                 .format(n_cpus))
    t_parallel = timeit.timeit(setup=SETUP_CODE,
                               stmt=TEST_CODE,
                               globals=locals(),
                               number=10)
    print(t_parallel)

    # A slower parallel run is reported as a warning rather than a failure.
    if t_parallel > t_serial:
        import warnings
        warnings.warn('parallel not faster than serial')

    # Parallel predictions must match the scikit-learn baseline exactly.
    parallel_predictions = pipecaster.model_selection.cross_val_predict(clf, X, y, cv=5, n_jobs=n_cpus)
    parallel_auc = roc_auc_score(y, parallel_predictions)

    assert np.array_equal(sklearn_predictions, parallel_predictions)
    assert sklearn_auc == parallel_auc
    assert parallel_auc > 0.5

2020-11-18 15:13:31,839	INFO resource_spec.py:212 -- Starting Ray with 3.91 GiB memory available for workers and up to 1.95 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-11-18 15:13:32,227	INFO services.py:1148 -- View the Ray dashboard at [1m[32mlocalhost:8267[39m[22m


28.754230861
12.443095313


In [42]:
t_sklearn_1

0.9303939359997457

In [43]:
t_sklearn_all

2.9638689019993762

In [44]:
t_pipecaster_1

0.909355420000793

In [45]:
t_pipecaster_all

0.4500524299992321

In [71]:
class foobar:
    """Scratch class whose instances carry a constant attribute ``x``."""

    def __init__(self):
        self.x = 222

    def f(self):
        """Print this instance's ``x`` attribute."""
        print(self.x)

In [72]:
foobar().f()

222


In [64]:
ray.shutdown()

In [65]:
# Start a local Ray runtime only if this process has not already initialised one.
# Asking Ray directly avoids depending on which exception type ray.nodes() raises
# when Ray is not running.
if not ray.is_initialized():
    ray.init()

2020-11-18 10:54:37,975	INFO resource_spec.py:212 -- Starting Ray with 4.0 GiB memory available for workers and up to 2.0 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-11-18 10:54:38,249	INFO services.py:1148 -- View the Ray dashboard at [1m[32mlocalhost:8267[39m[22m


In [33]:
pipecaster_predictions = pipecaster.model_selection.cross_val_predict(clf, X, y, cv = 5, n_jobs = 5)

In [16]:
from sklearn.model_selection._split import check_cv
import pipecaster.model_selection
from pipecaster.utility import get_clone
from sklearn.preprocessing import LabelEncoder

# Settings reused by the manual cross-validation cells that follow.
groups, verbose, method = None, 0, 'predict'

# Turn the fold count into a splitter object, then materialise every
# (train indices, test indices) pair up front.
cv = check_cv(5, y, classifier=pipecaster.model_selection.is_classifier(clf))
splits = [(train_idx, test_idx) for train_idx, test_idx in cv.split(X, y, groups)]

In [17]:
splits_2 = list(cv.split(X, y, groups))

In [25]:
splits[4][1][:10]

array([397, 400, 401, 403, 404, 405, 406, 407, 408, 409])

In [24]:
splits_2[4][1][:10]

array([397, 400, 401, 403, 404, 405, 406, 407, 408, 409])

In [22]:
# Integer-encode y only when `method` is one of the three listed prediction methods
# and a target is actually provided.
encode = method in ('decision_function', 'predict_proba',
                    'predict_log_proba') and y is not None
if encode:
    y = np.asarray(y)
    if y.ndim == 1:
        # Single target column: one encoder for the whole vector.
        le = LabelEncoder()
        y = le.fit_transform(y)
    elif y.ndim == 2:
        # One independent encoder per column. The builtin `int` replaces the
        # `np.int` alias, which newer NumPy releases no longer provide.
        y_enc = np.zeros_like(y, dtype=int)
        for i_label in range(y.shape[1]):
            y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label])
        y = y_enc

In [23]:
# Put the inputs shared by every fold into Ray once and reuse the handles.
Xr = ray.put(X)
yr = ray.put(y)
fit_params = ray.put(None)

# Submit one remote fit-and-predict task per fold, each with its own clone of clf.
jobs = []
for fold in range(5):
    train_indices, test_indices = splits[fold]
    jobs.append(
        pipecaster.model_selection.ray_fit_and_predict.remote(
            ray.put(get_clone(clf)), Xr, yr,
            train_indices, test_indices,
            verbose, fit_params, method))

# Keep the individual handles under the names later cells use.
job1, job2, job3, job4, job5 = jobs

In [24]:
prediction_blocks = ray.get([job1, job2, job3, job4, job5])

In [31]:
predictions, indices = prediction_blocks[1]

In [33]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0])

TypeError: Singleton array array(ObjectID(ffffffffffffffffffffffff010000883e000000), dtype=object) cannot be considered a valid collection.

In [None]:
%debug


> [0;32m/Users/john/trading/src/venv/lib/python3.7/site-packages/sklearn/utils/validation.py[0m(196)[0;36m_num_samples[0;34m()[0m
[0;32m    194 [0;31m        [0;32mif[0m [0mlen[0m[0;34m([0m[0mx[0m[0;34m.[0m[0mshape[0m[0;34m)[0m [0;34m==[0m [0;36m0[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    195 [0;31m            raise TypeError("Singleton array %r cannot be considered"
[0m[0;32m--> 196 [0;31m                            " a valid collection." % x)
[0m[0;32m    197 [0;31m        [0;31m# Check that shape is returning an integer or default to len[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    198 [0;31m        [0;31m# Dask dataframes may not return numeric shape[0] value[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> y[:10]
*** NameError: name 'y' is not defined
ipdb> X[:10]
*** NameError: name 'X' is not defined
ipdb> x[:10]
*** IndexError: too many indices for array


RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.

In [None]:
t_sklearn_1

0.7250000000000001

In [35]:
pipecaster_auc

0.7250000000000001

In [38]:
import numpy as np


True

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import AUC
from sklearn.model_selection 


# NOTE(review): unfinished draft -- this cell cannot be parsed as written.
#   * `for in range(5):` below has no loop variable and no indented body.
#   * The body is indented eight spaces under the `def`, and the statements after the
#     `for` line sit at the same level as the `for` itself.
#   * `metaclassifier` is never referenced; layer2 hard-codes pc.MetaClassifier(SVC()).
#   * n_informative_Xs, n_weak_Xs and weak_noise_sd are read but are neither parameters
#     nor assigned inside the function.
#   * AUCs is created but nothing is appended to it and it is not returned.
#   * The pipeline is built with n_inputs = 10 while the data is generated with n_Xs
#     inputs (default 5) -- TODO confirm which is intended.
#   * The selection bookkeeping and the 'InputSelector ...' messages look carried over
#     from an input-selector test -- TODO confirm they belong here.
def test_metaclassifier(metaclassifier, n_Xs=5):
    
        AUCs = []
        
        for in range(5):
        
        Xs, y, X_types = make_multi_input_classification(n_classes = 2, 
                                            n_Xs=n_Xs, 
                                            n_informative_Xs=n_informative_Xs, 
                                            n_weak_Xs=n_weak_Xs,
                                            n_samples=1000, 
                                            n_features=500, 
                                            n_informative = 100,
                                            n_redundant=0,
                                            n_repeated=0,
                                            class_sep=2.0,
                                            weak_noise_sd=weak_noise_sd)
        # Three layers applied to every input slot: scaler, logistic regression, SVC-based meta-classifier.
        clf = pc.Pipeline(n_inputs = 10)
        layer0 = clf.get_next_layer()
        layer0[:] = StandardScaler()
        layer1 = clf.get_next_layer()
        layer1[:] = LogisticRegression()
        layer2 = clf.get_next_layer()
        layer2[:] = pc.MetaClassifier(SVC())
        
        
        Xs_t = clf.fit_transform(Xs, y)
        
        
        # An output of None is treated as "this input was not selected".
        Xs_selected = ['selected' if X is not None else 'not selected' for X in Xs_t]

        # Tally outcomes per input type, pairing each output with its X_types label.
        n_informative_hits, n_random_hits, n_weak_hits = 0, 0, 0
        for X, t in zip(Xs_selected, X_types):
            if X == 'selected' and t == 'informative':
                n_informative_hits +=1
            if X == 'not selected' and t == 'random':
                n_random_hits +=1
            if X == 'selected' and t == 'weak':
                n_weak_hits +=1

        print('InputSelector selected {} out of {} informative inputs'
              .format(n_informative_hits, n_informative_Xs))
        print('InputSelector filtered out {} out of {} random inputs'
              .format(n_random_hits, n_Xs - n_informative_Xs - n_weak_Xs))   
        print('InputSelector selected out {} out of {} weakly informative inputs'
              .format(n_weak_hits, n_weak_Xs))
        
        return n_informative_hits, n_random_hits, n_weak_hits

In [104]:
KNeighborsClassifier().fit(X,y)._more_tags()

{'multioutput': True}

In [117]:
from sklearn.model_selection import cross_val_predict

# Small random problem: 100 samples, 5 features, string labels 'a' / 'b'.
X = np.random.rand(100, 5)
y = np.random.choice(['a', 'b'], 100)

# Fit a k-nearest-neighbours classifier, then get 3-fold cross-validated predictions.
clf = KNeighborsClassifier()
clf.fit(X, y)
result = cross_val_predict(clf, X, y, cv=3)

In [115]:
result.shape

(100,)

In [119]:
from sklearn.model_selection._split import check_cv

cv = check_cv(3)

In [120]:
clf._estimator_type

'classifier'

In [96]:
from sklearn.model_selection import StratifiedKFold

# y holds continuous values here; the output recorded for this cell shows that
# StratifiedKFold rejects such a target with a ValueError.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0.1, 0.3, 1.2, 1.4])
skf = StratifiedKFold(n_splits=2)
skf.get_n_splits(X, y)

# Catch and display the error so the notebook keeps running past this cell.
try:
    for train_index, test_index in skf.split(X, y):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
except ValueError as err:
    print("ValueError:", err)

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.

In [None]:
from sklearn.model_selection import KFold

# Plain-Python form of the KFold example: interactive prompts and pasted output
# lines are removed so the cell is valid outside a prompt-stripping shell.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)
kf.get_n_splits(X)
print(kf)

# Show the train/test index arrays for each fold and slice the data accordingly.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [121]:
split_predictions = [np.random.rand(10,5) for x in range(5)]

In [126]:
result = np.concatenate(split_predictions, axis = 0)
result.shape

(50, 5)

In [142]:
class foobar:
    """Scratch class probing what a bare ``dir()`` call lists inside a method."""

    def f(self):
        """Print 'true' when the name 'f' is among the names ``dir()`` lists here."""
        names_in_scope = dir()
        if 'f' not in names_in_scope:
            return
        print('true')

fb = foobar()
fb.f()

In [130]:
x = 222
setattr(f, 'x', 222)

In [138]:
dir()[4]

'MultinomialNB'

In [68]:
# Keep the k best inputs (k = 5) out of 20: 5 informative, 5 weak, the rest random.
k = 5
input_selector = pc.SelectKBestInputs(score_func=f_classif, aggregator=np.sum, k=k)
n_informative_hits, n_random_hits, n_weak_hits = test_input_selector(input_selector,
                                                                     n_Xs = 20,
                                                                     n_informative_Xs = 5,
                                                                     n_weak_Xs = 5,
                                                                     weak_noise_sd = 25)
# TODO: assert on the hit counts once the expected values are settled. The
# unittest-style placeholder `self.assertEqual(a, b, 'message')` was dropped because
# no visible cell defines `self`, `a` or `b`, so it could only raise NameError.

InputSelector selected 5 out of 5 informative inputs
InputSelector filtered out 10 out of 10 random inputs
InputSelector selected out 0 out of 5 weakly informative inputs


In [69]:
# Same check with k = 10; the selector reads `k` so the two cannot drift apart.
k = 10
input_selector = pc.SelectKBestInputs(score_func=f_classif, aggregator=np.sum, k=k)
n_informative_hits, n_random_hits, n_weak_hits = test_input_selector(input_selector,
                                                                     n_Xs = 20,
                                                                     n_informative_Xs = 5,
                                                                     n_weak_Xs = 5,
                                                                     weak_noise_sd = 20)

InputSelector selected 5 out of 5 informative inputs
InputSelector filtered out 10 out of 10 random inputs
InputSelector selected out 5 out of 5 weakly informative inputs


In [7]:
Xs[slice_] = results
Xs

NameError: name 'results' is not defined

In [None]:
results

In [None]:
X = StandardScaler().fit_transform(np.arange(9).reshape(3,3), range(3))

In [None]:
X

In [None]:
x = pc.SelectKBestInputs(score_func=f_classif, aggregator=np.sum, k=3)
x

In [4]:
import pipecaster.synthetic_data as synthetic_data


ModuleNotFoundError: No module named 'pipecaster.synthetic_data'

In [None]:
n_Xs=20, n_informative_Xs=5, n_weak_Xs=5, weak_noise_sd=10, verbose = 0, seed = None):
        
        Xs, y, X_types = synthetic_data.make_multi_input_classification(n_classes = 2, 
                                            n_Xs=n_Xs, 
                                            n_informative_Xs=n_informative_Xs, 
                                            n_weak_Xs=n_weak_Xs,
                                            n_samples=1000, 
                                            n_features=100, 
                                            n_informative=75,
                                            n_redundant=0,
                                            n_repeated=0,
                                            class_sep=2.0,
                                            weak_noise_sd=weak_noise_sd,
                                            seed=seed)

In [None]:


# Assemble a six-input pipeline one layer at a time.
# NOTE(review): `pc` is not imported in any visible cell -- confirm the alias.
mcls = pc.MultiInputPipeline(n_inputs=6)

# Layer 0: inputs 0-4 get SimpleImputer, input 5 gets CountVectorizer.
layer0 = mcls.get_next_layer() # get new layer array of length n_inputs, all initialized to PassThrough()
layer0[:5] = SimpleImputer() 
layer0[5] = CountVectorizer()

# Layer 1: inputs 0-4 get StandardScaler, input 5 gets TfidfTransformer.
layer1 = mcls.get_next_layer()
layer1[:5] = StandardScaler()
layer1[5] = TfidfTransformer()

# Layer 2: every input gets SelectKBest(f_classif, k = 100).
layer2 = mcls.get_next_layer() 
layer2[:] = SelectKBest(f_classif, k = 100)

# Layer 3: only slots 0-4 are assigned the SelectKBestInputs selector (k=3);
# slot 5 keeps whatever get_next_layer() initialised it to.
layer3 = mcls.get_next_layer()
layer3[:5] = pc.SelectKBestInputs(score_func=f_classif, aggregator=np.sum, k=3)

