In [1]:
import sys

import numpy as np
from IPython.display import clear_output
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

from chofer_tda_datasets import Reininghaus2014ShrecReal, SciNe01EEGBottomTopFiltration
from chofer_tda_datasets.transforms import Hdf5GroupToDict, Hdf5GroupToDictSelector

def bendich_vectorization(dgm, num_dims=100):
    """Turn a persistence diagram into a fixed-length vector of lifetimes.

    Args:
        dgm: iterable of ``(birth, death)`` pairs.
        num_dims: length of the returned vector.

    Returns:
        A list holding the ``num_dims`` largest ``death - birth`` values in
        descending order, right-padded with ``0`` when the diagram has fewer
        than ``num_dims`` points.
    """
    lifetimes = [death - birth for birth, death in dgm]
    lifetimes.sort(reverse=True)

    # Pad short diagrams so every sample yields a vector of equal length.
    missing = num_dims - len(lifetimes)
    if missing > 0:
        lifetimes.extend([0] * missing)

    return lifetimes[:num_dims]


def svm_linear_standard_scaled_c_optimized(pca_num_dims=None):
    """Build the classification pipeline used by the experiments.

    The pipeline standardizes the features, optionally projects them with
    PCA, and finally fits a ``LinearSVC`` whose ``C`` is selected by a
    3-fold grid search over ``[0.1, 1, 10, 100]``.

    Args:
        pca_num_dims: forwarded as the first argument of ``PCA``; when
            ``None`` the PCA step is left out entirely.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline``.
    """
    grid = {'C': [0.1, 1, 10, 100]}
    clf = GridSearchCV(cv=3,
                       estimator=LinearSVC(),
                       param_grid=grid,
                       n_jobs=10)

    pipeline_members = [('scaler', StandardScaler())]

    if pca_num_dims is not None:
        pipeline_members.append(('pca', PCA(pca_num_dims)))

    # Pipeline steps must be (name, estimator) pairs; appending the bare
    # estimator makes Pipeline fail when it validates its steps.
    pipeline_members.append(('classifier', clf))

    return Pipeline(pipeline_members)


def bendich_vectorization_generic_experiment(dataset, 
                                             get_vectorization_callback, 
                                             num_dims, 
                                             pca_num_dims=None):
    """Run the vectorization + linear SVM experiment on 10 stratified splits.

    Every sample of ``dataset`` is vectorized once, then a fresh classifier
    from ``svm_linear_standard_scaled_c_optimized`` is fitted and scored on
    each of 10 stratified 90/10 train/test splits (fixed ``random_state``).

    Args:
        dataset: iterable yielding ``(x, y)`` pairs that also exposes the
            labels as ``dataset.targets`` (used to stratify the splits).
        get_vectorization_callback: callable invoked as
            ``callback(x, num_dims=num_dims)``; must return the feature
            vector of one sample.
        num_dims: forwarded to ``get_vectorization_callback``.
        pca_num_dims: forwarded to ``svm_linear_standard_scaled_c_optimized``;
            ``None`` disables the PCA step.

    Returns:
        List with one test accuracy per split.
    """
    train_size = 0.9

    splitter = StratifiedShuffleSplit(n_splits=10,
                                      train_size=train_size,
                                      test_size=1-train_size,
                                      random_state=123)
    # Only the labels matter for stratification, so they stand in for X too.
    train_test_splits = [(train_i.tolist(), test_i.tolist())
                         for train_i, test_i
                         in splitter.split(X=dataset.targets, y=dataset.targets)]

    acc = []

    X = []
    y = []

    for i, (x_i, y_i) in enumerate(dataset):
        clear_output(wait=True)
        print('loading data ... ', i, end='\r')
        sys.stdout.flush()
        X.append(get_vectorization_callback(x_i, num_dims=num_dims))
        y.append(int(y_i))

    X = np.array(X)
    y = np.array(y)
    print('')

    for run_i, (train_i, test_i) in enumerate(train_test_splits):
        print('run', run_i, end='\r')

        classifier = svm_linear_standard_scaled_c_optimized(pca_num_dims=pca_num_dims)
        classifier.fit(X[train_i], y[train_i])

        # Scaling and the optional PCA are pipeline steps, so predict()
        # applies them to the test data; no separate transform is needed.
        y_pred = classifier.predict(X[test_i])
        acc.append(accuracy_score(y[test_i], y_pred))

    return acc

  from ._conv import register_converters as _register_converters


In [4]:
# Open the SHREC (real) dataset and attach the transform applied to each
# sample; presumably it turns a sample's HDF5 group into a plain dict.
# NOTE(review): the data root is an absolute, machine-specific path —
# consider making it configurable.
ds_shrec_real = Reininghaus2014ShrecReal(data_root_folder_path='/scratch1/chofer/jmlr2018_data/')
ds_shrec_real.data_transforms = [Hdf5GroupToDict()]

def shrec_real_bendich_vectorization(input_dict, num_dims):
    """Concatenate the Bendich vectors of all diagrams of one SHREC sample.

    Args:
        input_dict: nested mapping indexed as ``input_dict[scale][dim]`` with
            ``scale`` in ``'1'`` .. ``'10'`` and ``dim`` in ``'0'``, ``'1'``;
            each entry is a diagram accepted by ``bendich_vectorization``.
        num_dims: vector length used for every single diagram.

    Returns:
        Flat list of the per-diagram vectors, ordered by scale first and by
        ``dim`` second.
    """
    # Lazy generator: each diagram is looked up right before it is vectorized.
    diagrams = (input_dict[str(scale)][dim_key]
                for scale in range(1, 11)
                for dim_key in ('0', '1'))

    features = []
    for diagram in diagrams:
        features.extend(bendich_vectorization(diagram, num_dims=num_dims))

    return features

# Run the experiment once per vectorization length; each entry of `results`
# is the list of per-split accuracies returned for that length.
results = []
for num_dim in [5, 10, 20, 40, 80, 160]:
    print('vect dim', num_dim)
    acc = bendich_vectorization_generic_experiment(ds_shrec_real,
                                                   shrec_real_bendich_vectorization,
                                                   num_dim)
    results.append(acc)
print('')
# Mean accuracy over the splits, one value per vectorization length.
[np.mean(r) for r in results]

loading data ...  399
run 6

Process ForkPoolWorker-361:


KeyboardInterrupt: 

In [2]:
# Open the EEG dataset.
# NOTE(review): the data root is an absolute, machine-specific path —
# consider making it configurable.
ds_scine_eeg = SciNe01EEGBottomTopFiltration(data_root_folder_path='/scratch1/chofer/jmlr2018_data/')
# Sensor ids of the 'low_resolution_whole_head' configuration, as strings
# because they are used as dictionary keys further down.
sensor_indices = [str(i) for i in ds_scine_eeg.sensor_configurations['low_resolution_whole_head']]
# Request the same sensors for both the 'top' and the 'bottom' filtration.
selection = {'top': sensor_indices, 'bottom': sensor_indices}
selector = Hdf5GroupToDictSelector(selection)

# Transform applied to every sample; presumably it extracts only the selected
# entries into a dict — confirm against Hdf5GroupToDictSelector.
ds_scine_eeg.data_transforms = [selector]

def scine_bendich_vectorization(input_dict, num_dims):
    """Concatenate the Bendich vectors of all diagrams of one EEG sample.

    Args:
        input_dict: nested mapping indexed as ``input_dict[filt][sensor]``
            with ``filt`` in ``'top'``, ``'bottom'`` and ``sensor`` taken from
            the module-level ``sensor_indices``; each entry is a diagram
            accepted by ``bendich_vectorization``.
        num_dims: vector length used for every single diagram.

    Returns:
        Flat list of the per-diagram vectors, ordered by filtration first and
        by sensor second.
    """
    # Lazy generator: each diagram is looked up right before it is vectorized.
    diagrams = (input_dict[filt_key][sensor_key]
                for filt_key in ('top', 'bottom')
                for sensor_key in sensor_indices)

    features = []
    for diagram in diagrams:
        features.extend(bendich_vectorization(diagram, num_dims=num_dims))

    return features

# Run the experiment once per vectorization length; each entry of `results`
# is the list of per-split accuracies returned for that length.
results = []
for num_dim in [5, 10, 20, 40, 80, 160]:
    print('vect dim', num_dim)
    # Keep a tenth of the concatenated feature length
    # (2 filtrations x number of sensors x num_dim values per diagram).
    pca_dims = int(len(sensor_indices)*2*num_dim/10)
    # The experiment function names this parameter `pca_num_dims`; passing it
    # as `pca_dims=` raises a TypeError.
    acc = bendich_vectorization_generic_experiment(ds_scine_eeg,
                                                   scine_bendich_vectorization,
                                                   num_dim,
                                                   pca_num_dims=pca_dims)
    results.append(acc)
print('')
# Mean accuracy over the splits, one value per vectorization length.
[np.mean(r) for r in results]

loading data ...  31499
run 9


[0.2462857142857143,
 0.2548888888888889,
 0.26730158730158726,
 0.2771111111111111,
 0.30165079365079367,
 0.29495238095238097]

In [4]:
# Mean accuracy per vectorization length.
# NOTE(review): depends on whichever experiment cell last assigned `results`.
[np.mean(r) for r in results]   

[0.2462857142857143,
 0.2548888888888889,
 0.26730158730158726,
 0.2771111111111111,
 0.30165079365079367,
 0.29495238095238097]