In [6]:
import pickle
import sys
from collections import defaultdict

import numpy as np
from IPython.display import clear_output
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

from chofer_tda_datasets import Reininghaus2014ShrecReal, SciNe01EEGBottomTopFiltration
from chofer_tda_datasets.transforms import Hdf5GroupToDict, Hdf5GroupToDictSelector

def bendich_vectorization(dgm, num_dims=100):
    """Turn a persistence diagram into a fixed-length vector of lifetimes.

    Args:
        dgm: iterable of ``(birth, death)`` pairs.
        num_dims: length of the returned vector.

    Returns:
        A list holding the ``num_dims`` largest ``death - birth`` values in
        descending order, right-padded with ``0`` when the diagram has fewer
        points than ``num_dims``.
    """
    lifetimes = [death - birth for birth, death in dgm]
    lifetimes.sort(reverse=True)

    missing = num_dims - len(lifetimes)
    if missing > 0:
        # Short diagrams are zero-padded so every sample has the same length.
        lifetimes.extend([0] * missing)

    return lifetimes[:num_dims]


def svm_linear_standard_scaled_c_optimized(pca_num_dims=None):
    """Build the classification pipeline used by the experiments.

    The pipeline standard-scales the features, optionally projects them with
    PCA, and ends in a linear SVM whose ``C`` is chosen by a 3-fold grid
    search over ``[0.1, 1, 10, 100]`` (run with ``n_jobs=10``).

    Args:
        pca_num_dims: argument forwarded to ``PCA``; when ``None`` the PCA
            step is left out of the pipeline.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline`` with steps ``'scaler'``,
        optionally ``'pca'``, and ``'classifier'``.
    """
    tuned_svm = GridSearchCV(estimator=LinearSVC(),
                             param_grid={'C': [0.1, 1, 10, 100]},
                             cv=3,
                             n_jobs=10)

    steps = [('scaler', StandardScaler())]
    if pca_num_dims is not None:
        steps += [('pca', PCA(pca_num_dims))]
    steps += [('classifier', tuned_svm)]

    return Pipeline(steps)


def bendich_vectorization_generic_experiment(dataset,
                                             vectorization_callback,
                                             vectorization_dimensions,
                                             pca_num_dims=None):
    """Evaluate a linear SVM on vectorized samples for several vector lengths.

    Ten stratified 90/10 shuffle splits (fixed ``random_state=123``) are drawn
    once and reused for every entry of ``vectorization_dimensions``.

    Args:
        dataset: object exposing ``targets`` (used to stratify the splits) and
            iterable as ``(x, y)`` pairs; ``y`` must be convertible with
            ``int``.
        vectorization_callback: called as ``callback(x, num_dims=...)`` and
            expected to return a list of vectors for one sample.
        vectorization_dimensions: vector lengths to evaluate. Each sample is
            vectorized once at ``max(vectorization_dimensions)``; for a given
            ``dim`` every vector is cut to its first ``dim`` entries and the
            pieces are concatenated into one feature row.
        pca_num_dims: forwarded to ``svm_linear_standard_scaled_c_optimized``.

    Returns:
        ``dict`` mapping each dimension to a ``defaultdict(list)`` with
        ``'accuracies'`` (one test accuracy per split) and ``'classifier'``
        (a one-element list, see the note near the end of the body).
    """
    train_size = 0.9

    splitter = StratifiedShuffleSplit(n_splits=10,
                                      train_size=train_size,
                                      test_size=1-train_size,
                                      random_state=123)
    # The targets are passed as X as well as y; the index lists produced here
    # are later used to index the feature matrix and label array.
    train_test_splits = [(train_i.tolist(), test_i.tolist())
                         for train_i, test_i
                         in splitter.split(X=dataset.targets, y=dataset.targets)]

    # Loop-invariant: computed once instead of once per sample.
    max_dim = max(vectorization_dimensions)

    X = []
    y = []
    for i, (x_i, y_i) in enumerate(dataset):
        clear_output(wait=True)
        print('loading data ... ', i, end='\r')
        sys.stdout.flush()
        X.append(vectorization_callback(x_i, num_dims=max_dim))
        y.append(int(y_i))

    y = np.array(y)
    print('')

    return_value = {}
    for dim in vectorization_dimensions:
        print('dimension =', dim, ":")

        return_value_dim = defaultdict(list)
        return_value[dim] = return_value_dim

        # Truncate every per-sample vector to `dim` entries and flatten in a
        # single pass (the previous `sum(lists, [])` re-copied the growing
        # list on every addition).
        X_dim = np.array([[value for vec in sample for value in vec[:dim]]
                          for sample in X])

        for run_i, (train_i, test_i) in enumerate(train_test_splits):
            print('run', run_i, end='\r')

            classifier = svm_linear_standard_scaled_c_optimized(pca_num_dims=pca_num_dims)
            classifier.fit(X_dim[train_i], y[train_i])

            y_pred = classifier.predict(X_dim[test_i])
            return_value_dim['accuracies'].append(accuracy_score(y[test_i], y_pred))

        # NOTE(review): this runs once per dimension, after the split loop, so
        # only the classifier fitted on the last split is stored. Kept as in
        # the original; confirm whether one classifier per split was intended.
        return_value_dim['classifier'].append(classifier)
        print('')

    return return_value

  from ._conv import register_converters as _register_converters


In [10]:
# Open the SHREC (real) dataset. The data root is a hard-coded absolute path,
# so this cell only runs on a machine where that folder exists.
ds_shrec_real = Reininghaus2014ShrecReal(data_root_folder_path='/scratch1/chofer/jmlr2018_data/')
# Presumably turns each sample into a nested dict; the vectorization below
# indexes samples as input_dict[scale][dim] -- TODO confirm against the library.
ds_shrec_real.data_transforms = [Hdf5GroupToDict()]

def shrec_real_bendich_vectorization(input_dict, num_dims):
    """Vectorize one SHREC sample.

    Args:
        input_dict: mapping indexed as ``input_dict[str(scale)][dim]`` for
            ``scale`` in 1..10 and ``dim`` in ``'0'``, ``'1'``; each entry is
            passed to ``bendich_vectorization`` as a diagram.
        num_dims: length of every individual vector.

    Returns:
        A list of 20 vectors ordered scale-major: (1,'0'), (1,'1'), (2,'0'), ...
    """
    return [
        bendich_vectorization(input_dict[str(scale)][hom_dim], num_dims=num_dims)
        for scale in range(1, 11)
        for hom_dim in ('0', '1')
    ]

# Run the experiment on SHREC for six vector lengths; pca_num_dims is left at
# its default (None).
shrec_result = bendich_vectorization_generic_experiment(ds_shrec_real, 
                                                        shrec_real_bendich_vectorization, 
                                                        vectorization_dimensions=[5, 10, 20, 40, 80, 160])


# Persist the full result dict (accuracies and stored classifier per
# dimension) next to the notebook.
with open('./bendich_exp_shrec_real.pickle', 'bw') as f:
    pickle.dump(shrec_result, f)
    
# Report the accuracy averaged over the splits for each vector length.
for k, v in shrec_result.items():
    print('dimension', k, 'accuracy:', np.mean(v['accuracies']))


loading data ...  399
dimension = 5 :
run 9
dimension = 10 :
run 9
dimension = 20 :
run 9
dimension = 40 :
run 9
dimension = 80 :
run 9
dimension = 160 :
run 9
dimension 5 accuracy: 0.4125
dimension 10 accuracy: 0.30249999999999994
dimension 20 accuracy: 0.2825000000000001
dimension 40 accuracy: 0.28750000000000003
dimension 80 accuracy: 0.27999999999999997
dimension 160 accuracy: 0.2875


In [20]:
# Open the EEG dataset. The data root is a hard-coded absolute path, so this
# cell only runs on a machine where that folder exists.
ds_scine_eeg = SciNe01EEGBottomTopFiltration(data_root_folder_path='/scratch1/chofer/jmlr2018_data/')
# Sensor ids of the 'low_resolution_whole_head' configuration, as strings
# because they are used as dictionary keys below.
sensor_indices = [str(i) for i in ds_scine_eeg.sensor_configurations['low_resolution_whole_head']]
# The same sensor subset is requested for both the 'top' and 'bottom' entries.
selection = {'top': sensor_indices, 'bottom': sensor_indices}
# Presumably restricts each sample to the selected groups and returns a nested
# dict indexed as sample[filtration][sensor] -- TODO confirm against the library.
selector = Hdf5GroupToDictSelector(selection)

ds_scine_eeg.data_transforms = [selector]

def scine_bendich_vectorization(input_dict, num_dims):
    """Vectorize one EEG sample.

    Reads the module-level ``sensor_indices`` list.

    Args:
        input_dict: mapping indexed as ``input_dict[filt][sensor]`` for
            ``filt`` in ``'top'``, ``'bottom'`` and every sensor id in
            ``sensor_indices``; each entry is passed to
            ``bendich_vectorization`` as a diagram.
        num_dims: length of every individual vector.

    Returns:
        A list of vectors: all ``'top'`` sensors first, then all ``'bottom'``
        sensors, each group in ``sensor_indices`` order.
    """
    return [
        bendich_vectorization(input_dict[filtration][sensor], num_dims=num_dims)
        for filtration in ('top', 'bottom')
        for sensor in sensor_indices
    ]

# Run the experiment on the EEG dataset for six vector lengths, without PCA.
eeg_result = bendich_vectorization_generic_experiment(ds_scine_eeg, 
                                                   scine_bendich_vectorization, 
                                                   vectorization_dimensions=[5, 10, 20, 40, 80, 160],
                                                   pca_num_dims=None)

# Persist the full result dict next to the notebook; the reload cell further
# down reads this same file name.
with open('./bendich_exp_scitrecs_eeg.pickle', 'bw') as f:
    pickle.dump(eeg_result, f)
    
# Report the accuracy averaged over the splits for each vector length.
for k, v in eeg_result.items():
    print('dimension', k, 'accuracy:', np.mean(v['accuracies']))

loading data ...  31499
dimension = 5 :
run 9
dimension = 10 :
run 9
dimension = 20 :
run 9
dimension = 40 :
run 9
dimension = 80 :
run 9
dimension = 160 :
run 9
dimension 5 accuracy: 0.19006349206349207
dimension 10 accuracy: 0.19450793650793652
dimension 20 accuracy: 0.19520634920634922
dimension 40 accuracy: 0.19933333333333333
dimension 80 accuracy: 0.20984126984126986
dimension 160 accuracy: 0.2109206349206349


In [7]:
# Reload the EEG results written by the experiment cell above.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('./bendich_exp_scitrecs_eeg.pickle', 'br') as f:
    result = pickle.load(f)

In [5]:
# Print the vector length and the mean of its stored accuracies.
for k, v in result.items():
    print(k, np.mean(v['accuracies']))

5 0.19006349206349207
10 0.19450793650793652
20 0.19520634920634922
40 0.19933333333333333
80 0.20984126984126986
160 0.2109206349206349


In [None]:
# NOTE(review): this cell was left unfinished as `results[]`, which is a syntax
# error and names an undefined variable (the reloaded dict is `result`).
# Listing the available keys is the smallest valid completion; replace it with
# the lookup that was actually intended, or delete the cell.
sorted(result)