In [1]:
# Pandas is used for data manipulation
import pandas as pd
import numpy as np
from tpot import TPOTClassifier

# Read in data and display first 5 rows
z_data = pd.read_csv('esem_act_deact_zscore_groups.csv',sep=';',decimal='.')

#first, drop everything non-numeric ===========================================================
z_data = z_data.drop('scan', axis = 1)
z_data = z_data.drop('trial', axis = 1)
z_data = z_data.drop('response', axis = 1)


numLabels = 3 # num of different labels (comprehension and rest)
sizeTrainSet = 13 # num of participants used for training
sizeTestSet = 4 # num participants used for testing
numParticipants = sizeTrainSet + sizeTestSet

In [2]:
#group the data and get the labels after the grouping
#return the grouped data and the labels
def prepare(data,groupingColumns):
    grouped = data.groupby(groupingColumns)
    groupedAgg = grouped.aggregate(np.mean)
    labels = groupedAgg.index.get_level_values(level='task')

    return groupedAgg,labels

In [3]:
#split according to participants (first 12 and last 4)
#return all four sets (much like train_test_split from sklearn)
#TODO: introduce variation (see file coarseAverageParticipantSplit)

def split(features, labels, low, high):
    #first, drop everything non-numeric ===========================================================
    
    #=== creating training and validation set ======================================================
    # Saving feature names for later use
    feature_list = list(features.columns)

    # Convert to numpy array
    features = np.array(features)
    labels = np.array(labels)

    #get the rows excluding the last indexed (i.e., row with the index 587 is the last to include)
    training_features = features[0:low]
    training_features = np.array(training_features)
    testing_features = features[low:high]
    testing_features = np.array(testing_features)

    training_target = labels[0:low]
    training_target = np.array(training_target)
    testing_target = labels[low:high]
    testing_target = np.array(testing_target)
    #=== end creating training and validation set ===================================================
    
    return training_features, testing_features, training_target, testing_target

In [4]:
def findIndices(name):
    numGroups = int(name.split('_')[1])
    low = sizeTrainSet*numGroups*numLabels
    high = numParticipants*numGroups*numLabels
    return low, high

In [5]:
#define the columns to group by
groupingColumn1 = 'proband'
groupingColumn2 = 'task'
groupingColumn3 = ''

#define how many different variants of aggregation and aggregation levels
numVariationsPerAggLevel = 10
numAggLevels = 4
numAggregationVariants = numVariationsPerAggLevel * numAggLevels

#get the index of the starting column that contains the labels of the groupings
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)
    #=============================================================================================================
    
    folder = 'AllROIs/'
    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'

    #=============================================================================================================
    # run t_pot
    # TODO: extract method
    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,n_jobs=20)
    tpot.fit(training_features, training_target)
    print(tpot.score(testing_features, testing_target))
    tpot.export(folder + fileName)
    #=============================================================================================================    
        
    offset = offset + 1

Optimization Progress:  33%|███▎      | 40/120 [03:54<1:55:02, 86.28s/pipeline]

Generation 1 - Current best internal CV score: 0.9600000000000002


Optimization Progress:  50%|█████     | 60/120 [06:02<1:01:31, 61.53s/pipeline]

Generation 2 - Current best internal CV score: 0.9600000000000002


Optimization Progress:  69%|██████▉   | 83/120 [11:05<46:35, 75.57s/pipeline]  

Generation 3 - Current best internal CV score: 0.9600000000000002


Optimization Progress:  86%|████████▌ | 103/120 [11:10<10:39, 37.63s/pipeline]

Generation 4 - Current best internal CV score: 0.9755555555555556


                                                                              

Generation 5 - Current best internal CV score: 0.9755555555555556

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=0.1, dual=False, penalty=l2)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:07<02:27,  1.84s/pipeline]

Generation 1 - Current best internal CV score: 0.9511111111111111


Optimization Progress:  50%|█████     | 60/120 [00:57<08:23,  8.38s/pipeline]

Generation 2 - Current best internal CV score: 0.9511111111111111


Optimization Progress:  67%|██████▋   | 80/120 [05:44<1:00:04, 90.11s/pipeline]

Generation 3 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  84%|████████▍ | 101/120 [10:55<39:14, 123.91s/pipeline] 

Generation 4 - Current best internal CV score: 0.9622222222222222


                                                                               

Generation 5 - Current best internal CV score: 0.9622222222222222

Best pipeline: LinearSVC(input_matrix, C=5.0, dual=False, loss=squared_hinge, penalty=l1, tol=1e-05)
0.75


Optimization Progress:  33%|███▎      | 40/120 [01:57<52:39, 39.49s/pipeline]  

Generation 1 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  50%|█████     | 60/120 [02:00<19:36, 19.61s/pipeline]

Generation 2 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  67%|██████▋   | 80/120 [02:07<06:51, 10.29s/pipeline]

Generation 3 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  83%|████████▎ | 100/120 [02:14<01:53,  5.67s/pipeline]

Generation 4 - Current best internal CV score: 0.9622222222222222


                                                                              

Generation 5 - Current best internal CV score: 0.9733333333333334

Best pipeline: LogisticRegression(XGBClassifier(ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.1, min_samples_leaf=4, min_samples_split=7, n_estimators=100), learning_rate=0.001, max_depth=3, min_child_weight=17, n_estimators=100, nthread=1, subsample=0.6000000000000001), C=20.0, dual=False, penalty=l2)


  if diff:


0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:11<05:37,  4.22s/pipeline]

Generation 1 - Current best internal CV score: 0.96


Optimization Progress:  51%|█████     | 61/120 [05:13<1:03:36, 64.69s/pipeline]

Generation 2 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  68%|██████▊   | 81/120 [07:50<42:02, 64.68s/pipeline]  

Generation 3 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  84%|████████▍ | 101/120 [07:52<14:23, 45.44s/pipeline]

Generation 4 - Current best internal CV score: 0.9755555555555556


                                                                              

Generation 5 - Current best internal CV score: 0.9755555555555556

Best pipeline: LinearSVC(SelectFwe(MinMaxScaler(input_matrix), alpha=0.027), C=25.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.001)
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:13<02:26,  1.84s/pipeline]

Generation 1 - Current best internal CV score: 0.9755555555555556


Optimization Progress:  50%|█████     | 60/120 [00:26<02:07,  2.12s/pipeline]

Generation 2 - Current best internal CV score: 0.9755555555555556


Optimization Progress:  67%|██████▋   | 80/120 [00:37<01:31,  2.30s/pipeline]

Generation 3 - Current best internal CV score: 0.9755555555555556


Optimization Progress:  83%|████████▎ | 100/120 [00:48<00:41,  2.07s/pipeline]

Generation 4 - Current best internal CV score: 0.9755555555555556


                                                                              

Generation 5 - Current best internal CV score: 0.9888888888888889

Best pipeline: GradientBoostingClassifier(LogisticRegression(ZeroCount(input_matrix), C=10.0, dual=False, penalty=l2), learning_rate=1.0, max_depth=8, max_features=0.9500000000000001, min_samples_leaf=16, min_samples_split=16, n_estimators=100, subsample=1.0)
0.75


Optimization Progress:  33%|███▎      | 40/120 [01:42<36:53, 27.67s/pipeline]

Generation 1 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  50%|█████     | 60/120 [02:13<12:42, 12.71s/pipeline]

Generation 2 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  67%|██████▋   | 80/120 [02:47<09:20, 14.02s/pipeline]

Generation 3 - Current best internal CV score: 0.9755555555555556


Optimization Progress:  83%|████████▎ | 100/120 [04:45<13:01, 39.09s/pipeline]

Generation 4 - Current best internal CV score: 0.9755555555555556


                                                                              

Generation 5 - Current best internal CV score: 0.9755555555555556

Best pipeline: LogisticRegression(PCA(input_matrix, iterated_power=5, svd_solver=randomized), C=20.0, dual=False, penalty=l1)
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:30<16:03, 12.05s/pipeline]

Generation 1 - Current best internal CV score: 0.9377777777777778


Optimization Progress:  50%|█████     | 60/120 [03:11<52:13, 52.23s/pipeline]  

Generation 2 - Current best internal CV score: 0.9377777777777778


Optimization Progress:  67%|██████▋   | 80/120 [03:15<17:26, 26.17s/pipeline]

Generation 3 - Current best internal CV score: 0.9733333333333334


Optimization Progress:  84%|████████▍ | 101/120 [09:00<36:44, 116.02s/pipeline] 

Generation 4 - Current best internal CV score: 0.9733333333333334


                                                                               

Generation 5 - Current best internal CV score: 0.9733333333333334

Best pipeline: LinearSVC(GaussianNB(input_matrix), C=0.1, dual=False, loss=squared_hinge, penalty=l2, tol=0.001)
0.75


Optimization Progress:  33%|███▎      | 40/120 [01:18<15:05, 11.32s/pipeline]

Generation 1 - Current best internal CV score: 0.9333333333333333


Optimization Progress:  51%|█████     | 61/120 [06:22<1:33:04, 94.66s/pipeline]

Generation 2 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  68%|██████▊   | 81/120 [06:26<30:30, 46.93s/pipeline]  

Generation 3 - Current best internal CV score: 0.96


Optimization Progress:  84%|████████▍ | 101/120 [06:40<05:57, 18.80s/pipeline]

Generation 4 - Current best internal CV score: 0.96


                                                                              

Generation 5 - Current best internal CV score: 0.96

Best pipeline: ExtraTreesClassifier(LogisticRegression(input_matrix, C=20.0, dual=True, penalty=l2), bootstrap=False, criterion=gini, max_features=0.45, min_samples_leaf=7, min_samples_split=18, n_estimators=100)
0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:41<10:41,  8.02s/pipeline]

Generation 1 - Current best internal CV score: 0.9733333333333334


Optimization Progress:  50%|█████     | 60/120 [01:21<08:40,  8.67s/pipeline]

Generation 2 - Current best internal CV score: 0.9733333333333334


Optimization Progress:  67%|██████▋   | 80/120 [01:59<09:20, 14.02s/pipeline]

Generation 3 - Current best internal CV score: 0.9733333333333334


Optimization Progress:  83%|████████▎ | 100/120 [02:56<07:59, 23.97s/pipeline]

Generation 4 - Current best internal CV score: 0.9733333333333334


                                                                              

Generation 5 - Current best internal CV score: 0.9866666666666667

Best pipeline: LogisticRegression(MinMaxScaler(MinMaxScaler(OneHotEncoder(BernoulliNB(input_matrix, alpha=10.0, fit_prior=False), minimum_fraction=0.1, sparse=False))), C=25.0, dual=True, penalty=l2)
0.6666666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:16<06:47,  5.10s/pipeline]

Generation 1 - Current best internal CV score: 0.9733333333333334


Optimization Progress:  50%|█████     | 60/120 [00:34<04:41,  4.69s/pipeline]

Generation 2 - Current best internal CV score: 0.9733333333333334


Optimization Progress:  68%|██████▊   | 81/120 [05:34<1:00:44, 93.45s/pipeline]

Generation 3 - Current best internal CV score: 0.9733333333333334


Optimization Progress:  85%|████████▌ | 102/120 [10:35<14:20, 47.82s/pipeline] 

Generation 4 - Current best internal CV score: 0.9733333333333334


                                                                              

Generation 5 - Current best internal CV score: 0.9866666666666667

Best pipeline: LogisticRegression(Normalizer(GaussianNB(MinMaxScaler(input_matrix)), norm=max), C=0.5, dual=False, penalty=l2)
0.8333333333333334


Optimization Progress:  34%|███▍      | 41/120 [05:24<2:13:32, 101.42s/pipeline]

Generation 1 - Current best internal CV score: 0.9751515151515152


Optimization Progress:  52%|█████▏    | 62/120 [10:26<1:34:41, 97.96s/pipeline] 

Generation 2 - Current best internal CV score: 0.9751515151515152


Optimization Progress:  68%|██████▊   | 82/120 [14:16<1:26:58, 137.34s/pipeline]

Generation 3 - Current best internal CV score: 0.9751515151515152


Optimization Progress:  85%|████████▌ | 102/120 [15:14<25:23, 84.64s/pipeline]  

Generation 4 - Current best internal CV score: 0.9751515151515152


                                                                              

Generation 5 - Current best internal CV score: 0.9751515151515152

Best pipeline: LogisticRegression(GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=10, max_features=0.2, min_samples_leaf=5, min_samples_split=18, n_estimators=100, subsample=0.8500000000000001), C=20.0, dual=True, penalty=l2)
0.7083333333333334


Optimization Progress:  34%|███▍      | 41/120 [06:12<2:42:50, 123.67s/pipeline]

Generation 1 - Current best internal CV score: 0.9357575757575759


Optimization Progress:  51%|█████     | 61/120 [06:36<48:23, 49.21s/pipeline]   

Generation 2 - Current best internal CV score: 0.9436363636363636


Optimization Progress:  68%|██████▊   | 81/120 [07:00<20:04, 30.88s/pipeline]

Generation 3 - Current best internal CV score: 0.9436363636363636


Optimization Progress:  84%|████████▍ | 101/120 [07:47<07:40, 24.26s/pipeline]

Generation 4 - Current best internal CV score: 0.9503030303030304


                                                                              

Generation 5 - Current best internal CV score: 0.9503030303030304

Best pipeline: LinearSVC(XGBClassifier(input_matrix, learning_rate=0.1, max_depth=9, min_child_weight=13, n_estimators=100, nthread=1, subsample=1.0), C=10.0, dual=False, loss=squared_hinge, penalty=l2, tol=0.1)


  if diff:


0.7708333333333334


Optimization Progress:  33%|███▎      | 40/120 [03:38<1:45:53, 79.42s/pipeline]

Generation 1 - Current best internal CV score: 0.9181818181818182


Optimization Progress:  50%|█████     | 60/120 [03:50<28:58, 28.98s/pipeline]  

Generation 2 - Current best internal CV score: 0.9181818181818182


Optimization Progress:  67%|██████▋   | 80/120 [04:06<12:24, 18.62s/pipeline]

Generation 3 - Current best internal CV score: 0.9181818181818182


Optimization Progress:  83%|████████▎ | 100/120 [04:24<03:46, 11.35s/pipeline]

Generation 4 - Current best internal CV score: 0.9181818181818182


                                                                              

Generation 5 - Current best internal CV score: 0.9315151515151514

Best pipeline: LinearSVC(RFE(LogisticRegression(input_matrix, C=1.0, dual=True, penalty=l2), criterion=entropy, max_features=0.25, n_estimators=100, step=0.9000000000000001), C=0.001, dual=False, loss=squared_hinge, penalty=l2, tol=0.001)
0.75


Optimization Progress:  35%|███▌      | 42/120 [05:26<1:12:19, 55.64s/pipeline]

Generation 1 - Current best internal CV score: 0.9575757575757576


Optimization Progress:  52%|█████▏    | 62/120 [05:30<19:18, 19.97s/pipeline]  

Generation 2 - Current best internal CV score: 0.9575757575757576


Optimization Progress:  68%|██████▊   | 82/120 [05:34<06:24, 10.13s/pipeline]

Generation 3 - Current best internal CV score: 0.9575757575757576


Optimization Progress:  85%|████████▌ | 102/120 [05:51<01:35,  5.30s/pipeline]

Generation 4 - Current best internal CV score: 0.9696969696969697


                                                                              

Generation 5 - Current best internal CV score: 0.9696969696969697

Best pipeline: LinearSVC(SelectFwe(MinMaxScaler(input_matrix), alpha=0.015), C=25.0, dual=True, loss=squared_hinge, penalty=l2, tol=1e-05)
0.7291666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:46<23:26, 17.59s/pipeline]

Generation 1 - Current best internal CV score: 0.9309090909090909


Optimization Progress:  50%|█████     | 60/120 [00:57<06:08,  6.14s/pipeline]

Generation 2 - Current best internal CV score: 0.9309090909090909


Optimization Progress:  67%|██████▋   | 80/120 [01:07<03:17,  4.95s/pipeline]

Generation 3 - Current best internal CV score: 0.9369696969696971


Optimization Progress:  83%|████████▎ | 100/120 [02:08<01:58,  5.92s/pipeline]

Generation 4 - Current best internal CV score: 0.9369696969696971


                                                                              

Generation 5 - Current best internal CV score: 0.9369696969696971

Best pipeline: LogisticRegression(SelectFwe(input_matrix, alpha=0.015), C=0.5, dual=False, penalty=l2)
0.7916666666666666


Optimization Progress:  34%|███▍      | 41/120 [09:02<1:50:38, 84.03s/pipeline]

Generation 1 - Current best internal CV score: 0.9375757575757575


Optimization Progress:  51%|█████     | 61/120 [09:20<30:45, 31.28s/pipeline]  

Generation 2 - Current best internal CV score: 0.9375757575757575


Optimization Progress:  68%|██████▊   | 82/120 [14:22<1:06:53, 105.61s/pipeline]

Generation 3 - Current best internal CV score: 0.9436363636363637


Optimization Progress:  86%|████████▌ | 103/120 [19:24<40:14, 142.03s/pipeline] 

Generation 4 - Current best internal CV score: 0.9436363636363637


                                                                               

Generation 5 - Current best internal CV score: 0.9503030303030304

Best pipeline: GradientBoostingClassifier(LogisticRegression(GaussianNB(input_matrix), C=15.0, dual=False, penalty=l2), learning_rate=0.01, max_depth=6, max_features=0.25, min_samples_leaf=10, min_samples_split=5, n_estimators=100, subsample=0.7500000000000001)
0.7083333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:54<19:31, 14.64s/pipeline]

Generation 1 - Current best internal CV score: 0.9163636363636364


Optimization Progress:  50%|█████     | 60/120 [01:17<17:04, 17.07s/pipeline]

Generation 2 - Current best internal CV score: 0.9163636363636364


Optimization Progress:  67%|██████▋   | 80/120 [01:36<09:18, 13.97s/pipeline]

Generation 3 - Current best internal CV score: 0.9163636363636364


Optimization Progress:  83%|████████▎ | 100/120 [01:50<03:36, 10.83s/pipeline]

Generation 4 - Current best internal CV score: 0.9230303030303031


                                                                              

Generation 5 - Current best internal CV score: 0.9236363636363638

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=10.0, dual=False, penalty=l1)
0.8333333333333334


Optimization Progress:  34%|███▍      | 41/120 [05:15<2:07:47, 97.06s/pipeline]

Generation 1 - Current best internal CV score: 0.9127272727272727


Optimization Progress:  51%|█████     | 61/120 [05:41<37:48, 38.45s/pipeline]  

Generation 2 - Current best internal CV score: 0.9248484848484848


Optimization Progress:  68%|██████▊   | 81/120 [06:02<18:50, 28.99s/pipeline]

Generation 3 - Current best internal CV score: 0.9248484848484848


Optimization Progress:  85%|████████▌ | 102/120 [11:05<31:27, 104.85s/pipeline] 

Generation 4 - Current best internal CV score: 0.9321212121212122


                                                                               

Generation 5 - Current best internal CV score: 0.9321212121212122

Best pipeline: LogisticRegression(SelectFwe(input_matrix, alpha=0.008), C=0.1, dual=True, penalty=l2)
0.8541666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:51<28:59, 21.74s/pipeline] 

Generation 1 - Current best internal CV score: 0.9175757575757576


Optimization Progress:  50%|█████     | 60/120 [01:58<27:15, 27.26s/pipeline]

Generation 2 - Current best internal CV score: 0.9175757575757576


Optimization Progress:  67%|██████▋   | 80/120 [06:45<1:03:25, 95.13s/pipeline]

Generation 3 - Current best internal CV score: 0.9175757575757576


Optimization Progress:  84%|████████▍ | 101/120 [12:05<29:58, 94.64s/pipeline] 

Generation 4 - Current best internal CV score: 0.9242424242424242


                                                                              

Generation 5 - Current best internal CV score: 0.9242424242424242

Best pipeline: LinearSVC(SelectFwe(input_matrix, alpha=0.01), C=0.01, dual=True, loss=squared_hinge, penalty=l2, tol=0.0001)
0.7291666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:22<08:03,  6.05s/pipeline]

Generation 1 - Current best internal CV score: 0.9054545454545455


Optimization Progress:  50%|█████     | 60/120 [00:28<03:31,  3.53s/pipeline]

Generation 2 - Current best internal CV score: 0.9054545454545455


Optimization Progress:  67%|██████▋   | 80/120 [05:14<58:20, 87.50s/pipeline]  

Generation 3 - Current best internal CV score: 0.9054545454545455


Optimization Progress:  83%|████████▎ | 100/120 [05:23<14:35, 43.77s/pipeline]

Generation 4 - Current best internal CV score: 0.9375757575757577


                                                                              

Generation 5 - Current best internal CV score: 0.9375757575757577

Best pipeline: LogisticRegression(LogisticRegression(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.1, min_samples_leaf=20, min_samples_split=5, n_estimators=100), C=5.0, dual=False, penalty=l2), C=5.0, dual=False, penalty=l1)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:26<05:48,  4.36s/pipeline]

Generation 1 - Current best internal CV score: 0.9076923076923077


Optimization Progress:  50%|█████     | 60/120 [00:38<03:17,  3.29s/pipeline]

Generation 2 - Current best internal CV score: 0.9076923076923077


Optimization Progress:  68%|██████▊   | 81/120 [05:50<1:01:30, 94.62s/pipeline]

Generation 3 - Current best internal CV score: 0.9179487179487179


Optimization Progress:  84%|████████▍ | 101/120 [06:17<17:13, 54.39s/pipeline] 

Generation 4 - Current best internal CV score: 0.9179487179487179


                                                                              

Generation 5 - Current best internal CV score: 0.9179487179487179

Best pipeline: LogisticRegression(OneHotEncoder(BernoulliNB(input_matrix, alpha=0.001, fit_prior=True), minimum_fraction=0.2, sparse=False), C=25.0, dual=False, penalty=l1)
0.7333333333333333


Optimization Progress:  35%|███▌      | 42/120 [10:19<3:57:05, 182.38s/pipeline]

Generation 1 - Current best internal CV score: 0.8615384615384617


Optimization Progress:  52%|█████▏    | 62/120 [10:31<1:02:53, 65.06s/pipeline] 

Generation 2 - Current best internal CV score: 0.9025641025641026


Optimization Progress:  68%|██████▊   | 82/120 [11:46<35:56, 56.76s/pipeline]  

Generation 3 - Current best internal CV score: 0.9076923076923078


Optimization Progress:  85%|████████▌ | 102/120 [13:05<09:21, 31.18s/pipeline]

Generation 4 - Current best internal CV score: 0.9076923076923078


                                                                              

Generation 5 - Current best internal CV score: 0.9076923076923078

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=0.5, dual=False, penalty=l2)
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:15<06:22,  4.78s/pipeline]

Generation 1 - Current best internal CV score: 0.8717948717948717


Optimization Progress:  50%|█████     | 60/120 [00:24<03:37,  3.62s/pipeline]

Generation 2 - Current best internal CV score: 0.8820512820512819


Optimization Progress:  67%|██████▋   | 80/120 [00:37<01:37,  2.44s/pipeline]

Generation 3 - Current best internal CV score: 0.8820512820512821


Optimization Progress:  83%|████████▎ | 100/120 [00:47<00:53,  2.69s/pipeline]

Generation 4 - Current best internal CV score: 0.8820512820512821


                                                                              

Generation 5 - Current best internal CV score: 0.9025641025641026

Best pipeline: LogisticRegression(BernoulliNB(input_matrix, alpha=1.0, fit_prior=True), C=5.0, dual=False, penalty=l1)
0.6666666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:34<17:08, 12.86s/pipeline]

Generation 1 - Current best internal CV score: 0.8974358974358975


Optimization Progress:  51%|█████     | 61/120 [05:43<1:36:54, 98.55s/pipeline]

Generation 2 - Current best internal CV score: 0.8974358974358975


Optimization Progress:  68%|██████▊   | 82/120 [10:47<1:28:06, 139.11s/pipeline]

Generation 3 - Current best internal CV score: 0.9333333333333333


Optimization Progress:  87%|████████▋ | 104/120 [15:51<42:27, 159.24s/pipeline] 

Generation 4 - Current best internal CV score: 0.9333333333333333


                                                                               

Generation 5 - Current best internal CV score: 0.9333333333333333

Best pipeline: LogisticRegression(VarianceThreshold(input_matrix, threshold=0.1), C=1.0, dual=True, penalty=l2)
0.7166666666666667


Optimization Progress:  33%|███▎      | 40/120 [00:45<12:27,  9.34s/pipeline]

Generation 1 - Current best internal CV score: 0.9282051282051282


Optimization Progress:  50%|█████     | 60/120 [00:55<04:08,  4.15s/pipeline]

Generation 2 - Current best internal CV score: 0.9282051282051282


Optimization Progress:  68%|██████▊   | 81/120 [05:57<30:47, 47.36s/pipeline]

Generation 3 - Current best internal CV score: 0.9282051282051282


Optimization Progress:  84%|████████▍ | 101/120 [06:04<07:32, 23.80s/pipeline]

Generation 4 - Current best internal CV score: 0.9282051282051282


                                                                              

Generation 5 - Current best internal CV score: 0.9282051282051282

Best pipeline: LogisticRegression(input_matrix, C=20.0, dual=False, penalty=l1)
0.7666666666666667


Optimization Progress:  33%|███▎      | 40/120 [05:52<1:12:09, 54.12s/pipeline]

Generation 1 - Current best internal CV score: 0.8820512820512821


Optimization Progress:  50%|█████     | 60/120 [07:04<29:15, 29.25s/pipeline]  

Generation 2 - Current best internal CV score: 0.8871794871794872


Optimization Progress:  67%|██████▋   | 80/120 [11:32<36:18, 54.47s/pipeline]

Generation 3 - Current best internal CV score: 0.9128205128205129


Optimization Progress:  83%|████████▎ | 100/120 [13:39<10:36, 31.85s/pipeline]

Generation 4 - Current best internal CV score: 0.9128205128205129


                                                                              

Generation 5 - Current best internal CV score: 0.9128205128205129

Best pipeline: LogisticRegression(input_matrix, C=5.0, dual=True, penalty=l2)
0.7666666666666667


Optimization Progress:  33%|███▎      | 40/120 [00:19<03:39,  2.74s/pipeline]

Generation 1 - Current best internal CV score: 0.876923076923077


Optimization Progress:  50%|█████     | 60/120 [00:42<07:59,  8.00s/pipeline]

Generation 2 - Current best internal CV score: 0.876923076923077


Optimization Progress:  67%|██████▋   | 80/120 [01:17<05:59,  9.00s/pipeline]

Generation 3 - Current best internal CV score: 0.8871794871794872


Optimization Progress:  83%|████████▎ | 100/120 [01:44<04:09, 12.47s/pipeline]

Generation 4 - Current best internal CV score: 0.8923076923076924


                                                                              

Generation 5 - Current best internal CV score: 0.9025641025641026

Best pipeline: LogisticRegression(ExtraTreesClassifier(CombineDFs(input_matrix, DecisionTreeClassifier(RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.7500000000000001, min_samples_leaf=1, min_samples_split=15, n_estimators=100), criterion=gini, max_depth=8, min_samples_leaf=5, min_samples_split=12)), bootstrap=False, criterion=entropy, max_features=0.45, min_samples_leaf=12, min_samples_split=5, n_estimators=100), C=25.0, dual=True, penalty=l2)
0.7166666666666667


Optimization Progress:  33%|███▎      | 40/120 [01:54<34:03, 25.54s/pipeline] 

Generation 1 - Current best internal CV score: 0.9384615384615385


Optimization Progress:  50%|█████     | 60/120 [04:45<1:03:33, 63.56s/pipeline]

Generation 2 - Current best internal CV score: 0.9384615384615385


Optimization Progress:  68%|██████▊   | 81/120 [11:42<1:41:26, 156.07s/pipeline]

Generation 3 - Current best internal CV score: 0.9384615384615385


Optimization Progress:  84%|████████▍ | 101/120 [11:51<12:22, 39.05s/pipeline]  

Generation 4 - Current best internal CV score: 0.9384615384615385


                                                                              

Generation 5 - Current best internal CV score: 0.9384615384615385

Best pipeline: LinearSVC(input_matrix, C=0.5, dual=False, loss=squared_hinge, penalty=l1, tol=1e-05)
0.7333333333333333


Optimization Progress:  33%|███▎      | 40/120 [02:15<1:17:43, 58.29s/pipeline]

Generation 1 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  50%|█████     | 60/120 [02:56<34:30, 34.52s/pipeline]  

Generation 2 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  67%|██████▋   | 80/120 [07:28<1:05:43, 98.59s/pipeline]

Generation 3 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  83%|████████▎ | 100/120 [10:02<18:55, 56.78s/pipeline] 

Generation 4 - Current best internal CV score: 0.8923076923076924


                                                                               

Generation 5 - Current best internal CV score: 0.8974358974358975

Best pipeline: LinearSVC(input_matrix, C=20.0, dual=False, loss=squared_hinge, penalty=l2, tol=0.1)
0.75


Optimization Progress:  34%|███▍      | 41/120 [05:09<2:01:28, 92.26s/pipeline]

Generation 1 - Current best internal CV score: 0.9076923076923077


Optimization Progress:  51%|█████     | 61/120 [05:17<46:38, 47.44s/pipeline]  

Generation 2 - Current best internal CV score: 0.9076923076923077


Optimization Progress:  68%|██████▊   | 81/120 [06:26<19:32, 30.07s/pipeline]

Generation 3 - Current best internal CV score: 0.9076923076923077


Optimization Progress:  84%|████████▍ | 101/120 [09:18<20:55, 66.09s/pipeline]

Generation 4 - Current best internal CV score: 0.9076923076923077


                                                                              

Generation 5 - Current best internal CV score: 0.917948717948718

Best pipeline: LinearSVC(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.05, min_samples_leaf=7, min_samples_split=18, n_estimators=100), C=10.0, dual=True, loss=squared_hinge, penalty=l2, tol=1e-05)
0.7333333333333333


Optimization Progress:  33%|███▎      | 40/120 [06:32<2:24:30, 108.38s/pipeline]

Generation 1 - Current best internal CV score: 0.8435897435897436


Optimization Progress:  50%|█████     | 60/120 [09:11<1:24:31, 84.53s/pipeline] 

Generation 2 - Current best internal CV score: 0.8435897435897436


Optimization Progress:  67%|██████▋   | 80/120 [10:29<24:31, 36.79s/pipeline]  

Generation 3 - Current best internal CV score: 0.8435897435897436


Optimization Progress:  84%|████████▍ | 101/120 [15:41<35:15, 111.36s/pipeline] 

Generation 4 - Current best internal CV score: 0.8435897435897436


                                                                               

Generation 5 - Current best internal CV score: 0.8615384615384615

Best pipeline: LogisticRegression(GradientBoostingClassifier(ZeroCount(input_matrix), learning_rate=0.1, max_depth=9, max_features=0.25, min_samples_leaf=12, min_samples_split=10, n_estimators=100, subsample=0.8500000000000001), C=25.0, dual=False, penalty=l2)
0.7166666666666667


Optimization Progress:  33%|███▎      | 40/120 [04:48<2:31:14, 113.44s/pipeline]

Generation 1 - Current best internal CV score: 0.8538461538461538


Optimization Progress:  50%|█████     | 60/120 [06:13<1:04:02, 64.04s/pipeline] 

Generation 2 - Current best internal CV score: 0.8615384615384615


Optimization Progress:  67%|██████▋   | 80/120 [07:13<32:51, 49.29s/pipeline]  

Generation 3 - Current best internal CV score: 0.8615384615384615


Optimization Progress:  84%|████████▍ | 101/120 [12:15<19:40, 62.11s/pipeline]

Generation 4 - Current best internal CV score: 0.8615384615384615


                                                                              

Generation 5 - Current best internal CV score: 0.8615384615384615

Best pipeline: LogisticRegression(input_matrix, C=5.0, dual=False, penalty=l1)
0.775


Optimization Progress:  34%|███▍      | 41/120 [06:19<2:42:29, 123.41s/pipeline]

Generation 1 - Current best internal CV score: 0.8128205128205128


Optimization Progress:  51%|█████     | 61/120 [06:43<46:28, 47.26s/pipeline]   

Generation 2 - Current best internal CV score: 0.8282051282051283


Optimization Progress:  68%|██████▊   | 82/120 [11:44<24:40, 38.97s/pipeline]

Generation 3 - Current best internal CV score: 0.8282051282051283


Optimization Progress:  85%|████████▌ | 102/120 [12:10<09:19, 31.10s/pipeline]

Generation 4 - Current best internal CV score: 0.8282051282051283


                                                                              

Generation 5 - Current best internal CV score: 0.8333333333333334

Best pipeline: LogisticRegression(GradientBoostingClassifier(input_matrix, learning_rate=0.001, max_depth=4, max_features=0.1, min_samples_leaf=1, min_samples_split=10, n_estimators=100, subsample=0.25), C=0.5, dual=True, penalty=l2)
0.6833333333333333


Optimization Progress:  34%|███▍      | 41/120 [06:14<3:31:39, 160.76s/pipeline]

Generation 1 - Current best internal CV score: 0.8256410256410256


Optimization Progress:  51%|█████     | 61/120 [08:54<2:04:18, 126.42s/pipeline]

Generation 2 - Current best internal CV score: 0.8256410256410256


Optimization Progress:  68%|██████▊   | 81/120 [10:30<37:29, 57.69s/pipeline]   

Generation 3 - Current best internal CV score: 0.8307692307692308


Optimization Progress:  84%|████████▍ | 101/120 [11:57<17:10, 54.25s/pipeline]

Generation 4 - Current best internal CV score: 0.8487179487179487


                                                                              

Generation 5 - Current best internal CV score: 0.8538461538461538

Best pipeline: LinearSVC(RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.55, min_samples_leaf=9, min_samples_split=5, n_estimators=100), C=0.001, dual=True, loss=squared_hinge, penalty=l2, tol=0.001)
0.7


Optimization Progress:  33%|███▎      | 40/120 [01:56<50:31, 37.90s/pipeline]  

Generation 1 - Current best internal CV score: 0.8666666666666666


Optimization Progress:  50%|█████     | 60/120 [03:06<28:53, 28.90s/pipeline]

Generation 2 - Current best internal CV score: 0.8666666666666666


Optimization Progress:  67%|██████▋   | 80/120 [04:18<09:54, 14.87s/pipeline]

Generation 3 - Current best internal CV score: 0.8666666666666666


Optimization Progress:  84%|████████▍ | 101/120 [09:20<12:20, 38.97s/pipeline]

Generation 4 - Current best internal CV score: 0.8666666666666666


                                                                              

Generation 5 - Current best internal CV score: 0.8692307692307691

Best pipeline: GradientBoostingClassifier(LogisticRegression(input_matrix, C=0.5, dual=False, penalty=l1), learning_rate=0.1, max_depth=10, max_features=0.4, min_samples_leaf=12, min_samples_split=15, n_estimators=100, subsample=0.2)
0.7166666666666667


Optimization Progress:  33%|███▎      | 40/120 [00:41<06:33,  4.92s/pipeline]

Generation 1 - Current best internal CV score: 0.8384615384615385


Optimization Progress:  50%|█████     | 60/120 [01:10<06:36,  6.61s/pipeline]

Generation 2 - Current best internal CV score: 0.8384615384615385


Optimization Progress:  67%|██████▋   | 80/120 [01:40<07:28, 11.21s/pipeline]

Generation 3 - Current best internal CV score: 0.8384615384615385


Optimization Progress:  83%|████████▎ | 100/120 [02:34<03:09,  9.48s/pipeline]

Generation 4 - Current best internal CV score: 0.8384615384615385


                                                                              

Generation 5 - Current best internal CV score: 0.8461538461538461

Best pipeline: LinearSVC(input_matrix, C=0.001, dual=True, loss=squared_hinge, penalty=l2, tol=0.1)
0.7166666666666667


Optimization Progress:  33%|███▎      | 40/120 [05:59<2:10:08, 97.61s/pipeline]

Generation 1 - Current best internal CV score: 0.8461538461538461


Optimization Progress:  51%|█████     | 61/120 [11:10<2:18:47, 141.15s/pipeline]

Generation 2 - Current best internal CV score: 0.864102564102564


Optimization Progress:  68%|██████▊   | 82/120 [16:12<1:02:53, 99.31s/pipeline] 

Generation 3 - Current best internal CV score: 0.864102564102564


Optimization Progress:  85%|████████▌ | 102/120 [17:45<10:02, 33.49s/pipeline] 

Generation 4 - Current best internal CV score: 0.864102564102564


                                                                              

Generation 5 - Current best internal CV score: 0.864102564102564

Best pipeline: LogisticRegression(Normalizer(input_matrix, norm=l2), C=20.0, dual=False, penalty=l1)
0.7666666666666667


Optimization Progress:  33%|███▎      | 40/120 [07:23<2:41:53, 121.42s/pipeline]

Generation 1 - Current best internal CV score: 0.823076923076923


Optimization Progress:  50%|█████     | 60/120 [08:30<1:44:54, 104.90s/pipeline]

Generation 2 - Current best internal CV score: 0.8282051282051283


Optimization Progress:  67%|██████▋   | 80/120 [09:33<46:53, 70.34s/pipeline]   

Generation 3 - Current best internal CV score: 0.8282051282051283


Optimization Progress:  83%|████████▎ | 100/120 [10:30<17:08, 51.44s/pipeline]

Generation 4 - Current best internal CV score: 0.8333333333333334


                                                                              

Generation 5 - Current best internal CV score: 0.8358974358974358

Best pipeline: LogisticRegression(RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.1, min_samples_leaf=20, min_samples_split=4, n_estimators=100), C=25.0, dual=False, penalty=l1)
0.725


Optimization Progress:  34%|███▍      | 41/120 [07:16<1:32:02, 69.90s/pipeline]

Generation 1 - Current best internal CV score: 0.8282051282051281


Optimization Progress:  51%|█████     | 61/120 [09:30<1:02:39, 63.72s/pipeline]

Generation 2 - Current best internal CV score: 0.8384615384615385


Optimization Progress:  68%|██████▊   | 81/120 [09:52<15:39, 24.09s/pipeline]  

Generation 3 - Current best internal CV score: 0.8384615384615385


Optimization Progress:  84%|████████▍ | 101/120 [10:23<06:18, 19.93s/pipeline]

Generation 4 - Current best internal CV score: 0.8384615384615385


                                                                              

Generation 5 - Current best internal CV score: 0.8384615384615385

Best pipeline: LinearSVC(GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=10, max_features=0.2, min_samples_leaf=5, min_samples_split=6, n_estimators=100, subsample=0.15000000000000002), C=10.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.01)
0.725


Optimization Progress:  33%|███▎      | 40/120 [02:38<1:13:28, 55.11s/pipeline]

Generation 1 - Current best internal CV score: 0.8435897435897436


Optimization Progress:  51%|█████     | 61/120 [08:01<2:01:36, 123.67s/pipeline]

Generation 2 - Current best internal CV score: 0.8435897435897436


Optimization Progress:  68%|██████▊   | 81/120 [10:18<53:54, 82.93s/pipeline]   

Generation 3 - Current best internal CV score: 0.8435897435897436


Optimization Progress:  85%|████████▌ | 102/120 [15:19<25:43, 85.77s/pipeline]

Generation 4 - Current best internal CV score: 0.8435897435897436


                                                                              

Generation 5 - Current best internal CV score: 0.8435897435897436

Best pipeline: LogisticRegression(SelectFwe(input_matrix, alpha=0.005), C=5.0, dual=False, penalty=l2)
0.6916666666666667


In [6]:
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)

    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'
    #=============================================================================================================
    #do the learning
    # TODO: extract method
    with open(folder + fileName) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    # content = [x.strip() for x in content] 

    #  or 'exported_pipeline = ' not in line
    cleanedContent = []
    for line in content:
        if 'tpot_data' not in line and 'training_target, testing_target' not in line:
            cleanedContent.append(line)
    #         content.remove(line)
    
    fileNameCleaned = folder + 'cleaned_' + fileName
    with open(fileNameCleaned, 'w') as filehandle:  
        for line in cleanedContent:
            filehandle.write('%s\n' % line)
            
    %run -i $fileNameCleaned
    
    # print the accuracy
    # TODO: extract method
    # TODO: print all accurracy values to one file
    num_matches = 0;
    for a, b in zip(testing_target, results):
        if a == b:
            num_matches = num_matches + 1
        else:
            unmatched.append((a,b))
    print('Number of matches:',num_matches,'(of',testing_target.size,')')

    accuracy = num_matches/testing_target.size*100
    print('Accuary: ',accuracy)
    #=============================================================================================================
    
    #=============================================================================================================
    #collect all accuracy values to plot them
    collectAccuracies.append(accuracy)

    offset = offset + 1

Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 18 (of 24 )
Accuary:  75.0


  if diff:
  if diff:


Number of matches: 18 (of 24 )
Accuary:  75.0
Number of matches: 18 (of 24 )
Accuary:  75.0
Number of matches: 18 (of 24 )
Accuary:  75.0
Number of matches: 18 (of 24 )
Accuary:  75.0
Number of matches: 18 (of 24 )
Accuary:  75.0
Number of matches: 20 (of 24 )
Accuary:  83.33333333333334
Number of matches: 16 (of 24 )
Accuary:  66.66666666666666
Number of matches: 20 (of 24 )
Accuary:  83.33333333333334
Number of matches: 35 (of 48 )
Accuary:  72.91666666666666


  if diff:
  if diff:


Number of matches: 37 (of 48 )
Accuary:  77.08333333333334
Number of matches: 30 (of 48 )
Accuary:  62.5
Number of matches: 35 (of 48 )
Accuary:  72.91666666666666
Number of matches: 38 (of 48 )
Accuary:  79.16666666666666
Number of matches: 34 (of 48 )
Accuary:  70.83333333333334
Number of matches: 40 (of 48 )
Accuary:  83.33333333333334
Number of matches: 41 (of 48 )
Accuary:  85.41666666666666
Number of matches: 35 (of 48 )
Accuary:  72.91666666666666
Number of matches: 39 (of 48 )
Accuary:  81.25
Number of matches: 43 (of 60 )
Accuary:  71.66666666666667
Number of matches: 45 (of 60 )
Accuary:  75.0
Number of matches: 40 (of 60 )
Accuary:  66.66666666666666
Number of matches: 43 (of 60 )
Accuary:  71.66666666666667
Number of matches: 46 (of 60 )
Accuary:  76.66666666666667
Number of matches: 46 (of 60 )
Accuary:  76.66666666666667
Number of matches: 44 (of 60 )
Accuary:  73.33333333333333
Number of matches: 44 (of 60 )
Accuary:  73.33333333333333
Number of matches: 45 (of 60 )
Accu

In [7]:
print(len(unmatched))
countComprehensionRest = 0
countComprehensionSyntax = 0
countRestSyntax = 0
for x in unmatched:
    if x == (0,1) or x == (1,0):
        countComprehensionRest = countComprehensionRest + 1
    if x == (0,2) or x == (2,0):
        countComprehensionSyntax = countComprehensionSyntax + 1
    if x == (1,2) or x == (2,1):
        countRestSyntax = countRestSyntax + 1
print('Comprehension/Rest',countComprehensionRest)
print('Comprehension/Syntax',countComprehensionSyntax)
print('Rest/Syntax',countRestSyntax)

661
Comprehension/Rest 45
Comprehension/Syntax 179
Rest/Syntax 437


In [8]:
#input: alist: the list to split, wanted_parts: into how many parts it should be split
#returns: all split lists
def split_list(alist, wanted_parts=1):
    length = len(alist)
    return [ alist[i*length // wanted_parts: (i+1)*length // wanted_parts] 
             for i in range(wanted_parts) ]


In [9]:
list1, list2, list3, list4 = split_list(collectAccuracies,numAggLevels)
frames = [list1, list2, list3, list4]
collectedAccuracies = pd.DataFrame(frames).T

collectedAccuracies.columns = ['2_groups_allROIs','4_groups_allROIs','5_groups_allROIs','10_groups_allROIs']

collectedAccuracies.to_csv('triangulate_AllROIs.csv',sep=';',decimal='.', index=False)