In [1]:
# Pandas is used for data manipulation
import pandas as pd
import numpy as np
from tpot import TPOTClassifier

# Read in data and display first 5 rows
z_data = pd.read_csv('data_task_groups_roi_all.csv',sep=';',decimal='.')


   
# recode labels to numeric values so t_pot can handle them
# alphabetically: 0: C, 1: R, 2, S
z_data['task'] = z_data['task'].replace({'C': 0, 'R': 1, 'S': 2})

numLabels = 3 # num of different labels (comprehension and rest)
sizeTrainSet = 12 # num of participants used for training
sizeTestSet = 4 # num participants used for testing
numParticipants = sizeTrainSet + sizeTestSet

In [2]:
#first, drop everything non-numeric ===========================================================
z_data = z_data.drop('scan', axis = 1)
z_data = z_data.drop('trial', axis = 1)
z_data = z_data.drop('response', axis = 1)

In [4]:
#group the data and get the labels after the grouping
#return the grouped data and the labels
def prepare(data,groupingColumns):
    grouped = data.groupby(groupingColumns)
    groupedAgg = grouped.aggregate(np.mean)
    labels = groupedAgg.index.get_level_values(level='task')

    return groupedAgg,labels

In [5]:
#split according to participants (first 12 and last 4)
#return all four sets (much like train_test_split from sklearn)
#TODO: introduce variation (see file coarseAverageParticipantSplit)

def split(features, labels, low, high):
    #first, drop everything non-numeric ===========================================================
    
    #=== creating training and validation set ======================================================
    # Saving feature names for later use
    feature_list = list(features.columns)

    # Convert to numpy array
    features = np.array(features)
    labels = np.array(labels)

    #get the rows excluding the last indexed (i.e., row with the index 587 is the last to include)
    training_features = features[0:low]
    training_features = np.array(training_features)
    testing_features = features[low:high]
    testing_features = np.array(testing_features)

    training_target = labels[0:low]
    training_target = np.array(training_target)
    testing_target = labels[low:high]
    testing_target = np.array(testing_target)
    #=== end creating training and validation set ===================================================
    
    return training_features, testing_features, training_target, testing_target

In [6]:
def findIndices(name):
    numGroups = int(name.split('_')[1])
    low = sizeTrainSet*numGroups*numLabels
    high = numParticipants*numGroups*numLabels
    return low, high

In [7]:
#define the columns to group by
groupingColumn1 = 'proband'
groupingColumn2 = 'task'
groupingColumn3 = ''

#define how many different variants of aggregation and aggregation levels
numVariationsPerAggLevel = 10
numAggLevels = 40
numAggregationVariants = numVariationsPerAggLevel * numAggLevels

#get the index of the starting column that contains the labels of the groupings
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)
    #=============================================================================================================
    
    folder = 'allROIS/'
    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'

    #=============================================================================================================
    # run t_pot
    # TODO: extract method
    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,n_jobs=20)
    tpot.fit(training_features, training_target)
    print(tpot.score(testing_features, testing_target))
    tpot.export(folder + fileName)
    #=============================================================================================================    
        
    offset = offset + 1

Optimization Progress:  33%|███▎      | 40/120 [00:07<01:49,  1.37s/pipeline]

Generation 1 - Current best internal CV score: 0.9566666666666667


Optimization Progress:  50%|█████     | 60/120 [00:10<01:00,  1.01s/pipeline]

Generation 2 - Current best internal CV score: 0.9566666666666667


Optimization Progress:  68%|██████▊   | 81/120 [05:13<59:08, 90.99s/pipeline]  

Generation 3 - Current best internal CV score: 0.9566666666666667


Optimization Progress:  84%|████████▍ | 101/120 [05:18<14:28, 45.68s/pipeline]

Generation 4 - Current best internal CV score: 0.9566666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.96

Best pipeline: GaussianNB(Normalizer(input_matrix, norm=l2))
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [02:42<34:29, 25.87s/pipeline]  

Generation 1 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  50%|█████     | 60/120 [02:49<13:10, 13.18s/pipeline]

Generation 2 - Current best internal CV score: 0.96


Optimization Progress:  67%|██████▋   | 80/120 [03:20<07:20, 11.02s/pipeline]

Generation 3 - Current best internal CV score: 0.96


Optimization Progress:  83%|████████▎ | 100/120 [03:54<03:23, 10.18s/pipeline]

Generation 4 - Current best internal CV score: 0.96


                                                                              

Generation 5 - Current best internal CV score: 0.96

Best pipeline: BernoulliNB(input_matrix, alpha=1.0, fit_prior=True)
1.0


Optimization Progress:  33%|███▎      | 40/120 [00:15<03:11,  2.40s/pipeline]

Generation 1 - Current best internal CV score: 0.9266666666666665


Optimization Progress:  50%|█████     | 60/120 [02:00<11:32, 11.55s/pipeline]

Generation 2 - Current best internal CV score: 0.9266666666666665


Optimization Progress:  67%|██████▋   | 80/120 [02:08<04:16,  6.42s/pipeline]

Generation 3 - Current best internal CV score: 0.9266666666666665


Optimization Progress:  83%|████████▎ | 100/120 [02:14<01:20,  4.02s/pipeline]

Generation 4 - Current best internal CV score: 0.9266666666666665


                                                                              

Generation 5 - Current best internal CV score: 0.93

Best pipeline: GaussianNB(CombineDFs(Normalizer(input_matrix, norm=l1), input_matrix))
0.9583333333333334


Optimization Progress:  34%|███▍      | 41/120 [08:32<4:14:33, 193.34s/pipeline]

Generation 1 - Current best internal CV score: 0.9033333333333333


Optimization Progress:  51%|█████     | 61/120 [09:02<1:41:51, 103.59s/pipeline]

Generation 2 - Current best internal CV score: 0.9066666666666666


Optimization Progress:  68%|██████▊   | 82/120 [14:03<1:29:17, 140.98s/pipeline]

Generation 3 - Current best internal CV score: 0.9066666666666666


Optimization Progress:  85%|████████▌ | 102/120 [14:09<20:58, 69.90s/pipeline]  

Generation 4 - Current best internal CV score: 0.9066666666666666


                                                                              

Generation 5 - Current best internal CV score: 0.9066666666666666

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.8, min_samples_leaf=3, min_samples_split=11, n_estimators=100)
0.875


Optimization Progress:  33%|███▎      | 40/120 [00:11<03:57,  2.97s/pipeline]

Generation 1 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  51%|█████     | 61/120 [05:13<1:30:08, 91.67s/pipeline]

Generation 2 - Current best internal CV score: 0.9566666666666667


Optimization Progress:  68%|██████▊   | 81/120 [05:24<31:02, 47.76s/pipeline]  

Generation 3 - Current best internal CV score: 0.9566666666666667


Optimization Progress:  84%|████████▍ | 101/120 [05:32<05:24, 17.07s/pipeline]

Generation 4 - Current best internal CV score: 0.9566666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9566666666666667

Best pipeline: BernoulliNB(input_matrix, alpha=0.1, fit_prior=False)
0.9583333333333334


Optimization Progress:  33%|███▎      | 40/120 [03:04<1:19:48, 59.85s/pipeline]

Generation 1 - Current best internal CV score: 0.9433333333333334


Optimization Progress:  52%|█████▎    | 63/120 [08:09<1:54:22, 120.40s/pipeline]

Generation 2 - Current best internal CV score: 0.9433333333333334


Optimization Progress:  70%|███████   | 84/120 [13:17<1:20:05, 133.49s/pipeline]

Generation 3 - Current best internal CV score: 0.9433333333333334


Optimization Progress:  87%|████████▋ | 104/120 [14:28<22:43, 85.22s/pipeline]  

Generation 4 - Current best internal CV score: 0.9433333333333334


                                                                              

Generation 5 - Current best internal CV score: 0.9433333333333334

Best pipeline: GaussianNB(input_matrix)
1.0


Optimization Progress:  33%|███▎      | 40/120 [01:22<29:09, 21.87s/pipeline]

Generation 1 - Current best internal CV score: 0.9433333333333334


Optimization Progress:  50%|█████     | 60/120 [02:18<27:06, 27.12s/pipeline]

Generation 2 - Current best internal CV score: 0.9433333333333334


Optimization Progress:  67%|██████▋   | 80/120 [03:05<11:55, 17.90s/pipeline]

Generation 3 - Current best internal CV score: 0.9433333333333334


Optimization Progress:  83%|████████▎ | 100/120 [03:13<03:06,  9.34s/pipeline]

Generation 4 - Current best internal CV score: 0.9433333333333334


                                                                              

Generation 5 - Current best internal CV score: 0.9433333333333334

Best pipeline: GaussianNB(input_matrix)
0.9583333333333334


Optimization Progress:  33%|███▎      | 40/120 [02:31<1:36:55, 72.70s/pipeline] 

Generation 1 - Current best internal CV score: 0.9


Optimization Progress:  50%|█████     | 60/120 [02:34<25:09, 25.16s/pipeline]  

Generation 2 - Current best internal CV score: 0.9


Optimization Progress:  67%|██████▋   | 80/120 [02:58<16:42, 25.06s/pipeline]

Generation 3 - Current best internal CV score: 0.9


Optimization Progress:  83%|████████▎ | 100/120 [03:16<04:58, 14.93s/pipeline]

Generation 4 - Current best internal CV score: 0.9


                                                                              

Generation 5 - Current best internal CV score: 0.9066666666666666

Best pipeline: ExtraTreesClassifier(LinearSVC(input_matrix, C=0.001, dual=False, loss=squared_hinge, penalty=l1, tol=1e-05), bootstrap=False, criterion=entropy, max_features=0.4, min_samples_leaf=18, min_samples_split=18, n_estimators=100)
0.9583333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:09<02:28,  1.86s/pipeline]

Generation 1 - Current best internal CV score: 0.89


Optimization Progress:  50%|█████     | 60/120 [00:16<01:28,  1.47s/pipeline]

Generation 2 - Current best internal CV score: 0.89


Optimization Progress:  67%|██████▋   | 80/120 [01:53<19:38, 29.47s/pipeline]

Generation 3 - Current best internal CV score: 0.89


Optimization Progress:  83%|████████▎ | 100/120 [02:19<04:15, 12.75s/pipeline]

Generation 4 - Current best internal CV score: 0.89


                                                                              

Generation 5 - Current best internal CV score: 0.89

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.2, min_samples_leaf=16, min_samples_split=2, n_estimators=100)
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:35<08:08,  6.10s/pipeline]

Generation 1 - Current best internal CV score: 0.9166666666666667


Optimization Progress:  50%|█████     | 60/120 [00:51<05:20,  5.35s/pipeline]

Generation 2 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  67%|██████▋   | 80/120 [01:07<02:42,  4.06s/pipeline]

Generation 3 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  83%|████████▎ | 100/120 [01:18<01:10,  3.53s/pipeline]

Generation 4 - Current best internal CV score: 0.9466666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9466666666666667

Best pipeline: BernoulliNB(input_matrix, alpha=0.1, fit_prior=False)
0.9166666666666666


Optimization Progress:  35%|███▌      | 42/120 [27:01<5:56:22, 274.14s/pipeline]

Generation 1 - Current best internal CV score: 0.9083333333333334


Optimization Progress:  52%|█████▏    | 62/120 [27:43<2:15:36, 140.29s/pipeline]

Generation 2 - Current best internal CV score: 0.9083333333333334


Optimization Progress:  68%|██████▊   | 82/120 [28:01<32:08, 50.75s/pipeline]   

Generation 3 - Current best internal CV score: 0.9083333333333334


Optimization Progress:  85%|████████▌ | 102/120 [28:15<08:36, 28.69s/pipeline]

Generation 4 - Current best internal CV score: 0.9083333333333334


                                                                              

Generation 5 - Current best internal CV score: 0.9083333333333334

Best pipeline: GaussianNB(input_matrix)
1.0


Optimization Progress:  33%|███▎      | 40/120 [02:18<28:28, 21.36s/pipeline]

Generation 1 - Current best internal CV score: 0.8976190476190476


Optimization Progress:  50%|█████     | 60/120 [02:45<18:13, 18.22s/pipeline]

Generation 2 - Current best internal CV score: 0.8976190476190476


Optimization Progress:  68%|██████▊   | 81/120 [07:52<35:27, 54.54s/pipeline]

Generation 3 - Current best internal CV score: 0.8976190476190476


Optimization Progress:  84%|████████▍ | 101/120 [09:35<15:29, 48.90s/pipeline]

Generation 4 - Current best internal CV score: 0.8976190476190476


                                                                              

Generation 5 - Current best internal CV score: 0.8976190476190476

Best pipeline: GaussianNB(FeatureAgglomeration(MaxAbsScaler(input_matrix), affinity=cosine, linkage=complete))
0.8888888888888888


Optimization Progress:  33%|███▎      | 40/120 [07:26<1:53:34, 85.18s/pipeline] 

Generation 1 - Current best internal CV score: 0.8976190476190476


Optimization Progress:  50%|█████     | 60/120 [07:51<1:03:13, 63.23s/pipeline]

Generation 2 - Current best internal CV score: 0.925


Optimization Progress:  68%|██████▊   | 81/120 [12:55<49:48, 76.62s/pipeline]  

Generation 3 - Current best internal CV score: 0.925


Optimization Progress:  84%|████████▍ | 101/120 [13:11<08:49, 27.89s/pipeline]

Generation 4 - Current best internal CV score: 0.925


                                                                              

Generation 5 - Current best internal CV score: 0.925

Best pipeline: BernoulliNB(input_matrix, alpha=0.01, fit_prior=False)
0.9444444444444444


Optimization Progress:  33%|███▎      | 40/120 [00:24<06:15,  4.69s/pipeline]

Generation 1 - Current best internal CV score: 0.8547619047619047


Optimization Progress:  50%|█████     | 60/120 [02:10<33:57, 33.95s/pipeline]

Generation 2 - Current best internal CV score: 0.8547619047619047


Optimization Progress:  67%|██████▋   | 80/120 [02:27<12:43, 19.09s/pipeline]

Generation 3 - Current best internal CV score: 0.8547619047619047


Optimization Progress:  83%|████████▎ | 100/120 [04:27<15:01, 45.06s/pipeline]

Generation 4 - Current best internal CV score: 0.8726190476190476


                                                                              

Generation 5 - Current best internal CV score: 0.8726190476190476

Best pipeline: LinearSVC(MinMaxScaler(BernoulliNB(input_matrix, alpha=0.001, fit_prior=True)), C=0.5, dual=True, loss=squared_hinge, penalty=l2, tol=0.1)
0.9444444444444444


Optimization Progress:  34%|███▍      | 41/120 [05:10<1:02:19, 47.33s/pipeline]

Generation 1 - Current best internal CV score: 0.9059523809523811


Optimization Progress:  51%|█████     | 61/120 [05:21<16:54, 17.19s/pipeline]  

Generation 2 - Current best internal CV score: 0.9166666666666667


Optimization Progress:  68%|██████▊   | 81/120 [05:28<08:12, 12.62s/pipeline]

Generation 3 - Current best internal CV score: 0.9166666666666667


Optimization Progress:  85%|████████▌ | 102/120 [10:30<28:55, 96.39s/pipeline] 

Generation 4 - Current best internal CV score: 0.9166666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9166666666666667

Best pipeline: GaussianNB(Normalizer(input_matrix, norm=l1))
0.9166666666666666


Optimization Progress:  34%|███▍      | 41/120 [07:44<2:36:02, 118.51s/pipeline]

Generation 1 - Current best internal CV score: 0.8797619047619047


Optimization Progress:  51%|█████     | 61/120 [08:03<1:02:26, 63.50s/pipeline] 

Generation 2 - Current best internal CV score: 0.8976190476190476


Optimization Progress:  68%|██████▊   | 81/120 [08:10<20:39, 31.77s/pipeline]  

Generation 3 - Current best internal CV score: 0.8976190476190476


Optimization Progress:  85%|████████▌ | 102/120 [13:18<30:22, 101.23s/pipeline] 

Generation 4 - Current best internal CV score: 0.8976190476190476


                                                                               

Generation 5 - Current best internal CV score: 0.8976190476190476

Best pipeline: GaussianNB(RFE(input_matrix, criterion=entropy, max_features=0.7000000000000001, n_estimators=100, step=0.6000000000000001))
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [04:37<2:03:22, 92.53s/pipeline] 

Generation 1 - Current best internal CV score: 0.898809523809524


Optimization Progress:  50%|█████     | 60/120 [05:27<52:47, 52.79s/pipeline]  

Generation 2 - Current best internal CV score: 0.898809523809524


Optimization Progress:  67%|██████▋   | 80/120 [05:59<20:22, 30.56s/pipeline]

Generation 3 - Current best internal CV score: 0.898809523809524


Optimization Progress:  84%|████████▍ | 101/120 [11:04<07:21, 23.26s/pipeline]

Generation 4 - Current best internal CV score: 0.898809523809524


                                                                              

Generation 5 - Current best internal CV score: 0.898809523809524

Best pipeline: BernoulliNB(input_matrix, alpha=0.01, fit_prior=True)
0.8888888888888888


Optimization Progress:  33%|███▎      | 40/120 [00:42<08:07,  6.09s/pipeline]

Generation 1 - Current best internal CV score: 0.8797619047619047


Optimization Progress:  50%|█████     | 60/120 [01:09<08:33,  8.55s/pipeline]

Generation 2 - Current best internal CV score: 0.8892857142857142


Optimization Progress:  67%|██████▋   | 80/120 [01:48<04:36,  6.92s/pipeline]

Generation 3 - Current best internal CV score: 0.8892857142857142


Optimization Progress:  83%|████████▎ | 100/120 [02:30<05:52, 17.61s/pipeline]

Generation 4 - Current best internal CV score: 0.8904761904761903


                                                                              

Generation 5 - Current best internal CV score: 0.8904761904761903

Best pipeline: XGBClassifier(BernoulliNB(GaussianNB(input_matrix), alpha=100.0, fit_prior=False), learning_rate=1.0, max_depth=9, min_child_weight=6, n_estimators=100, nthread=1, subsample=1.0)


  if diff:


0.9444444444444444


Optimization Progress:  33%|███▎      | 40/120 [01:51<36:03, 27.04s/pipeline]

Generation 1 - Current best internal CV score: 0.8892857142857142


Optimization Progress:  51%|█████     | 61/120 [07:05<59:16, 60.27s/pipeline]  

Generation 2 - Current best internal CV score: 0.8892857142857142


Optimization Progress:  68%|██████▊   | 81/120 [07:23<20:15, 31.17s/pipeline]

Generation 3 - Current best internal CV score: 0.8892857142857142


Optimization Progress:  84%|████████▍ | 101/120 [09:32<17:05, 53.95s/pipeline]

Generation 4 - Current best internal CV score: 0.8892857142857142


                                                                              

Generation 5 - Current best internal CV score: 0.8988095238095237

Best pipeline: GaussianNB(RFE(input_matrix, criterion=gini, max_features=0.8500000000000001, n_estimators=100, step=0.9500000000000001))
0.9444444444444444


Optimization Progress:  34%|███▍      | 41/120 [06:54<2:21:53, 107.76s/pipeline]

Generation 1 - Current best internal CV score: 0.8619047619047618


Optimization Progress:  51%|█████     | 61/120 [08:41<1:02:10, 63.23s/pipeline] 

Generation 2 - Current best internal CV score: 0.8619047619047618


Optimization Progress:  68%|██████▊   | 81/120 [09:14<14:02, 21.61s/pipeline]  

Generation 3 - Current best internal CV score: 0.8619047619047618


Optimization Progress:  84%|████████▍ | 101/120 [10:51<06:34, 20.75s/pipeline]

Generation 4 - Current best internal CV score: 0.870238095238095


                                                                              

Generation 5 - Current best internal CV score: 0.8797619047619047

Best pipeline: GaussianNB(Normalizer(input_matrix, norm=l1))
0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [03:32<34:26, 25.83s/pipeline]  

Generation 1 - Current best internal CV score: 0.8629629629629629


Optimization Progress:  50%|█████     | 60/120 [05:36<36:44, 36.74s/pipeline]

Generation 2 - Current best internal CV score: 0.8696296296296296


Optimization Progress:  67%|██████▋   | 80/120 [05:56<13:50, 20.76s/pipeline]

Generation 3 - Current best internal CV score: 0.8696296296296296


Optimization Progress:  83%|████████▎ | 100/120 [09:04<09:38, 28.93s/pipeline]

Generation 4 - Current best internal CV score: 0.8770370370370368


                                                                              

Generation 5 - Current best internal CV score: 0.8770370370370368

Best pipeline: XGBClassifier(BernoulliNB(RobustScaler(input_matrix), alpha=0.001, fit_prior=False), learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, nthread=1, subsample=0.35000000000000003)


  if diff:


0.8541666666666666


Optimization Progress:  34%|███▍      | 41/120 [07:41<2:24:31, 109.77s/pipeline]

Generation 1 - Current best internal CV score: 0.8481481481481481


Optimization Progress:  52%|█████▏    | 62/120 [12:44<2:19:46, 144.59s/pipeline]

Generation 2 - Current best internal CV score: 0.8511111111111112


Optimization Progress:  68%|██████▊   | 82/120 [12:58<45:44, 72.22s/pipeline]   

Generation 3 - Current best internal CV score: 0.8511111111111112


Optimization Progress:  85%|████████▌ | 102/120 [14:19<17:51, 59.53s/pipeline]

Generation 4 - Current best internal CV score: 0.8622222222222222


                                                                              

Generation 5 - Current best internal CV score: 0.8622222222222222

Best pipeline: LogisticRegression(input_matrix, C=0.1, dual=True, penalty=l2)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:41<20:42, 15.53s/pipeline]

Generation 1 - Current best internal CV score: 0.8911111111111112


Optimization Progress:  50%|█████     | 60/120 [00:47<08:32,  8.54s/pipeline]

Generation 2 - Current best internal CV score: 0.8911111111111112


Optimization Progress:  67%|██████▋   | 80/120 [01:14<05:21,  8.04s/pipeline]

Generation 3 - Current best internal CV score: 0.8911111111111112


Optimization Progress:  83%|████████▎ | 100/120 [02:23<08:09, 24.50s/pipeline]

Generation 4 - Current best internal CV score: 0.8985185185185186


                                                                              

Generation 5 - Current best internal CV score: 0.9044444444444444

Best pipeline: BernoulliNB(RFE(input_matrix, criterion=entropy, max_features=1.0, n_estimators=100, step=0.6000000000000001), alpha=0.001, fit_prior=False)
0.875


Optimization Progress:  33%|███▎      | 40/120 [02:25<48:14, 36.19s/pipeline]  

Generation 1 - Current best internal CV score: 0.8711111111111112


Optimization Progress:  50%|█████     | 60/120 [02:37<15:35, 15.60s/pipeline]

Generation 2 - Current best internal CV score: 0.8777777777777779


Optimization Progress:  67%|██████▋   | 80/120 [02:51<04:07,  6.19s/pipeline]

Generation 3 - Current best internal CV score: 0.8777777777777779


Optimization Progress:  83%|████████▎ | 100/120 [03:47<04:56, 14.82s/pipeline]

Generation 4 - Current best internal CV score: 0.8777777777777779


                                                                              

Generation 5 - Current best internal CV score: 0.8844444444444445

Best pipeline: RandomForestClassifier(LogisticRegression(FeatureAgglomeration(input_matrix, affinity=euclidean, linkage=ward), C=0.001, dual=False, penalty=l2), bootstrap=False, criterion=gini, max_features=0.4, min_samples_leaf=11, min_samples_split=4, n_estimators=100)
0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:31<09:59,  7.50s/pipeline]

Generation 1 - Current best internal CV score: 0.8644444444444443


Optimization Progress:  50%|█████     | 60/120 [00:44<04:20,  4.35s/pipeline]

Generation 2 - Current best internal CV score: 0.8644444444444443


Optimization Progress:  67%|██████▋   | 80/120 [03:46<14:08, 21.21s/pipeline]

Generation 3 - Current best internal CV score: 0.8644444444444443


Optimization Progress:  83%|████████▎ | 100/120 [04:12<04:14, 12.74s/pipeline]

Generation 4 - Current best internal CV score: 0.8644444444444443


                                                                              

Generation 5 - Current best internal CV score: 0.8644444444444443

Best pipeline: GaussianNB(input_matrix)
0.875


Optimization Progress:  33%|███▎      | 40/120 [01:30<31:02, 23.28s/pipeline]  

Generation 1 - Current best internal CV score: 0.8629629629629629


Optimization Progress:  51%|█████     | 61/120 [06:33<45:48, 46.58s/pipeline]

Generation 2 - Current best internal CV score: 0.8629629629629629


Optimization Progress:  68%|██████▊   | 82/120 [11:49<44:14, 69.84s/pipeline]  

Generation 3 - Current best internal CV score: 0.8681481481481482


Optimization Progress:  85%|████████▌ | 102/120 [12:46<10:44, 35.79s/pipeline]

Generation 4 - Current best internal CV score: 0.8822222222222222


                                                                              

Generation 5 - Current best internal CV score: 0.8822222222222222

Best pipeline: LinearSVC(BernoulliNB(input_matrix, alpha=0.01, fit_prior=False), C=0.01, dual=True, loss=hinge, penalty=l2, tol=1e-05)
0.875


Optimization Progress:  34%|███▍      | 41/120 [06:56<3:12:10, 145.96s/pipeline]

Generation 1 - Current best internal CV score: 0.8755555555555556


Optimization Progress:  51%|█████     | 61/120 [07:13<1:14:48, 76.08s/pipeline] 

Generation 2 - Current best internal CV score: 0.8814814814814815


Optimization Progress:  68%|██████▊   | 81/120 [08:47<23:16, 35.81s/pipeline]  

Generation 3 - Current best internal CV score: 0.8896296296296298


Optimization Progress:  84%|████████▍ | 101/120 [11:37<11:12, 35.41s/pipeline]

Generation 4 - Current best internal CV score: 0.8896296296296298


                                                                              

Generation 5 - Current best internal CV score: 0.8896296296296298

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=1.0, dual=False, penalty=l2)
0.9375


Optimization Progress:  33%|███▎      | 40/120 [00:56<20:09, 15.12s/pipeline]

Generation 1 - Current best internal CV score: 0.882962962962963


Optimization Progress:  50%|█████     | 60/120 [02:46<40:09, 40.16s/pipeline]

Generation 2 - Current best internal CV score: 0.882962962962963


Optimization Progress:  68%|██████▊   | 81/120 [08:14<1:16:39, 117.94s/pipeline]

Generation 3 - Current best internal CV score: 0.882962962962963


Optimization Progress:  84%|████████▍ | 101/120 [08:31<19:04, 60.26s/pipeline]  

Generation 4 - Current best internal CV score: 0.882962962962963


                                                                              

Generation 5 - Current best internal CV score: 0.882962962962963

Best pipeline: LogisticRegression(input_matrix, C=1.0, dual=False, penalty=l2)
0.875


Optimization Progress:  33%|███▎      | 40/120 [01:22<46:08, 34.60s/pipeline]  

Generation 1 - Current best internal CV score: 0.8948148148148147


Optimization Progress:  51%|█████     | 61/120 [06:26<1:45:59, 107.79s/pipeline]

Generation 2 - Current best internal CV score: 0.8948148148148147


Optimization Progress:  68%|██████▊   | 81/120 [09:19<41:01, 63.12s/pipeline]   

Generation 3 - Current best internal CV score: 0.8948148148148147


Optimization Progress:  85%|████████▌ | 102/120 [15:59<45:14, 150.80s/pipeline] 

Generation 4 - Current best internal CV score: 0.8948148148148147


                                                                               

Generation 5 - Current best internal CV score: 0.8948148148148147

Best pipeline: GaussianNB(LinearSVC(input_matrix, C=15.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.0001))
0.8125


Optimization Progress:  33%|███▎      | 40/120 [00:37<13:21, 10.02s/pipeline]

Generation 1 - Current best internal CV score: 0.8555555555555555


Optimization Progress:  50%|█████     | 60/120 [00:59<08:01,  8.03s/pipeline]

Generation 2 - Current best internal CV score: 0.882962962962963


Optimization Progress:  67%|██████▋   | 80/120 [01:07<02:53,  4.35s/pipeline]

Generation 3 - Current best internal CV score: 0.882962962962963


Optimization Progress:  84%|████████▍ | 101/120 [06:11<08:10, 25.81s/pipeline]

Generation 4 - Current best internal CV score: 0.882962962962963


                                                                              

Generation 5 - Current best internal CV score: 0.882962962962963

Best pipeline: LogisticRegression(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.45, min_samples_leaf=13, min_samples_split=4, n_estimators=100), C=20.0, dual=False, penalty=l2)
0.8958333333333334


Optimization Progress:  34%|███▍      | 41/120 [05:22<1:13:19, 55.69s/pipeline] 

Generation 1 - Current best internal CV score: 0.8374603174603175


Optimization Progress:  51%|█████     | 61/120 [06:06<51:14, 52.11s/pipeline]  

Generation 2 - Current best internal CV score: 0.8374603174603175


Optimization Progress:  68%|██████▊   | 81/120 [06:53<21:06, 32.48s/pipeline]

Generation 3 - Current best internal CV score: 0.8374603174603175


Optimization Progress:  86%|████████▌ | 103/120 [15:49<13:46, 48.59s/pipeline]

Generation 4 - Current best internal CV score: 0.8377777777777778


                                                                              

Generation 5 - Current best internal CV score: 0.8377777777777778

Best pipeline: LogisticRegression(PCA(GaussianNB(input_matrix), iterated_power=2, svd_solver=randomized), C=15.0, dual=False, penalty=l2)
0.8055555555555556


Optimization Progress:  33%|███▎      | 40/120 [01:20<26:27, 19.85s/pipeline]

Generation 1 - Current best internal CV score: 0.8431746031746032


Optimization Progress:  50%|█████     | 60/120 [01:40<12:28, 12.48s/pipeline]

Generation 2 - Current best internal CV score: 0.8434920634920635


Optimization Progress:  67%|██████▋   | 80/120 [01:52<04:52,  7.32s/pipeline]

Generation 3 - Current best internal CV score: 0.8482539682539683


Optimization Progress:  83%|████████▎ | 100/120 [02:07<01:21,  4.05s/pipeline]

Generation 4 - Current best internal CV score: 0.8482539682539683


                                                                              

Generation 5 - Current best internal CV score: 0.8482539682539683

Best pipeline: LinearSVC(input_matrix, C=0.5, dual=True, loss=hinge, penalty=l2, tol=0.1)
0.8194444444444444


Optimization Progress:  34%|███▍      | 41/120 [06:11<2:43:34, 124.24s/pipeline]

Generation 1 - Current best internal CV score: 0.8425396825396826


Optimization Progress:  51%|█████     | 61/120 [06:24<42:43, 43.45s/pipeline]   

Generation 2 - Current best internal CV score: 0.8425396825396826


Optimization Progress:  68%|██████▊   | 81/120 [06:31<20:12, 31.09s/pipeline]

Generation 3 - Current best internal CV score: 0.8473015873015873


Optimization Progress:  84%|████████▍ | 101/120 [06:52<05:48, 18.33s/pipeline]

Generation 4 - Current best internal CV score: 0.8473015873015873


                                                                              

Generation 5 - Current best internal CV score: 0.8473015873015873

Best pipeline: LogisticRegression(GradientBoostingClassifier(input_matrix, learning_rate=0.001, max_depth=8, max_features=0.3, min_samples_leaf=19, min_samples_split=19, n_estimators=100, subsample=0.05), C=0.01, dual=True, penalty=l2)
0.7638888888888888


Optimization Progress:  33%|███▎      | 40/120 [02:08<27:49, 20.86s/pipeline] 

Generation 1 - Current best internal CV score: 0.833968253968254


Optimization Progress:  50%|█████     | 60/120 [02:45<13:42, 13.70s/pipeline]

Generation 2 - Current best internal CV score: 0.833968253968254


Optimization Progress:  67%|██████▋   | 80/120 [04:13<22:05, 33.14s/pipeline]

Generation 3 - Current best internal CV score: 0.8380952380952381


Optimization Progress:  84%|████████▍ | 101/120 [09:15<32:11, 101.64s/pipeline] 

Generation 4 - Current best internal CV score: 0.8425396825396826


                                                                               

Generation 5 - Current best internal CV score: 0.8425396825396826

Best pipeline: GaussianNB(RFE(VarianceThreshold(input_matrix, threshold=0.05), criterion=entropy, max_features=0.9000000000000001, n_estimators=100, step=0.45))
0.7777777777777778


Optimization Progress:  33%|███▎      | 40/120 [02:20<36:29, 27.37s/pipeline] 

Generation 1 - Current best internal CV score: 0.8473015873015873


Optimization Progress:  50%|█████     | 60/120 [02:51<22:29, 22.50s/pipeline]

Generation 2 - Current best internal CV score: 0.8473015873015873


Optimization Progress:  67%|██████▋   | 80/120 [04:16<15:51, 23.78s/pipeline]

Generation 3 - Current best internal CV score: 0.8473015873015873


Optimization Progress:  83%|████████▎ | 100/120 [04:44<05:16, 15.83s/pipeline]

Generation 4 - Current best internal CV score: 0.8473015873015873


                                                                              

Generation 5 - Current best internal CV score: 0.8612698412698412

Best pipeline: LogisticRegression(LinearSVC(input_matrix, C=0.1, dual=True, loss=hinge, penalty=l2, tol=0.01), C=0.1, dual=False, penalty=l2)
0.8194444444444444


Optimization Progress:  33%|███▎      | 40/120 [00:22<09:18,  6.98s/pipeline]

Generation 1 - Current best internal CV score: 0.847936507936508


Optimization Progress:  50%|█████     | 60/120 [00:40<08:32,  8.54s/pipeline]

Generation 2 - Current best internal CV score: 0.847936507936508


Optimization Progress:  67%|██████▋   | 80/120 [01:06<07:02, 10.57s/pipeline]

Generation 3 - Current best internal CV score: 0.847936507936508


Optimization Progress:  83%|████████▎ | 100/120 [01:23<02:19,  7.00s/pipeline]

Generation 4 - Current best internal CV score: 0.8482539682539683


                                                                              

Generation 5 - Current best internal CV score: 0.8482539682539683

Best pipeline: GaussianNB(Normalizer(input_matrix, norm=l1))
0.8055555555555556


Optimization Progress:  34%|███▍      | 41/120 [05:39<2:21:48, 107.71s/pipeline]

Generation 1 - Current best internal CV score: 0.8488888888888889


Optimization Progress:  51%|█████     | 61/120 [06:54<57:34, 58.54s/pipeline]   

Generation 2 - Current best internal CV score: 0.8488888888888889


Optimization Progress:  68%|██████▊   | 81/120 [07:10<20:15, 31.16s/pipeline]

Generation 3 - Current best internal CV score: 0.8571428571428571


Optimization Progress:  84%|████████▍ | 101/120 [07:33<06:54, 21.84s/pipeline]

Generation 4 - Current best internal CV score: 0.8571428571428571


                                                                              

Generation 5 - Current best internal CV score: 0.8571428571428571

Best pipeline: XGBClassifier(LogisticRegression(input_matrix, C=25.0, dual=False, penalty=l2), learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=100, nthread=1, subsample=0.9500000000000001)


  if diff:


0.8055555555555556


Optimization Progress:  34%|███▍      | 41/120 [05:19<2:05:06, 95.02s/pipeline]

Generation 1 - Current best internal CV score: 0.8107936507936507


Optimization Progress:  51%|█████     | 61/120 [06:11<47:03, 47.86s/pipeline]  

Generation 2 - Current best internal CV score: 0.8244444444444443


Optimization Progress:  68%|██████▊   | 81/120 [07:11<22:14, 34.22s/pipeline]

Generation 3 - Current best internal CV score: 0.8244444444444443


Optimization Progress:  84%|████████▍ | 101/120 [07:28<05:51, 18.48s/pipeline]

Generation 4 - Current best internal CV score: 0.8244444444444443


                                                                              

Generation 5 - Current best internal CV score: 0.8244444444444443

Best pipeline: LogisticRegression(MaxAbsScaler(input_matrix), C=25.0, dual=False, penalty=l1)
0.8611111111111112


Optimization Progress:  33%|███▎      | 40/120 [01:23<35:00, 26.26s/pipeline] 

Generation 1 - Current best internal CV score: 0.8755555555555556


Optimization Progress:  50%|█████     | 60/120 [01:56<17:44, 17.74s/pipeline]

Generation 2 - Current best internal CV score: 0.8755555555555556


Optimization Progress:  68%|██████▊   | 81/120 [07:22<1:09:08, 106.36s/pipeline]

Generation 3 - Current best internal CV score: 0.8755555555555556


Optimization Progress:  86%|████████▌ | 103/120 [12:43<19:26, 68.61s/pipeline]  

Generation 4 - Current best internal CV score: 0.8755555555555556


                                                                              

Generation 5 - Current best internal CV score: 0.8755555555555556

Best pipeline: LinearSVC(input_matrix, C=5.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.1)
0.8472222222222222


Optimization Progress:  33%|███▎      | 40/120 [01:09<24:06, 18.08s/pipeline]

Generation 1 - Current best internal CV score: 0.8336507936507935


Optimization Progress:  50%|█████     | 60/120 [01:37<17:00, 17.01s/pipeline]

Generation 2 - Current best internal CV score: 0.8384126984126985


Optimization Progress:  67%|██████▋   | 80/120 [02:06<11:16, 16.92s/pipeline]

Generation 3 - Current best internal CV score: 0.8384126984126985


Optimization Progress:  83%|████████▎ | 100/120 [02:26<04:39, 13.98s/pipeline]

Generation 4 - Current best internal CV score: 0.8384126984126985


                                                                              

Generation 5 - Current best internal CV score: 0.8428571428571429

Best pipeline: LogisticRegression(LogisticRegression(input_matrix, C=0.5, dual=True, penalty=l2), C=10.0, dual=True, penalty=l2)
0.8333333333333334


In [8]:
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)

    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'
    #=============================================================================================================
    #do the learning
    # TODO: extract method
    with open(folder + fileName) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    # content = [x.strip() for x in content] 

    #  or 'exported_pipeline = ' not in line
    cleanedContent = []
    for line in content:
        if 'tpot_data' not in line and 'training_target, testing_target' not in line:
            cleanedContent.append(line)
    #         content.remove(line)
    
    fileNameCleaned = folder + 'cleaned_' + fileName
    with open(fileNameCleaned, 'w') as filehandle:  
        for line in cleanedContent:
            filehandle.write('%s\n' % line)
            
    %run -i $fileNameCleaned
    
    # print the accuracy
    # TODO: extract method
    # TODO: print all accurracy values to one file
    num_matches = 0;
    for a, b in zip(testing_target, results):
        if a == b:
            num_matches = num_matches + 1
        else:
            unmatched.append((a,b))
    print('Number of matches:',num_matches,'(of',testing_target.size,')')

    accuracy = num_matches/testing_target.size*100
    print('Accuary: ',accuracy)
    #=============================================================================================================
    
    #=============================================================================================================
    #collect all accuracy values to plot them
    collectAccuracies.append(accuracy)

    offset = offset + 1

Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 24 (of 24 )
Accuary:  100.0
Number of matches: 23 (of 24 )
Accuary:  95.83333333333334
Number of matches: 21 (of 24 )
Accuary:  87.5
Number of matches: 23 (of 24 )
Accuary:  95.83333333333334
Number of matches: 24 (of 24 )
Accuary:  100.0
Number of matches: 23 (of 24 )
Accuary:  95.83333333333334
Number of matches: 23 (of 24 )
Accuary:  95.83333333333334
Number of matches: 18 (of 24 )
Accuary:  75.0
Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 36 (of 36 )
Accuary:  100.0
Number of matches: 32 (of 36 )
Accuary:  88.88888888888889
Number of matches: 34 (of 36 )
Accuary:  94.44444444444444
Number of matches: 34 (of 36 )
Accuary:  94.44444444444444
Number of matches: 33 (of 36 )
Accuary:  91.66666666666666
Number of matches: 33 (of 36 )
Accuary:  91.66666666666666
Number of matches: 32 (of 36 )
Accuary:  88.88888888888889


  if diff:


Number of matches: 34 (of 36 )
Accuary:  94.44444444444444
Number of matches: 34 (of 36 )
Accuary:  94.44444444444444
Number of matches: 30 (of 36 )
Accuary:  83.33333333333334


  if diff:


Number of matches: 41 (of 48 )
Accuary:  85.41666666666666
Number of matches: 44 (of 48 )
Accuary:  91.66666666666666
Number of matches: 42 (of 48 )
Accuary:  87.5
Number of matches: 40 (of 48 )
Accuary:  83.33333333333334
Number of matches: 42 (of 48 )
Accuary:  87.5
Number of matches: 42 (of 48 )
Accuary:  87.5
Number of matches: 45 (of 48 )
Accuary:  93.75
Number of matches: 42 (of 48 )
Accuary:  87.5
Number of matches: 39 (of 48 )
Accuary:  81.25
Number of matches: 43 (of 48 )
Accuary:  89.58333333333334
Number of matches: 58 (of 72 )
Accuary:  80.55555555555556
Number of matches: 59 (of 72 )
Accuary:  81.94444444444444
Number of matches: 55 (of 72 )
Accuary:  76.38888888888889
Number of matches: 57 (of 72 )
Accuary:  79.16666666666666
Number of matches: 59 (of 72 )
Accuary:  81.94444444444444
Number of matches: 58 (of 72 )
Accuary:  80.55555555555556


  if diff:


Number of matches: 58 (of 72 )
Accuary:  80.55555555555556
Number of matches: 63 (of 72 )
Accuary:  87.5
Number of matches: 62 (of 72 )
Accuary:  86.11111111111111
Number of matches: 60 (of 72 )
Accuary:  83.33333333333334


In [9]:
print(len(unmatched))
countComprehensionRest = 0
countComprehensionSyntax = 0
countRestSyntax = 0
for x in unmatched:
    if x == (0,1) or x == (1,0):
        countComprehensionRest = countComprehensionRest + 1
    if x == (0,2) or x == (2,0):
        countComprehensionSyntax = countComprehensionSyntax + 1
    if x == (1,2) or x == (2,1):
        countRestSyntax = countRestSyntax + 1
print('Comprehension/Rest',countComprehensionRest)
print('Comprehension/Syntax',countComprehensionSyntax)
print('Rest/Syntax',countRestSyntax)

236
Comprehension/Rest 25
Comprehension/Syntax 126
Rest/Syntax 85


In [10]:
#input: alist: the list to split, wanted_parts: into how many parts it should be split
#returns: all split lists
def split_list(alist, wanted_parts=1):
    length = len(alist)
    return [ alist[i*length // wanted_parts: (i+1)*length // wanted_parts] 
             for i in range(wanted_parts) ]


In [11]:
list1, list2, list3, list4 = split_list(collectAccuracies,4)
frames = [list1, list2, list3, list4]
collectedAccuracies = pd.DataFrame(frames).T

collectedAccuracies.columns = ['2_groups_allROIs','3_groups_allROIs','4_groups_allROIs','6_groups_allROIs']

collectedAccuracies.to_csv('triangulate_AllROIs.csv',sep=';',decimal='.', index=False)