In [1]:
# Pandas is used for data manipulation
import pandas as pd
import numpy as np
from tpot import TPOTClassifier

# Read in data and display first 5 rows
z_data = pd.read_csv('data_task_groups_roi_deact.csv',sep=';',decimal='.')
z_data.drop(list(z_data.filter(regex = '_act')),axis = 1, inplace = True)

z_data = z_data.drop('scan', axis = 1)
z_data = z_data.drop('trial', axis = 1)
z_data = z_data.drop('response', axis = 1)
   
# recode labels to numeric values so t_pot can handle them
# alphabetically: 0: C, 1: R, 2, S
z_data['task'] = z_data['task'].replace({'C': 0, 'R': 1, 'S': 2})

numLabels = 3 # num of different labels (comprehension and rest)
sizeTrainSet = 12 # num of participants used for training
sizeTestSet = 4 # num participants used for testing
numParticipants = sizeTrainSet + sizeTestSet

In [2]:
#group the data and get the labels after the grouping
#return the grouped data and the labels
def prepare(data,groupingColumns):
    grouped = data.groupby(groupingColumns)
    groupedAgg = grouped.aggregate(np.mean)
    labels = groupedAgg.index.get_level_values(level='task')

    return groupedAgg,labels

In [3]:
#split according to participants (first 12 and last 4)
#return all four sets (much like train_test_split from sklearn)
#TODO: introduce variation (see file coarseAverageParticipantSplit)

def split(features, labels, low, high):
    #first, drop everything non-numeric ===========================================================
    
    #=== creating training and validation set ======================================================
    # Saving feature names for later use
    feature_list = list(features.columns)

    # Convert to numpy array
    features = np.array(features)
    labels = np.array(labels)

    #get the rows excluding the last indexed (i.e., row with the index 587 is the last to include)
    training_features = features[0:low]
    training_features = np.array(training_features)
    testing_features = features[low:high]
    testing_features = np.array(testing_features)

    training_target = labels[0:low]
    training_target = np.array(training_target)
    testing_target = labels[low:high]
    testing_target = np.array(testing_target)
    #=== end creating training and validation set ===================================================
    
    return training_features, testing_features, training_target, testing_target

In [4]:
def findIndices(name):
    numGroups = int(name.split('_')[1])
    low = sizeTrainSet*numGroups*numLabels
    high = numParticipants*numGroups*numLabels
    return low, high

In [5]:
#define the columns to group by
groupingColumn1 = 'proband'
groupingColumn2 = 'task'
groupingColumn3 = ''

#define how many different variants of aggregation and aggregation levels
numVariationsPerAggLevel = 10
numAggLevels = 40
numAggregationVariants = numVariationsPerAggLevel * numAggLevels

#get the index of the starting column that contains the labels of the groupings
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)
    #=============================================================================================================
    
    folder = 'Deact/'
    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'
    #=============================================================================================================
    # run t_pot
    # TODO: extract method
    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,n_jobs=20)
    tpot.fit(training_features, training_target)
    print(tpot.score(testing_features, testing_target))
    tpot.export(folder + fileName)
    #=============================================================================================================    
        
    offset = offset + 1

Optimization Progress:  33%|███▎      | 40/120 [00:24<11:45,  8.81s/pipeline]

Generation 1 - Current best internal CV score: 0.9333333333333333


Optimization Progress:  51%|█████     | 61/120 [05:26<1:33:17, 94.87s/pipeline]

Generation 2 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  68%|██████▊   | 81/120 [09:39<54:45, 84.25s/pipeline]  

Generation 3 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  84%|████████▍ | 101/120 [10:33<12:41, 40.06s/pipeline]

Generation 4 - Current best internal CV score: 0.9466666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9466666666666667

Best pipeline: DecisionTreeClassifier(FeatureAgglomeration(input_matrix, affinity=l2, linkage=complete), criterion=entropy, max_depth=4, min_samples_leaf=1, min_samples_split=3)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [03:30<1:25:26, 64.08s/pipeline]

Generation 1 - Current best internal CV score: 0.9199999999999999


Optimization Progress:  50%|█████     | 60/120 [04:23<37:47, 37.80s/pipeline]  

Generation 2 - Current best internal CV score: 0.9199999999999999


Optimization Progress:  67%|██████▋   | 80/120 [04:53<14:21, 21.53s/pipeline]

Generation 3 - Current best internal CV score: 0.9200000000000002


Optimization Progress:  84%|████████▍ | 101/120 [09:55<16:38, 52.58s/pipeline]

Generation 4 - Current best internal CV score: 0.9200000000000002


                                                                              

Generation 5 - Current best internal CV score: 0.9333333333333332

Best pipeline: GaussianNB(CombineDFs(ZeroCount(input_matrix), Normalizer(input_matrix, norm=l2)))
0.9583333333333334


Optimization Progress:  33%|███▎      | 40/120 [01:12<22:31, 16.89s/pipeline]

Generation 1 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  50%|█████     | 60/120 [02:00<19:47, 19.80s/pipeline]

Generation 2 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  67%|██████▋   | 80/120 [04:27<21:06, 31.66s/pipeline]

Generation 3 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  83%|████████▎ | 100/120 [04:34<04:15, 12.79s/pipeline]

Generation 4 - Current best internal CV score: 0.9466666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9466666666666667

Best pipeline: BernoulliNB(RFE(input_matrix, criterion=entropy, max_features=0.2, n_estimators=100, step=0.7500000000000001), alpha=0.01, fit_prior=True)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:51<26:35, 19.94s/pipeline]

Generation 1 - Current best internal CV score: 0.9266666666666665


Optimization Progress:  50%|█████     | 60/120 [01:16<14:07, 14.13s/pipeline]

Generation 2 - Current best internal CV score: 0.9266666666666665


Optimization Progress:  67%|██████▋   | 80/120 [01:42<07:06, 10.67s/pipeline]

Generation 3 - Current best internal CV score: 0.9266666666666665


Optimization Progress:  83%|████████▎ | 100/120 [03:07<10:08, 30.43s/pipeline]

Generation 4 - Current best internal CV score: 0.9266666666666665


                                                                              

Generation 5 - Current best internal CV score: 0.9266666666666665

Best pipeline: GaussianNB(input_matrix)
0.9583333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:20<02:53,  2.17s/pipeline]

Generation 1 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  50%|█████     | 60/120 [00:25<01:48,  1.81s/pipeline]

Generation 2 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  67%|██████▋   | 80/120 [01:55<06:38,  9.95s/pipeline]

Generation 3 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  83%|████████▎ | 100/120 [02:11<03:53, 11.69s/pipeline]

Generation 4 - Current best internal CV score: 0.9466666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9466666666666667

Best pipeline: GaussianNB(input_matrix)
0.9583333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:03<01:28,  1.10s/pipeline]

Generation 1 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  50%|█████     | 60/120 [00:11<01:16,  1.28s/pipeline]

Generation 2 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  67%|██████▋   | 80/120 [00:14<00:32,  1.22pipeline/s]

Generation 3 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  83%|████████▎ | 100/120 [00:54<04:04, 12.24s/pipeline]

Generation 4 - Current best internal CV score: 0.9466666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9466666666666667

Best pipeline: BernoulliNB(input_matrix, alpha=1.0, fit_prior=True)
0.9583333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:09<03:58,  2.98s/pipeline]

Generation 1 - Current best internal CV score: 0.93


Optimization Progress:  50%|█████     | 60/120 [00:14<02:21,  2.35s/pipeline]

Generation 2 - Current best internal CV score: 0.9566666666666667


Optimization Progress:  67%|██████▋   | 80/120 [00:49<04:12,  6.31s/pipeline]

Generation 3 - Current best internal CV score: 0.9566666666666667


Optimization Progress:  83%|████████▎ | 100/120 [05:13<09:48, 29.43s/pipeline]

Generation 4 - Current best internal CV score: 0.9566666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9566666666666667

Best pipeline: GaussianNB(RFE(input_matrix, criterion=gini, max_features=0.3, n_estimators=100, step=0.8500000000000001))
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:27<02:45,  2.07s/pipeline]

Generation 1 - Current best internal CV score: 0.9166666666666667


Optimization Progress:  50%|█████     | 60/120 [00:52<02:44,  2.74s/pipeline]

Generation 2 - Current best internal CV score: 0.9166666666666667


Optimization Progress:  67%|██████▋   | 80/120 [01:50<07:03, 10.58s/pipeline]

Generation 3 - Current best internal CV score: 0.9166666666666667


Optimization Progress:  83%|████████▎ | 100/120 [01:58<02:06,  6.34s/pipeline]

Generation 4 - Current best internal CV score: 0.9166666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9166666666666667

Best pipeline: GaussianNB(input_matrix)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [01:55<55:29, 41.62s/pipeline]  

Generation 1 - Current best internal CV score: 0.9166666666666667


Optimization Progress:  50%|█████     | 60/120 [02:04<21:45, 21.77s/pipeline]

Generation 2 - Current best internal CV score: 0.9166666666666667


Optimization Progress:  67%|██████▋   | 80/120 [02:19<05:56,  8.91s/pipeline]

Generation 3 - Current best internal CV score: 0.9166666666666667


Optimization Progress:  83%|████████▎ | 100/120 [02:42<03:00,  9.01s/pipeline]

Generation 4 - Current best internal CV score: 0.9166666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9166666666666667

Best pipeline: GaussianNB(input_matrix)
1.0


Optimization Progress:  33%|███▎      | 40/120 [01:38<46:05, 34.56s/pipeline]  

Generation 1 - Current best internal CV score: 0.9266666666666665


Optimization Progress:  50%|█████     | 60/120 [02:21<23:58, 23.98s/pipeline]

Generation 2 - Current best internal CV score: 0.9266666666666665


Optimization Progress:  67%|██████▋   | 80/120 [03:04<13:48, 20.71s/pipeline]

Generation 3 - Current best internal CV score: 0.9266666666666665


Optimization Progress:  83%|████████▎ | 100/120 [04:21<06:54, 20.70s/pipeline]

Generation 4 - Current best internal CV score: 0.9266666666666665


                                                                              

Generation 5 - Current best internal CV score: 0.9266666666666665

Best pipeline: GaussianNB(input_matrix)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [03:46<1:06:37, 49.97s/pipeline]

Generation 1 - Current best internal CV score: 0.8821428571428573


Optimization Progress:  51%|█████     | 61/120 [08:49<1:04:10, 65.26s/pipeline]

Generation 2 - Current best internal CV score: 0.8821428571428573


Optimization Progress:  68%|██████▊   | 81/120 [09:38<23:47, 36.61s/pipeline]  

Generation 3 - Current best internal CV score: 0.9095238095238095


Optimization Progress:  84%|████████▍ | 101/120 [09:45<05:54, 18.65s/pipeline]

Generation 4 - Current best internal CV score: 0.9095238095238095


                                                                              

Generation 5 - Current best internal CV score: 0.9095238095238095

Best pipeline: GaussianNB(Normalizer(input_matrix, norm=l1))
0.8611111111111112


Optimization Progress:  33%|███▎      | 40/120 [00:47<16:21, 12.27s/pipeline]

Generation 1 - Current best internal CV score: 0.8892857142857142


Optimization Progress:  50%|█████     | 60/120 [00:55<06:17,  6.29s/pipeline]

Generation 2 - Current best internal CV score: 0.8892857142857142


Optimization Progress:  67%|██████▋   | 80/120 [01:21<04:35,  6.88s/pipeline]

Generation 3 - Current best internal CV score: 0.8892857142857142


Optimization Progress:  83%|████████▎ | 100/120 [01:30<01:32,  4.61s/pipeline]

Generation 4 - Current best internal CV score: 0.8892857142857142


                                                                              

Generation 5 - Current best internal CV score: 0.8988095238095237

Best pipeline: BernoulliNB(Binarizer(input_matrix, threshold=0.15000000000000002), alpha=0.1, fit_prior=False)
0.8055555555555556


Optimization Progress:  34%|███▍      | 41/120 [05:15<3:16:45, 149.44s/pipeline]

Generation 1 - Current best internal CV score: 0.898809523809524


Optimization Progress:  51%|█████     | 61/120 [05:30<1:14:10, 75.43s/pipeline] 

Generation 2 - Current best internal CV score: 0.898809523809524


Optimization Progress:  68%|██████▊   | 82/120 [10:42<1:15:17, 118.87s/pipeline]

Generation 3 - Current best internal CV score: 0.898809523809524


Optimization Progress:  85%|████████▌ | 102/120 [11:17<15:20, 51.12s/pipeline]  

Generation 4 - Current best internal CV score: 0.898809523809524


                                                                              

Generation 5 - Current best internal CV score: 0.898809523809524

Best pipeline: BernoulliNB(RandomForestClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.5, min_samples_leaf=4, min_samples_split=20, n_estimators=100), alpha=1.0, fit_prior=True)
0.8888888888888888


Optimization Progress:  33%|███▎      | 40/120 [00:12<02:30,  1.88s/pipeline]

Generation 1 - Current best internal CV score: 0.9261904761904762


Optimization Progress:  50%|█████     | 60/120 [00:39<05:16,  5.28s/pipeline]

Generation 2 - Current best internal CV score: 0.9261904761904762


Optimization Progress:  67%|██████▋   | 80/120 [01:03<04:09,  6.25s/pipeline]

Generation 3 - Current best internal CV score: 0.9261904761904762


Optimization Progress:  83%|████████▎ | 100/120 [01:30<02:15,  6.76s/pipeline]

Generation 4 - Current best internal CV score: 0.9261904761904762


                                                                              

Generation 5 - Current best internal CV score: 0.9261904761904762

Best pipeline: GaussianNB(input_matrix)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [01:16<11:52,  8.91s/pipeline]

Generation 1 - Current best internal CV score: 0.9083333333333334


Optimization Progress:  50%|█████     | 60/120 [02:43<11:12, 11.20s/pipeline]

Generation 2 - Current best internal CV score: 0.9083333333333334


Optimization Progress:  67%|██████▋   | 80/120 [04:58<32:14, 48.36s/pipeline]

Generation 3 - Current best internal CV score: 0.9083333333333334


Optimization Progress:  83%|████████▎ | 100/120 [05:24<09:09, 27.47s/pipeline]

Generation 4 - Current best internal CV score: 0.9083333333333334


                                                                              

Generation 5 - Current best internal CV score: 0.9083333333333334

Best pipeline: GaussianNB(input_matrix)
0.8888888888888888


Optimization Progress:  33%|███▎      | 40/120 [06:56<3:14:17, 145.71s/pipeline]

Generation 1 - Current best internal CV score: 0.8904761904761905


Optimization Progress:  50%|█████     | 60/120 [08:02<1:09:08, 69.14s/pipeline] 

Generation 2 - Current best internal CV score: 0.8904761904761905


Optimization Progress:  67%|██████▋   | 80/120 [08:36<18:03, 27.09s/pipeline]  

Generation 3 - Current best internal CV score: 0.8904761904761905


Optimization Progress:  84%|████████▍ | 101/120 [28:21<41:42, 131.73s/pipeline] 

Generation 4 - Current best internal CV score: 0.8904761904761905


                                                                               

Generation 5 - Current best internal CV score: 0.898809523809524

Best pipeline: DecisionTreeClassifier(FeatureAgglomeration(GaussianNB(RFE(input_matrix, criterion=entropy, max_features=0.6000000000000001, n_estimators=100, step=0.2)), affinity=manhattan, linkage=complete), criterion=entropy, max_depth=10, min_samples_leaf=12, min_samples_split=13)
0.9444444444444444


Optimization Progress:  33%|███▎      | 40/120 [01:02<28:47, 21.59s/pipeline]

Generation 1 - Current best internal CV score: 0.8773809523809524


Optimization Progress:  50%|█████     | 60/120 [01:35<15:29, 15.49s/pipeline]

Generation 2 - Current best internal CV score: 0.8773809523809524


Optimization Progress:  67%|██████▋   | 80/120 [02:46<17:40, 26.52s/pipeline]

Generation 3 - Current best internal CV score: 0.8773809523809524


Optimization Progress:  83%|████████▎ | 100/120 [03:04<04:45, 14.27s/pipeline]

Generation 4 - Current best internal CV score: 0.8952380952380953


                                                                              

Generation 5 - Current best internal CV score: 0.8952380952380953

Best pipeline: ExtraTreesClassifier(VarianceThreshold(PCA(input_matrix, iterated_power=1, svd_solver=randomized), threshold=0.0001), bootstrap=False, criterion=entropy, max_features=0.6500000000000001, min_samples_leaf=7, min_samples_split=16, n_estimators=100)
0.8888888888888888


Optimization Progress:  33%|███▎      | 40/120 [00:39<08:45,  6.57s/pipeline]

Generation 1 - Current best internal CV score: 0.8892857142857142


Optimization Progress:  50%|█████     | 60/120 [01:00<04:41,  4.69s/pipeline]

Generation 2 - Current best internal CV score: 0.8976190476190476


Optimization Progress:  67%|██████▋   | 80/120 [01:08<02:34,  3.86s/pipeline]

Generation 3 - Current best internal CV score: 0.8976190476190476


Optimization Progress:  83%|████████▎ | 100/120 [01:15<00:41,  2.05s/pipeline]

Generation 4 - Current best internal CV score: 0.9059523809523811


                                                                              

Generation 5 - Current best internal CV score: 0.9059523809523811

Best pipeline: ExtraTreesClassifier(RFE(input_matrix, criterion=entropy, max_features=0.1, n_estimators=100, step=0.5), bootstrap=True, criterion=gini, max_features=0.6500000000000001, min_samples_leaf=16, min_samples_split=12, n_estimators=100)
0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [01:44<39:00, 29.26s/pipeline]

Generation 1 - Current best internal CV score: 0.8833333333333334


Optimization Progress:  50%|█████     | 60/120 [02:03<11:59, 11.99s/pipeline]

Generation 2 - Current best internal CV score: 0.8833333333333334


Optimization Progress:  67%|██████▋   | 80/120 [04:44<21:42, 32.57s/pipeline]

Generation 3 - Current best internal CV score: 0.8833333333333334


Optimization Progress:  84%|████████▍ | 101/120 [09:46<32:06, 101.40s/pipeline] 

Generation 4 - Current best internal CV score: 0.8833333333333334


                                                                               

Generation 5 - Current best internal CV score: 0.8833333333333334

Best pipeline: GaussianNB(BernoulliNB(input_matrix, alpha=1.0, fit_prior=True))
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:33<05:41,  4.27s/pipeline]

Generation 1 - Current best internal CV score: 0.9095238095238095


Optimization Progress:  50%|█████     | 60/120 [00:40<02:24,  2.41s/pipeline]

Generation 2 - Current best internal CV score: 0.9095238095238095


Optimization Progress:  67%|██████▋   | 80/120 [00:43<01:13,  1.84s/pipeline]

Generation 3 - Current best internal CV score: 0.9095238095238095


Optimization Progress:  83%|████████▎ | 100/120 [00:49<00:26,  1.32s/pipeline]

Generation 4 - Current best internal CV score: 0.9095238095238095


                                                                              

Generation 5 - Current best internal CV score: 0.919047619047619

Best pipeline: GaussianNB(RFE(input_matrix, criterion=gini, max_features=0.55, n_estimators=100, step=0.9500000000000001))
0.8888888888888888


Optimization Progress:  33%|███▎      | 40/120 [03:17<1:25:41, 64.27s/pipeline]

Generation 1 - Current best internal CV score: 0.8770370370370368


Optimization Progress:  51%|█████     | 61/120 [08:23<1:15:53, 77.18s/pipeline]

Generation 2 - Current best internal CV score: 0.8770370370370368


Optimization Progress:  68%|██████▊   | 81/120 [08:50<26:06, 40.17s/pipeline]  

Generation 3 - Current best internal CV score: 0.8770370370370368


Optimization Progress:  84%|████████▍ | 101/120 [09:23<09:06, 28.77s/pipeline]

Generation 4 - Current best internal CV score: 0.8770370370370368


                                                                              

Generation 5 - Current best internal CV score: 0.8770370370370368

Best pipeline: GaussianNB(input_matrix)
0.875


Optimization Progress:  33%|███▎      | 40/120 [03:50<1:55:56, 86.96s/pipeline]

Generation 1 - Current best internal CV score: 0.8762962962962962


Optimization Progress:  50%|█████     | 60/120 [04:05<33:54, 33.90s/pipeline]  

Generation 2 - Current best internal CV score: 0.8762962962962962


Optimization Progress:  67%|██████▋   | 80/120 [04:14<06:01,  9.05s/pipeline]

Generation 3 - Current best internal CV score: 0.8762962962962962


Optimization Progress:  83%|████████▎ | 100/120 [04:32<01:54,  5.72s/pipeline]

Generation 4 - Current best internal CV score: 0.8770370370370371


                                                                              

Generation 5 - Current best internal CV score: 0.8777777777777777

Best pipeline: ExtraTreesClassifier(SelectPercentile(XGBClassifier(input_matrix, learning_rate=0.1, max_depth=7, min_child_weight=9, n_estimators=100, nthread=1, subsample=0.05), percentile=28), bootstrap=False, criterion=entropy, max_features=0.6500000000000001, min_samples_leaf=6, min_samples_split=13, n_estimators=100)


  if diff:


0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [02:22<44:53, 33.66s/pipeline]  

Generation 1 - Current best internal CV score: 0.8703703703703705


Optimization Progress:  50%|█████     | 60/120 [03:16<32:31, 32.52s/pipeline]

Generation 2 - Current best internal CV score: 0.8770370370370371


Optimization Progress:  68%|██████▊   | 81/120 [08:19<1:06:01, 101.59s/pipeline]

Generation 3 - Current best internal CV score: 0.8770370370370371


Optimization Progress:  84%|████████▍ | 101/120 [08:36<17:15, 54.49s/pipeline]  

Generation 4 - Current best internal CV score: 0.8777777777777779


                                                                              

Generation 5 - Current best internal CV score: 0.8837037037037037

Best pipeline: GaussianNB(RFE(input_matrix, criterion=entropy, max_features=0.8500000000000001, n_estimators=100, step=0.6500000000000001))
0.875


Optimization Progress:  34%|███▍      | 41/120 [06:02<2:26:04, 110.94s/pipeline]

Generation 1 - Current best internal CV score: 0.8614814814814815


Optimization Progress:  51%|█████     | 61/120 [07:23<1:01:13, 62.27s/pipeline] 

Generation 2 - Current best internal CV score: 0.8622222222222222


Optimization Progress:  68%|██████▊   | 81/120 [07:32<14:28, 22.26s/pipeline]  

Generation 3 - Current best internal CV score: 0.8681481481481482


Optimization Progress:  84%|████████▍ | 101/120 [07:42<03:33, 11.22s/pipeline]

Generation 4 - Current best internal CV score: 0.8681481481481482


                                                                               

Generation 5 - Current best internal CV score: 0.8755555555555556

Best pipeline: LogisticRegression(input_matrix, C=0.1, dual=False, penalty=l2)
0.875


Optimization Progress:  34%|███▍      | 41/120 [05:19<3:17:41, 150.15s/pipeline]

Generation 1 - Current best internal CV score: 0.8770370370370371


Optimization Progress:  51%|█████     | 61/120 [07:45<1:55:06, 117.06s/pipeline]

Generation 2 - Current best internal CV score: 0.8770370370370371


Optimization Progress:  68%|██████▊   | 82/120 [13:01<1:06:20, 104.76s/pipeline]

Generation 3 - Current best internal CV score: 0.8770370370370371


Optimization Progress:  86%|████████▌ | 103/120 [18:03<16:08, 56.99s/pipeline]  

Generation 4 - Current best internal CV score: 0.8770370370370371


                                                                              

Generation 5 - Current best internal CV score: 0.8770370370370371

Best pipeline: BernoulliNB(input_matrix, alpha=1.0, fit_prior=False)
0.8125


Optimization Progress:  34%|███▍      | 41/120 [07:39<1:30:19, 68.60s/pipeline]

Generation 1 - Current best internal CV score: 0.9103703703703705


Optimization Progress:  51%|█████     | 61/120 [08:06<35:37, 36.22s/pipeline]  

Generation 2 - Current best internal CV score: 0.9103703703703705


Optimization Progress:  68%|██████▊   | 81/120 [08:19<09:35, 14.76s/pipeline]

Generation 3 - Current best internal CV score: 0.9103703703703705


Optimization Progress:  84%|████████▍ | 101/120 [08:43<02:48,  8.88s/pipeline]

Generation 4 - Current best internal CV score: 0.9103703703703705


                                                                              

Generation 5 - Current best internal CV score: 0.9103703703703705

Best pipeline: BernoulliNB(input_matrix, alpha=0.01, fit_prior=True)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [02:36<1:36:03, 72.04s/pipeline]

Generation 1 - Current best internal CV score: 0.8896296296296295


Optimization Progress:  50%|█████     | 60/120 [02:50<27:12, 27.21s/pipeline]  

Generation 2 - Current best internal CV score: 0.8896296296296295


Optimization Progress:  67%|██████▋   | 80/120 [03:01<13:14, 19.86s/pipeline]

Generation 3 - Current best internal CV score: 0.8896296296296295


Optimization Progress:  83%|████████▎ | 100/120 [03:09<03:37, 10.88s/pipeline]

Generation 4 - Current best internal CV score: 0.8896296296296295


                                                                              

Generation 5 - Current best internal CV score: 0.8896296296296295

Best pipeline: LinearSVC(input_matrix, C=10.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.0001)
0.8541666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:42<23:22, 17.53s/pipeline]

Generation 1 - Current best internal CV score: 0.9044444444444444


Optimization Progress:  50%|█████     | 60/120 [01:55<13:40, 13.68s/pipeline]

Generation 2 - Current best internal CV score: 0.9044444444444444


Optimization Progress:  67%|██████▋   | 80/120 [04:04<19:17, 28.93s/pipeline]

Generation 3 - Current best internal CV score: 0.9044444444444444


Optimization Progress:  83%|████████▎ | 100/120 [04:19<06:08, 18.42s/pipeline]

Generation 4 - Current best internal CV score: 0.9044444444444444


                                                                              

Generation 5 - Current best internal CV score: 0.9044444444444444

Best pipeline: GaussianNB(input_matrix)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:24<07:12,  5.41s/pipeline]

Generation 1 - Current best internal CV score: 0.8837037037037037


Optimization Progress:  50%|█████     | 60/120 [00:38<06:34,  6.58s/pipeline]

Generation 2 - Current best internal CV score: 0.8896296296296298


Optimization Progress:  67%|██████▋   | 80/120 [00:56<05:40,  8.52s/pipeline]

Generation 3 - Current best internal CV score: 0.902962962962963


Optimization Progress:  84%|████████▍ | 101/120 [06:37<12:04, 38.14s/pipeline]

Generation 4 - Current best internal CV score: 0.9051851851851852


                                                                              

Generation 5 - Current best internal CV score: 0.9051851851851852

Best pipeline: GaussianNB(RFE(input_matrix, criterion=entropy, max_features=0.05, n_estimators=100, step=0.1))
0.875


Optimization Progress:  35%|███▌      | 42/120 [05:18<1:07:33, 51.97s/pipeline]

Generation 1 - Current best internal CV score: 0.8844444444444445


Optimization Progress:  52%|█████▏    | 62/120 [05:31<28:09, 29.13s/pipeline]  

Generation 2 - Current best internal CV score: 0.8844444444444445


Optimization Progress:  68%|██████▊   | 82/120 [06:06<12:44, 20.12s/pipeline]

Generation 3 - Current best internal CV score: 0.8844444444444445


Optimization Progress:  85%|████████▌ | 102/120 [06:15<02:45,  9.20s/pipeline]

Generation 4 - Current best internal CV score: 0.8844444444444445


                                                                              

Generation 5 - Current best internal CV score: 0.8844444444444445

Best pipeline: BernoulliNB(input_matrix, alpha=0.1, fit_prior=False)
0.8541666666666666


Optimization Progress:  35%|███▌      | 42/120 [10:06<5:11:11, 239.38s/pipeline]

Generation 1 - Current best internal CV score: 0.8384126984126985


Optimization Progress:  52%|█████▏    | 62/120 [11:41<2:20:52, 145.73s/pipeline]

Generation 2 - Current best internal CV score: 0.8384126984126985


Optimization Progress:  69%|██████▉   | 83/120 [16:51<1:12:43, 117.92s/pipeline]

Generation 3 - Current best internal CV score: 0.8384126984126985


Optimization Progress:  86%|████████▌ | 103/120 [17:09<11:57, 42.22s/pipeline]  

Generation 4 - Current best internal CV score: 0.8384126984126985


                                                                              

Generation 5 - Current best internal CV score: 0.8473015873015873

Best pipeline: LinearSVC(RFE(input_matrix, criterion=entropy, max_features=0.3, n_estimators=100, step=0.7500000000000001), C=0.1, dual=True, loss=squared_hinge, penalty=l2, tol=0.001)
0.75


Optimization Progress:  33%|███▎      | 40/120 [01:31<24:42, 18.54s/pipeline]

Generation 1 - Current best internal CV score: 0.8298412698412699


Optimization Progress:  51%|█████     | 61/120 [06:32<1:37:38, 99.30s/pipeline]

Generation 2 - Current best internal CV score: 0.8384126984126985


Optimization Progress:  68%|██████▊   | 81/120 [08:07<40:38, 62.53s/pipeline]  

Generation 3 - Current best internal CV score: 0.8387301587301588


Optimization Progress:  84%|████████▍ | 101/120 [08:27<10:35, 33.43s/pipeline]

Generation 4 - Current best internal CV score: 0.8482539682539683


                                                                               

Generation 5 - Current best internal CV score: 0.8561904761904762

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=25.0, dual=True, penalty=l2)
0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [01:03<14:30, 10.88s/pipeline]

Generation 1 - Current best internal CV score: 0.8285714285714285


Optimization Progress:  51%|█████     | 61/120 [06:12<1:36:02, 97.67s/pipeline]

Generation 2 - Current best internal CV score: 0.8285714285714285


Optimization Progress:  68%|██████▊   | 81/120 [07:05<25:20, 38.98s/pipeline]  

Generation 3 - Current best internal CV score: 0.847936507936508


Optimization Progress:  84%|████████▍ | 101/120 [07:49<10:03, 31.74s/pipeline]

Generation 4 - Current best internal CV score: 0.847936507936508


                                                                               

Generation 5 - Current best internal CV score: 0.847936507936508

Best pipeline: XGBClassifier(LinearSVC(input_matrix, C=0.5, dual=True, loss=hinge, penalty=l2, tol=0.01), learning_rate=0.1, max_depth=4, min_child_weight=11, n_estimators=100, nthread=1, subsample=0.9500000000000001)


  if diff:


0.8055555555555556


Optimization Progress:  33%|███▎      | 40/120 [02:06<39:55, 29.95s/pipeline]  

Generation 1 - Current best internal CV score: 0.8517460317460317


Optimization Progress:  50%|█████     | 60/120 [02:29<24:18, 24.31s/pipeline]

Generation 2 - Current best internal CV score: 0.8517460317460317


Optimization Progress:  67%|██████▋   | 80/120 [03:27<19:28, 29.22s/pipeline]

Generation 3 - Current best internal CV score: 0.8517460317460317


Optimization Progress:  84%|████████▍ | 101/120 [08:30<33:14, 105.00s/pipeline] 

Generation 4 - Current best internal CV score: 0.8517460317460317


                                                                               

Generation 5 - Current best internal CV score: 0.8517460317460317

Best pipeline: GaussianNB(input_matrix)
0.7777777777777778


Optimization Progress:  35%|███▌      | 42/120 [10:06<5:11:15, 239.43s/pipeline]

Generation 1 - Current best internal CV score: 0.8663492063492063


Optimization Progress:  52%|█████▏    | 62/120 [10:28<1:23:42, 86.60s/pipeline] 

Generation 2 - Current best internal CV score: 0.8752380952380954


Optimization Progress:  68%|██████▊   | 82/120 [10:45<40:01, 63.19s/pipeline]  

Generation 3 - Current best internal CV score: 0.8752380952380954


Optimization Progress:  86%|████████▌ | 103/120 [15:47<34:24, 121.45s/pipeline] 

Generation 4 - Current best internal CV score: 0.8752380952380954


                                                                               

Generation 5 - Current best internal CV score: 0.8796825396825397

Best pipeline: LogisticRegression(XGBClassifier(LinearSVC(input_matrix, C=10.0, dual=False, loss=squared_hinge, penalty=l1, tol=1e-05), learning_rate=0.01, max_depth=10, min_child_weight=14, n_estimators=100, nthread=1, subsample=0.05), C=15.0, dual=True, penalty=l2)


  if diff:


0.8055555555555556


Optimization Progress:  36%|███▌      | 43/120 [10:09<5:08:34, 240.45s/pipeline]

Generation 1 - Current best internal CV score: 0.8561904761904762


Optimization Progress:  53%|█████▎    | 64/120 [15:21<1:37:18, 104.26s/pipeline]

Generation 2 - Current best internal CV score: 0.8561904761904762


Optimization Progress:  70%|███████   | 84/120 [15:39<44:51, 74.77s/pipeline]   

Generation 3 - Current best internal CV score: 0.8561904761904762


Optimization Progress:  87%|████████▋ | 104/120 [15:56<07:17, 27.37s/pipeline]

Generation 4 - Current best internal CV score: 0.8653968253968254


                                                                              

Generation 5 - Current best internal CV score: 0.8653968253968254

Best pipeline: GaussianNB(LinearSVC(GaussianNB(input_matrix), C=20.0, dual=True, loss=hinge, penalty=l2, tol=0.1))
0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:52<29:46, 22.33s/pipeline] 

Generation 1 - Current best internal CV score: 0.8565079365079364


Optimization Progress:  50%|█████     | 60/120 [01:18<09:26,  9.44s/pipeline]

Generation 2 - Current best internal CV score: 0.8565079365079364


Optimization Progress:  67%|██████▋   | 80/120 [01:49<05:56,  8.92s/pipeline]

Generation 3 - Current best internal CV score: 0.8565079365079364


Optimization Progress:  83%|████████▎ | 100/120 [02:06<03:08,  9.41s/pipeline]

Generation 4 - Current best internal CV score: 0.8565079365079364


                                                                              

Generation 5 - Current best internal CV score: 0.8565079365079364

Best pipeline: BernoulliNB(input_matrix, alpha=0.1, fit_prior=False)
0.8405797101449275


Optimization Progress:  33%|███▎      | 40/120 [03:31<1:52:11, 84.15s/pipeline] 

Generation 1 - Current best internal CV score: 0.8657142857142857


Optimization Progress:  50%|█████     | 60/120 [03:56<34:02, 34.04s/pipeline]  

Generation 2 - Current best internal CV score: 0.8657142857142857


Optimization Progress:  68%|██████▊   | 81/120 [09:44<1:23:17, 128.13s/pipeline]

Generation 3 - Current best internal CV score: 0.8657142857142857


Optimization Progress:  84%|████████▍ | 101/120 [10:18<20:56, 66.12s/pipeline]  

Generation 4 - Current best internal CV score: 0.8657142857142857


                                                                              

Generation 5 - Current best internal CV score: 0.8663492063492063

Best pipeline: LogisticRegression(MaxAbsScaler(input_matrix), C=25.0, dual=False, penalty=l2)
0.8115942028985508


Optimization Progress:  33%|███▎      | 40/120 [05:04<2:37:19, 117.99s/pipeline]

Generation 1 - Current best internal CV score: 0.8425396825396826


Optimization Progress:  50%|█████     | 60/120 [05:29<59:34, 59.57s/pipeline]   

Generation 2 - Current best internal CV score: 0.8434920634920635


Optimization Progress:  67%|██████▋   | 80/120 [05:41<15:47, 23.70s/pipeline]

Generation 3 - Current best internal CV score: 0.846984126984127


Optimization Progress:  83%|████████▎ | 100/120 [05:53<03:47, 11.40s/pipeline]

Generation 4 - Current best internal CV score: 0.8482539682539683


                                                                              

Generation 5 - Current best internal CV score: 0.8482539682539683

Best pipeline: LogisticRegression(input_matrix, C=1.0, dual=True, penalty=l2)
0.8260869565217391


Optimization Progress:  33%|███▎      | 40/120 [00:32<15:57, 11.97s/pipeline]

Generation 1 - Current best internal CV score: 0.8333333333333334


Optimization Progress:  50%|█████     | 60/120 [01:03<14:28, 14.47s/pipeline]

Generation 2 - Current best internal CV score: 0.8333333333333334


Optimization Progress:  67%|██████▋   | 80/120 [03:22<32:28, 48.71s/pipeline]

Generation 3 - Current best internal CV score: 0.8333333333333334


Optimization Progress:  84%|████████▍ | 101/120 [08:26<36:18, 114.64s/pipeline] 

Generation 4 - Current best internal CV score: 0.834920634920635


                                                                               

Generation 5 - Current best internal CV score: 0.834920634920635

Best pipeline: GaussianNB(RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.05, min_samples_leaf=11, min_samples_split=3, n_estimators=100))
0.8611111111111112


In [6]:
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)
    
    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'
    #=============================================================================================================
    #do the learning
    # TODO: extract method
    with open(folder + fileName) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    # content = [x.strip() for x in content] 

    #  or 'exported_pipeline = ' not in line
    cleanedContent = []
    for line in content:
        if 'tpot_data' not in line and 'training_target, testing_target' not in line:
            cleanedContent.append(line)
    #         content.remove(line)
    
    fileNameCleaned = folder + 'cleaned_' + fileName
    with open(fileNameCleaned, 'w') as filehandle:  
        for line in cleanedContent:
            filehandle.write('%s\n' % line)
            
    %run -i $fileNameCleaned
    
    # print the accuracy
    # TODO: extract method
    # TODO: print all accurracy values to one file
    num_matches = 0;
    for a, b in zip(testing_target, results):
        if a == b:
            num_matches = num_matches + 1
        else:
            unmatched.append((a,b))
    print('Number of matches:',num_matches,'(of',testing_target.size,')')

    accuracy = num_matches/testing_target.size*100
    print('Accuary: ',accuracy)
    #=============================================================================================================
    
    #=============================================================================================================
    #collect all accuracy values to plot them
    collectAccuracies.append(accuracy)

    offset = offset + 1

Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 23 (of 24 )
Accuary:  95.83333333333334
Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 23 (of 24 )
Accuary:  95.83333333333334
Number of matches: 23 (of 24 )
Accuary:  95.83333333333334
Number of matches: 23 (of 24 )
Accuary:  95.83333333333334
Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 24 (of 24 )
Accuary:  100.0
Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 31 (of 36 )
Accuary:  86.11111111111111
Number of matches: 29 (of 36 )
Accuary:  80.55555555555556
Number of matches: 32 (of 36 )
Accuary:  88.88888888888889
Number of matches: 33 (of 36 )
Accuary:  91.66666666666666
Number of matches: 32 (of 36 )
Accuary:  88.88888888888889
Number of matches: 34 (of 36 )
Accuary:  94.44444444444444
Number of matches: 33 (of 36 )
Accuary:  91.66666666666666
Number of

  if diff:
  f = msb / msw
  if diff:


Number of matches: 38 (of 48 )
Accuary:  79.16666666666666
Number of matches: 40 (of 48 )
Accuary:  83.33333333333334
Number of matches: 42 (of 48 )
Accuary:  87.5
Number of matches: 39 (of 48 )
Accuary:  81.25
Number of matches: 44 (of 48 )
Accuary:  91.66666666666666
Number of matches: 41 (of 48 )
Accuary:  85.41666666666666
Number of matches: 44 (of 48 )
Accuary:  91.66666666666666
Number of matches: 42 (of 48 )
Accuary:  87.5
Number of matches: 41 (of 48 )
Accuary:  85.41666666666666
Number of matches: 55 (of 72 )
Accuary:  76.38888888888889
Number of matches: 60 (of 72 )
Accuary:  83.33333333333334


  if diff:


Number of matches: 58 (of 72 )
Accuary:  80.55555555555556
Number of matches: 56 (of 72 )
Accuary:  77.77777777777779


  if diff:
  if diff:


Number of matches: 58 (of 72 )
Accuary:  80.55555555555556
Number of matches: 60 (of 72 )
Accuary:  83.33333333333334
Number of matches: 58 (of 69 )
Accuary:  84.05797101449275
Number of matches: 56 (of 69 )
Accuary:  81.15942028985508
Number of matches: 57 (of 69 )
Accuary:  82.6086956521739
Number of matches: 61 (of 72 )
Accuary:  84.72222222222221


In [7]:
print(len(unmatched))
countComprehensionRest = 0
countComprehensionSyntax = 0
countRestSyntax = 0
for x in unmatched:
    if x == (0,1) or x == (1,0):
        countComprehensionRest = countComprehensionRest + 1
    if x == (0,2) or x == (2,0):
        countComprehensionSyntax = countComprehensionSyntax + 1
    if x == (1,2) or x == (2,1):
        countRestSyntax = countRestSyntax + 1
print('Comprehension/Rest',countComprehensionRest)
print('Comprehension/Syntax',countComprehensionSyntax)
print('Rest/Syntax',countRestSyntax)

254
Comprehension/Rest 25
Comprehension/Syntax 143
Rest/Syntax 86


In [8]:
#input: alist: the list to split, wanted_parts: into how many parts it should be split
#returns: all split lists
def split_list(alist, wanted_parts=1):
    length = len(alist)
    return [ alist[i*length // wanted_parts: (i+1)*length // wanted_parts] 
             for i in range(wanted_parts) ]


In [9]:
list1, list2, list3, list4 = split_list(collectAccuracies,4)
frames = [list1, list2, list3, list4]
collectedAccuracies = pd.DataFrame(frames).T

collectedAccuracies.columns = ['2_groups_deact','3_groups_deact','4_groups_deact','6_groups_deact']

collectedAccuracies.to_csv('triangulate_Deact.csv',sep=';',decimal='.', index=False)