In [1]:
import os

import numpy as np
# Pandas is used for data manipulation
import pandas as pd
from tpot import TPOTClassifier

# Load the raw data set (semicolon-separated, '.' as decimal mark)
z_data = pd.read_csv('data_task_groups_roi_deact.csv', sep=';', decimal='.')

# Remove every column whose name matches '_deact', then the three columns
# the notebook treats as non-numeric ('scan', 'trial', 'response')
z_data = (
    z_data
    .drop(columns=list(z_data.filter(regex='_deact')))
    .drop(columns=['scan', 'trial', 'response'])
)

# Recode the task labels to integers so that TPOT can handle them
# (alphabetical order: C -> 0, R -> 1, S -> 2)
z_data['task'] = z_data['task'].replace({'C': 0, 'R': 1, 'S': 2})

# Split configuration shared by the cells below
numLabels = 3      # number of distinct task labels (C, R, S)
sizeTrainSet = 12  # number of participants used for training
sizeTestSet = 4    # number of participants used for testing
numParticipants = sizeTrainSet + sizeTestSet

In [2]:
#group the data and get the labels after the grouping
#return the grouped data and the labels
def prepare(data, groupingColumns):
    """Average ``data`` within each group and return the result with its labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to aggregate. Every column that is not a grouping key is
        averaged, so all remaining columns are expected to be numeric.
    groupingColumns : list of str
        Columns to group by. Must contain ``'task'``, because the labels are
        read from the index level of that name.

    Returns
    -------
    groupedAgg : pandas.DataFrame
        Group-wise means, indexed by the grouping columns.
    labels : pandas.Index
        The ``'task'`` value of every row of ``groupedAgg``, in row order.

    Raises
    ------
    KeyError
        If ``'task'`` is not one of the resulting index levels.
    """
    # Built-in group-wise mean instead of aggregate(np.mean): recent pandas
    # versions deprecate passing NumPy callables to aggregate, and the
    # resulting values are the same.
    groupedAgg = data.groupby(groupingColumns).mean()
    labels = groupedAgg.index.get_level_values('task')

    return groupedAgg, labels

In [3]:
#split according to participants (first 12 and last 4)
#return all four sets (much like train_test_split from sklearn)
#TODO: introduce variation (see file coarseAverageParticipantSplit)

def split(features, labels, low, high):
    """Split rows positionally into a training part and a testing part.

    Rows ``[0, low)`` become the training set and rows ``[low, high)`` the
    testing set; rows from ``high`` onwards are discarded. The split is
    purely positional, so the caller is responsible for the row order.

    Parameters
    ----------
    features : array-like (e.g. pandas.DataFrame)
        Feature rows; converted to a ``numpy.ndarray``.
    labels : array-like (e.g. pandas.Index)
        One label per feature row; converted to a ``numpy.ndarray``.
    low : int
        Index of the first testing row (exclusive end of the training rows).
    high : int
        Exclusive end of the testing rows.

    Returns
    -------
    training_features, testing_features, training_target, testing_target
        Four ``numpy.ndarray`` objects, in the same order as sklearn's
        ``train_test_split``.

    Notes
    -----
    NumPy slicing clamps out-of-range bounds silently, so a ``high`` larger
    than the number of rows yields a shorter testing set instead of an error.
    """
    # np.array makes a copy, so the returned arrays never alias the caller's data
    featureArray = np.array(features)
    labelArray = np.array(labels)

    # Slices of ndarrays are already ndarrays; no further conversion is needed
    training_features = featureArray[:low]
    testing_features = featureArray[low:high]

    training_target = labelArray[:low]
    testing_target = labelArray[low:high]

    return training_features, testing_features, training_target, testing_target

In [4]:
# assumes that the column names have the structure xxx_number_xxx, e.g., aggr_13_groups0
def findIndices(name):
    """Compute the train/test row boundaries for a grouping column.

    The second '_'-separated token of ``name`` is parsed as the number of
    groups (13 for 'aggr_13_groups0'). The boundaries assume that every
    participant contributes ``number of groups * numLabels`` rows.

    Returns a tuple ``(low, high)``: ``low`` is the row count of the first
    ``sizeTrainSet`` participants, ``high`` that of all ``numParticipants``.
    """
    groupCount = int(name.split('_')[1])
    trainRows = sizeTrainSet * groupCount * numLabels
    totalRows = numParticipants * groupCount * numLabels
    return trainRows, totalRows



In [5]:
# Columns that every grouping shares; the third grouping key is the
# aggregation column that the loop below iterates over.
groupingColumn1 = 'proband'
groupingColumn2 = 'task'
groupingColumn3 = ''  # not referenced in this cell

# Number of aggregation variants and aggregation levels
# (kept for reference; not referenced in this cell)
numVariationsPerAggLevel = 10
numAggLevels = 40
numAggregationVariants = numVariationsPerAggLevel * numAggLevels

# Fixed seed so that TPOT's stochastic pipeline search can be repeated
tpotRandomState = 42

# Index of the first column that contains grouping labels; every column from
# here to the end of the frame is used as a grouping column in turn
offset = z_data.columns.get_loc('aggr_2_groups0')

# Test-set accuracy of the best pipeline, one entry per grouping column
collectAccuracies = []

# Placeholder for grouping columns that could not be handled
# (never filled by this cell)
unmatched = []

# Create the export folder up front so that a missing directory cannot
# make tpot.export fail after an expensive optimisation run
folder = 'ROIs/'
os.makedirs(folder, exist_ok=True)

# For each grouping column: aggregate the data, split it by participant,
# let TPOT search for a pipeline, score it and export the best pipeline.
# NOTE(review): prepare() averages every remaining column, so the other
# aggr_* grouping columns end up among the features - confirm this is intended.
for groupingColumn in z_data.columns[offset:]:
    # group the data set
    columnsForGrouping = [groupingColumn1, groupingColumn2, groupingColumn]
    features, labels = prepare(z_data, columnsForGrouping)

    # derive the row boundaries from the column name, then split
    low, high = findIndices(groupingColumn)
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)

    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + groupingColumn + '.py'

    # run TPOT
    # TODO: extract method
    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                          n_jobs=20, random_state=tpotRandomState)
    tpot.fit(training_features, training_target)

    # keep the score for later analysis in addition to printing it
    accuracy = tpot.score(testing_features, testing_target)
    collectAccuracies.append(accuracy)
    print(accuracy)

    tpot.export(folder + fileName)

Optimization Progress:  33%|███▎      | 40/120 [00:06<01:47,  1.35s/pipeline]

Generation 1 - Current best internal CV score: 0.8200000000000001


Optimization Progress:  50%|█████     | 60/120 [00:15<01:30,  1.50s/pipeline]

Generation 2 - Current best internal CV score: 0.8766666666666667


Optimization Progress:  67%|██████▋   | 80/120 [00:55<08:16, 12.42s/pipeline]

Generation 3 - Current best internal CV score: 0.8766666666666667


Optimization Progress:  83%|████████▎ | 100/120 [01:01<01:16,  3.82s/pipeline]

Generation 4 - Current best internal CV score: 0.8766666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.8766666666666667

Best pipeline: LinearSVC(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.45, min_samples_leaf=7, min_samples_split=7, n_estimators=100), C=10.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.01)
0.8333333333333334


Optimization Progress:  34%|███▍      | 41/120 [05:16<3:16:57, 149.59s/pipeline]

Generation 1 - Current best internal CV score: 0.8266666666666665


Optimization Progress:  51%|█████     | 61/120 [05:27<1:14:29, 75.76s/pipeline] 

Generation 2 - Current best internal CV score: 0.8266666666666665


Optimization Progress:  68%|██████▊   | 81/120 [07:11<44:10, 67.97s/pipeline]  

Generation 3 - Current best internal CV score: 0.8266666666666665


Optimization Progress:  84%|████████▍ | 101/120 [07:16<10:48, 34.13s/pipeline]

Generation 4 - Current best internal CV score: 0.8266666666666665


                                                                              

Generation 5 - Current best internal CV score: 0.8300000000000001

Best pipeline: LogisticRegression(StandardScaler(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.4, min_samples_leaf=10, min_samples_split=8, n_estimators=100)), C=25.0, dual=True, penalty=l2)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:06<02:01,  1.51s/pipeline]

Generation 1 - Current best internal CV score: 0.8033333333333333


Optimization Progress:  50%|█████     | 60/120 [00:10<00:45,  1.32pipeline/s]

Generation 2 - Current best internal CV score: 0.8033333333333333


Optimization Progress:  67%|██████▋   | 80/120 [00:23<00:36,  1.11pipeline/s]

Generation 3 - Current best internal CV score: 0.8100000000000002


Optimization Progress:  83%|████████▎ | 100/120 [00:27<00:12,  1.59pipeline/s]

Generation 4 - Current best internal CV score: 0.8166666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.8166666666666667

Best pipeline: LogisticRegression(RobustScaler(Normalizer(input_matrix, norm=l1)), C=20.0, dual=True, penalty=l2)
0.875


Optimization Progress:  33%|███▎      | 40/120 [00:14<06:30,  4.88s/pipeline]

Generation 1 - Current best internal CV score: 0.8866666666666667


Optimization Progress:  50%|█████     | 60/120 [00:27<05:53,  5.89s/pipeline]

Generation 2 - Current best internal CV score: 0.8866666666666667


Optimization Progress:  67%|██████▋   | 80/120 [00:44<05:02,  7.56s/pipeline]

Generation 3 - Current best internal CV score: 0.8866666666666667


Optimization Progress:  83%|████████▎ | 100/120 [00:56<02:20,  7.03s/pipeline]

Generation 4 - Current best internal CV score: 0.8866666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9166666666666667

Best pipeline: RandomForestClassifier(Normalizer(input_matrix, norm=l2), bootstrap=False, criterion=gini, max_features=0.1, min_samples_leaf=5, min_samples_split=6, n_estimators=100)
0.875


Optimization Progress:  33%|███▎      | 40/120 [00:18<02:38,  1.98s/pipeline]

Generation 1 - Current best internal CV score: 0.7833333333333333


Optimization Progress:  50%|█████     | 60/120 [00:23<01:57,  1.96s/pipeline]

Generation 2 - Current best internal CV score: 0.7833333333333333


Optimization Progress:  67%|██████▋   | 80/120 [00:27<01:17,  1.94s/pipeline]

Generation 3 - Current best internal CV score: 0.7833333333333333


Optimization Progress:  83%|████████▎ | 100/120 [00:34<00:32,  1.61s/pipeline]

Generation 4 - Current best internal CV score: 0.8099999999999999


                                                                              

Generation 5 - Current best internal CV score: 0.8099999999999999

Best pipeline: GaussianNB(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.8500000000000001, min_samples_leaf=6, min_samples_split=6, n_estimators=100))
0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:10<02:05,  1.56s/pipeline]

Generation 1 - Current best internal CV score: 0.76


Optimization Progress:  50%|█████     | 60/120 [01:57<07:56,  7.95s/pipeline]

Generation 2 - Current best internal CV score: 0.85


Optimization Progress:  67%|██████▋   | 80/120 [02:02<03:57,  5.95s/pipeline]

Generation 3 - Current best internal CV score: 0.85


Optimization Progress:  83%|████████▎ | 100/120 [02:07<01:09,  3.49s/pipeline]

Generation 4 - Current best internal CV score: 0.85


                                                                              

Generation 5 - Current best internal CV score: 0.85

Best pipeline: LogisticRegression(input_matrix, C=20.0, dual=True, penalty=l2)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:06<02:34,  1.94s/pipeline]

Generation 1 - Current best internal CV score: 0.7666666666666667


Optimization Progress:  50%|█████     | 60/120 [00:15<01:48,  1.81s/pipeline]

Generation 2 - Current best internal CV score: 0.7699999999999999


Optimization Progress:  67%|██████▋   | 80/120 [00:17<00:38,  1.05pipeline/s]

Generation 3 - Current best internal CV score: 0.8


Optimization Progress:  83%|████████▎ | 100/120 [00:21<00:16,  1.25pipeline/s]

Generation 4 - Current best internal CV score: 0.8066666666666666


                                                                              

Generation 5 - Current best internal CV score: 0.82

Best pipeline: LogisticRegression(input_matrix, C=1.0, dual=False, penalty=l1)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:16<06:07,  4.60s/pipeline]

Generation 1 - Current best internal CV score: 0.7933333333333333


Optimization Progress:  50%|█████     | 60/120 [02:37<30:20, 30.35s/pipeline]

Generation 2 - Current best internal CV score: 0.7933333333333333


Optimization Progress:  67%|██████▋   | 80/120 [02:42<14:27, 21.70s/pipeline]

Generation 3 - Current best internal CV score: 0.7933333333333333


Optimization Progress:  83%|████████▎ | 100/120 [02:48<03:43, 11.19s/pipeline]

Generation 4 - Current best internal CV score: 0.8099999999999999


                                                                              

Generation 5 - Current best internal CV score: 0.86

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.5, max_depth=5, max_features=0.05, min_samples_leaf=13, min_samples_split=6, n_estimators=100, subsample=0.9000000000000001)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:54<22:38, 16.99s/pipeline]

Generation 1 - Current best internal CV score: 0.8733333333333333


Optimization Progress:  50%|█████     | 60/120 [01:18<11:48, 11.81s/pipeline]

Generation 2 - Current best internal CV score: 0.8733333333333333


Optimization Progress:  67%|██████▋   | 80/120 [03:52<19:16, 28.91s/pipeline]

Generation 3 - Current best internal CV score: 0.8733333333333333


Optimization Progress:  85%|████████▌ | 102/120 [08:55<17:51, 59.54s/pipeline]

Generation 4 - Current best internal CV score: 0.8733333333333333


                                                                              

Generation 5 - Current best internal CV score: 0.8733333333333333

Best pipeline: LinearSVC(input_matrix, C=10.0, dual=True, loss=hinge, penalty=l2, tol=0.0001)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:20<08:24,  6.31s/pipeline]

Generation 1 - Current best internal CV score: 0.8066666666666666


Optimization Progress:  50%|█████     | 60/120 [00:26<03:22,  3.38s/pipeline]

Generation 2 - Current best internal CV score: 0.8066666666666666


Optimization Progress:  67%|██████▋   | 80/120 [00:32<01:42,  2.57s/pipeline]

Generation 3 - Current best internal CV score: 0.8133333333333332


Optimization Progress:  83%|████████▎ | 100/120 [01:45<07:26, 22.32s/pipeline]

Generation 4 - Current best internal CV score: 0.8233333333333335


                                                                              

Generation 5 - Current best internal CV score: 0.8233333333333335

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.15000000000000002, min_samples_leaf=1, min_samples_split=5, n_estimators=100)
0.9583333333333334


Optimization Progress:  34%|███▍      | 41/120 [05:06<2:00:39, 91.64s/pipeline]

Generation 1 - Current best internal CV score: 0.7892857142857143


Optimization Progress:  51%|█████     | 61/120 [05:16<46:27, 47.24s/pipeline]  

Generation 2 - Current best internal CV score: 0.7976190476190477


Optimization Progress:  68%|██████▊   | 81/120 [07:47<39:39, 61.00s/pipeline]

Generation 3 - Current best internal CV score: 0.7976190476190477


Optimization Progress:  84%|████████▍ | 101/120 [11:36<31:10, 98.47s/pipeline] 

Generation 4 - Current best internal CV score: 0.7976190476190477


                                                                              

Generation 5 - Current best internal CV score: 0.7976190476190477

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=3, max_features=0.45, min_samples_leaf=14, min_samples_split=18, n_estimators=100, subsample=0.9500000000000001)
0.9444444444444444


Optimization Progress:  33%|███▎      | 40/120 [03:18<22:44, 17.06s/pipeline]  

Generation 1 - Current best internal CV score: 0.7773809523809524


Optimization Progress:  51%|█████     | 61/120 [08:21<41:30, 42.22s/pipeline]

Generation 2 - Current best internal CV score: 0.786904761904762


Optimization Progress:  68%|██████▊   | 81/120 [10:44<40:59, 63.05s/pipeline]  

Generation 3 - Current best internal CV score: 0.786904761904762


Optimization Progress:  85%|████████▌ | 102/120 [15:47<22:48, 76.01s/pipeline]

Generation 4 - Current best internal CV score: 0.8035714285714286


                                                                              

Generation 5 - Current best internal CV score: 0.8035714285714286

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=100)
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:55<21:16, 15.96s/pipeline]

Generation 1 - Current best internal CV score: 0.8071428571428572


Optimization Progress:  50%|█████     | 60/120 [01:24<12:11, 12.20s/pipeline]

Generation 2 - Current best internal CV score: 0.8071428571428572


Optimization Progress:  67%|██████▋   | 80/120 [03:11<25:07, 37.69s/pipeline]

Generation 3 - Current best internal CV score: 0.8178571428571428


Optimization Progress:  83%|████████▎ | 100/120 [03:16<06:31, 19.59s/pipeline]

Generation 4 - Current best internal CV score: 0.8178571428571428


                                                                              

Generation 5 - Current best internal CV score: 0.8178571428571428

Best pipeline: LogisticRegression(LogisticRegression(input_matrix, C=0.01, dual=True, penalty=l2), C=0.1, dual=True, penalty=l2)
0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:13<04:15,  3.19s/pipeline]

Generation 1 - Current best internal CV score: 0.8071428571428572


Optimization Progress:  50%|█████     | 60/120 [00:58<10:54, 10.91s/pipeline]

Generation 2 - Current best internal CV score: 0.8178571428571428


Optimization Progress:  67%|██████▋   | 80/120 [01:45<14:29, 21.73s/pipeline]

Generation 3 - Current best internal CV score: 0.8178571428571428


Optimization Progress:  83%|████████▎ | 100/120 [02:19<05:33, 16.70s/pipeline]

Generation 4 - Current best internal CV score: 0.8178571428571428


                                                                              

Generation 5 - Current best internal CV score: 0.8178571428571428

Best pipeline: ExtraTreesClassifier(ZeroCount(input_matrix), bootstrap=False, criterion=gini, max_features=0.8, min_samples_leaf=4, min_samples_split=18, n_estimators=100)
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:44<13:52, 10.41s/pipeline]

Generation 1 - Current best internal CV score: 0.8142857142857143


Optimization Progress:  50%|█████     | 60/120 [02:55<41:25, 41.43s/pipeline]

Generation 2 - Current best internal CV score: 0.8142857142857143


Optimization Progress:  67%|██████▋   | 80/120 [03:02<14:41, 22.04s/pipeline]

Generation 3 - Current best internal CV score: 0.8142857142857143


Optimization Progress:  83%|████████▎ | 100/120 [03:09<03:54, 11.72s/pipeline]

Generation 4 - Current best internal CV score: 0.8142857142857143


                                                                              

Generation 5 - Current best internal CV score: 0.8142857142857143

Best pipeline: LinearSVC(RandomForestClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.2, min_samples_leaf=13, min_samples_split=12, n_estimators=100), C=0.1, dual=True, loss=hinge, penalty=l2, tol=1e-05)
0.75


Optimization Progress:  33%|███▎      | 40/120 [03:16<1:20:48, 60.61s/pipeline]

Generation 1 - Current best internal CV score: 0.7678571428571428


Optimization Progress:  50%|█████     | 60/120 [03:31<33:57, 33.96s/pipeline]  

Generation 2 - Current best internal CV score: 0.8178571428571428


Optimization Progress:  67%|██████▋   | 80/120 [04:05<17:51, 26.78s/pipeline]

Generation 3 - Current best internal CV score: 0.8178571428571428


Optimization Progress:  84%|████████▍ | 101/120 [09:09<18:28, 58.34s/pipeline]

Generation 4 - Current best internal CV score: 0.8178571428571428


                                                                              

Generation 5 - Current best internal CV score: 0.8178571428571428

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=25.0, dual=False, penalty=l1)
0.8611111111111112


Optimization Progress:  33%|███▎      | 40/120 [01:14<19:17, 14.47s/pipeline]

Generation 1 - Current best internal CV score: 0.8273809523809523


Optimization Progress:  50%|█████     | 60/120 [01:23<07:14,  7.24s/pipeline]

Generation 2 - Current best internal CV score: 0.8452380952380952


Optimization Progress:  67%|██████▋   | 80/120 [02:06<10:32, 15.82s/pipeline]

Generation 3 - Current best internal CV score: 0.8452380952380952


Optimization Progress:  83%|████████▎ | 100/120 [02:22<02:59,  8.96s/pipeline]

Generation 4 - Current best internal CV score: 0.8452380952380952


                                                                              

Generation 5 - Current best internal CV score: 0.8452380952380952

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.05, min_samples_leaf=3, min_samples_split=6, n_estimators=100)
0.8333333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:42<17:55, 13.44s/pipeline]

Generation 1 - Current best internal CV score: 0.7964285714285715


Optimization Progress:  50%|█████     | 60/120 [01:50<26:37, 26.63s/pipeline]

Generation 2 - Current best internal CV score: 0.805952380952381


Optimization Progress:  68%|██████▊   | 81/120 [06:55<1:07:49, 104.36s/pipeline]

Generation 3 - Current best internal CV score: 0.805952380952381


Optimization Progress:  84%|████████▍ | 101/120 [07:04<11:58, 37.82s/pipeline]  

Generation 4 - Current best internal CV score: 0.805952380952381


                                                                              

Generation 5 - Current best internal CV score: 0.805952380952381

Best pipeline: GradientBoostingClassifier(KNeighborsClassifier(input_matrix, n_neighbors=45, p=1, weights=uniform), learning_rate=0.5, max_depth=8, max_features=0.9000000000000001, min_samples_leaf=4, min_samples_split=2, n_estimators=100, subsample=0.6500000000000001)
0.7777777777777778


Optimization Progress:  37%|███▋      | 44/120 [10:03<3:28:06, 164.29s/pipeline]

Generation 1 - Current best internal CV score: 0.769047619047619


Optimization Progress:  53%|█████▎    | 64/120 [13:48<1:14:35, 79.92s/pipeline] 

Generation 2 - Current best internal CV score: 0.7761904761904762


Optimization Progress:  71%|███████   | 85/120 [26:20<1:16:28, 131.11s/pipeline]

Generation 3 - Current best internal CV score: 0.7761904761904762


Optimization Progress:  88%|████████▊ | 105/120 [26:34<12:15, 49.04s/pipeline]  

Generation 4 - Current best internal CV score: 0.7761904761904762


                                                                              

Generation 5 - Current best internal CV score: 0.7892857142857143

Best pipeline: LogisticRegression(XGBClassifier(input_matrix, learning_rate=0.01, max_depth=1, min_child_weight=9, n_estimators=100, nthread=1, subsample=0.9000000000000001), C=0.5, dual=False, penalty=l2)


  if diff:


0.8055555555555556


Optimization Progress:  33%|███▎      | 40/120 [00:54<20:23, 15.30s/pipeline]

Generation 1 - Current best internal CV score: 0.7952380952380953


Optimization Progress:  50%|█████     | 60/120 [03:12<48:26, 48.44s/pipeline]  

Generation 2 - Current best internal CV score: 0.8142857142857143


Optimization Progress:  67%|██████▋   | 80/120 [03:20<16:47, 25.20s/pipeline]

Generation 3 - Current best internal CV score: 0.8142857142857143


Optimization Progress:  83%|████████▎ | 100/120 [03:26<04:32, 13.60s/pipeline]

Generation 4 - Current best internal CV score: 0.8154761904761904


                                                                              

Generation 5 - Current best internal CV score: 0.8154761904761904

Best pipeline: LogisticRegression(GaussianNB(CombineDFs(input_matrix, input_matrix)), C=15.0, dual=True, penalty=l2)
0.8055555555555556


Optimization Progress:  33%|███▎      | 40/120 [01:16<20:07, 15.09s/pipeline]

Generation 1 - Current best internal CV score: 0.7785185185185185


Optimization Progress:  50%|█████     | 60/120 [01:39<09:48,  9.80s/pipeline]

Generation 2 - Current best internal CV score: 0.7785185185185185


Optimization Progress:  67%|██████▋   | 80/120 [02:19<07:14, 10.87s/pipeline]

Generation 3 - Current best internal CV score: 0.7792592592592593


Optimization Progress:  83%|████████▎ | 100/120 [03:09<02:58,  8.91s/pipeline]

Generation 4 - Current best internal CV score: 0.7792592592592593


                                                                              

Generation 5 - Current best internal CV score: 0.7859259259259259

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.5, min_samples_leaf=4, min_samples_split=10, n_estimators=100)
0.7916666666666666


Optimization Progress:  34%|███▍      | 41/120 [05:09<3:15:24, 148.42s/pipeline]

Generation 1 - Current best internal CV score: 0.7859259259259259


Optimization Progress:  51%|█████     | 61/120 [05:13<1:11:52, 73.10s/pipeline] 

Generation 2 - Current best internal CV score: 0.7859259259259259


Optimization Progress:  68%|██████▊   | 82/120 [10:14<1:19:45, 125.94s/pipeline]

Generation 3 - Current best internal CV score: 0.7918518518518519


Optimization Progress:  85%|████████▌ | 102/120 [10:28<19:43, 65.75s/pipeline]  

Generation 4 - Current best internal CV score: 0.7933333333333333


                                                                              

Generation 5 - Current best internal CV score: 0.8118518518518518

Best pipeline: ExtraTreesClassifier(RobustScaler(input_matrix), bootstrap=False, criterion=entropy, max_features=0.7500000000000001, min_samples_leaf=8, min_samples_split=7, n_estimators=100)
0.8125


Optimization Progress:  33%|███▎      | 40/120 [00:13<03:13,  2.41s/pipeline]

Generation 1 - Current best internal CV score: 0.7844444444444444


Optimization Progress:  50%|█████     | 60/120 [00:19<01:53,  1.90s/pipeline]

Generation 2 - Current best internal CV score: 0.7844444444444444


Optimization Progress:  67%|██████▋   | 80/120 [01:01<04:41,  7.05s/pipeline]

Generation 3 - Current best internal CV score: 0.7844444444444444


Optimization Progress:  84%|████████▍ | 101/120 [06:04<29:30, 93.19s/pipeline]

Generation 4 - Current best internal CV score: 0.7844444444444444


                                                                              

Generation 5 - Current best internal CV score: 0.7844444444444444

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=3, max_features=0.6000000000000001, min_samples_leaf=15, min_samples_split=6, n_estimators=100, subsample=0.4)
0.7916666666666666


Optimization Progress:  34%|███▍      | 41/120 [05:11<2:05:31, 95.34s/pipeline]

Generation 1 - Current best internal CV score: 0.7696296296296297


Optimization Progress:  51%|█████     | 61/120 [05:22<33:33, 34.13s/pipeline]  

Generation 2 - Current best internal CV score: 0.7733333333333334


Optimization Progress:  68%|██████▊   | 81/120 [05:35<12:52, 19.81s/pipeline]

Generation 3 - Current best internal CV score: 0.7844444444444445


Optimization Progress:  84%|████████▍ | 101/120 [05:41<03:14, 10.25s/pipeline]

Generation 4 - Current best internal CV score: 0.7851851851851853


                                                                              

Generation 5 - Current best internal CV score: 0.7859259259259259

Best pipeline: ExtraTreesClassifier(LogisticRegression(input_matrix, C=0.1, dual=False, penalty=l1), bootstrap=True, criterion=entropy, max_features=0.9000000000000001, min_samples_leaf=3, min_samples_split=7, n_estimators=100)
0.7291666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:23<04:53,  3.67s/pipeline]

Generation 1 - Current best internal CV score: 0.7362962962962963


Optimization Progress:  50%|█████     | 60/120 [00:45<05:06,  5.11s/pipeline]

Generation 2 - Current best internal CV score: 0.774074074074074


Optimization Progress:  67%|██████▋   | 80/120 [01:00<03:46,  5.67s/pipeline]

Generation 3 - Current best internal CV score: 0.774074074074074


Optimization Progress:  83%|████████▎ | 100/120 [01:30<03:37, 10.87s/pipeline]

Generation 4 - Current best internal CV score: 0.774074074074074


                                                                              

Generation 5 - Current best internal CV score: 0.774074074074074

Best pipeline: LogisticRegression(Normalizer(input_matrix, norm=l2), C=5.0, dual=False, penalty=l2)
0.7916666666666666


Optimization Progress:  34%|███▍      | 41/120 [08:52<2:13:33, 101.43s/pipeline]

Generation 1 - Current best internal CV score: 0.7718518518518518


Optimization Progress:  51%|█████     | 61/120 [12:23<1:09:31, 70.71s/pipeline] 

Generation 2 - Current best internal CV score: 0.7785185185185186


Optimization Progress:  68%|██████▊   | 81/120 [13:15<25:38, 39.46s/pipeline]  

Generation 3 - Current best internal CV score: 0.7918518518518519


Optimization Progress:  84%|████████▍ | 101/120 [16:56<26:59, 85.24s/pipeline]

Generation 4 - Current best internal CV score: 0.7918518518518519


                                                                              

Generation 5 - Current best internal CV score: 0.7918518518518519

Best pipeline: RandomForestClassifier(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.8, min_samples_leaf=6, min_samples_split=6, n_estimators=100), bootstrap=True, criterion=entropy, max_features=0.05, min_samples_leaf=19, min_samples_split=6, n_estimators=100)
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:40<11:29,  8.61s/pipeline]

Generation 1 - Current best internal CV score: 0.8022222222222222


Optimization Progress:  50%|█████     | 60/120 [00:56<06:33,  6.57s/pipeline]

Generation 2 - Current best internal CV score: 0.8022222222222222


Optimization Progress:  67%|██████▋   | 80/120 [02:21<18:57, 28.43s/pipeline]

Generation 3 - Current best internal CV score: 0.8022222222222222


Optimization Progress:  83%|████████▎ | 100/120 [02:29<04:57, 14.86s/pipeline]

Generation 4 - Current best internal CV score: 0.8022222222222222


                                                                              

Generation 5 - Current best internal CV score: 0.8022222222222222

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.7000000000000001, min_samples_leaf=3, min_samples_split=3, n_estimators=100)
0.7708333333333334


Optimization Progress:  33%|███▎      | 40/120 [01:04<25:30, 19.14s/pipeline]

Generation 1 - Current best internal CV score: 0.7585185185185186


Optimization Progress:  50%|█████     | 60/120 [01:37<18:53, 18.89s/pipeline]

Generation 2 - Current best internal CV score: 0.7585185185185186


Optimization Progress:  67%|██████▋   | 80/120 [03:32<11:51, 17.79s/pipeline]

Generation 3 - Current best internal CV score: 0.7585185185185186


Optimization Progress:  83%|████████▎ | 100/120 [04:10<05:47, 17.39s/pipeline]

Generation 4 - Current best internal CV score: 0.7851851851851852


                                                                              

Generation 5 - Current best internal CV score: 0.7925925925925925

Best pipeline: LogisticRegression(MaxAbsScaler(input_matrix), C=0.5, dual=False, penalty=l1)
0.9166666666666666


Optimization Progress:  33%|███▎      | 40/120 [04:48<1:41:04, 75.81s/pipeline]

Generation 1 - Current best internal CV score: 0.7874074074074074


Optimization Progress:  50%|█████     | 60/120 [05:18<46:03, 46.07s/pipeline]  

Generation 2 - Current best internal CV score: 0.7874074074074074


Optimization Progress:  67%|██████▋   | 80/120 [05:51<21:31, 32.28s/pipeline]

Generation 3 - Current best internal CV score: 0.7874074074074074


Optimization Progress:  83%|████████▎ | 100/120 [06:27<08:48, 26.43s/pipeline]

Generation 4 - Current best internal CV score: 0.7874074074074074


                                                                              

Generation 5 - Current best internal CV score: 0.7874074074074074

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.55, min_samples_leaf=4, min_samples_split=16, n_estimators=100)
0.7291666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:18<05:09,  3.87s/pipeline]

Generation 1 - Current best internal CV score: 0.8066666666666669


Optimization Progress:  50%|█████     | 60/120 [02:09<34:56, 34.95s/pipeline]

Generation 2 - Current best internal CV score: 0.8066666666666669


Optimization Progress:  67%|██████▋   | 80/120 [02:13<11:30, 17.26s/pipeline]

Generation 3 - Current best internal CV score: 0.8066666666666669


Optimization Progress:  83%|████████▎ | 100/120 [02:17<03:02,  9.11s/pipeline]

Generation 4 - Current best internal CV score: 0.8066666666666669


                                                                              

Generation 5 - Current best internal CV score: 0.8066666666666669

Best pipeline: ExtraTreesClassifier(Normalizer(input_matrix, norm=l1), bootstrap=False, criterion=gini, max_features=0.55, min_samples_leaf=14, min_samples_split=2, n_estimators=100)
0.7708333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:35<07:28,  5.61s/pipeline]

Generation 1 - Current best internal CV score: 0.7412698412698413


Optimization Progress:  50%|█████     | 60/120 [00:47<05:13,  5.22s/pipeline]

Generation 2 - Current best internal CV score: 0.7507936507936508


Optimization Progress:  67%|██████▋   | 80/120 [00:58<02:11,  3.29s/pipeline]

Generation 3 - Current best internal CV score: 0.7542857142857142


Optimization Progress:  83%|████████▎ | 100/120 [01:11<01:26,  4.34s/pipeline]

Generation 4 - Current best internal CV score: 0.7688888888888888


                                                                              

Generation 5 - Current best internal CV score: 0.7688888888888888

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.25, min_samples_leaf=4, min_samples_split=6, n_estimators=100)
0.75


Optimization Progress:  33%|███▎      | 40/120 [02:11<1:10:59, 53.24s/pipeline]

Generation 1 - Current best internal CV score: 0.731111111111111


Optimization Progress:  50%|█████     | 60/120 [03:02<25:52, 25.87s/pipeline]  

Generation 2 - Current best internal CV score: 0.731111111111111


Optimization Progress:  67%|██████▋   | 80/120 [06:38<22:42, 34.07s/pipeline]

Generation 3 - Current best internal CV score: 0.731111111111111


Optimization Progress:  83%|████████▎ | 100/120 [06:57<05:43, 17.18s/pipeline]

Generation 4 - Current best internal CV score: 0.7311111111111112


                                                                              

Generation 5 - Current best internal CV score: 0.7317460317460318

Best pipeline: GradientBoostingClassifier(ZeroCount(input_matrix), learning_rate=0.5, max_depth=7, max_features=0.6000000000000001, min_samples_leaf=8, min_samples_split=20, n_estimators=100, subsample=0.9000000000000001)
0.6944444444444444


Optimization Progress:  33%|███▎      | 40/120 [01:03<29:40, 22.25s/pipeline]

Generation 1 - Current best internal CV score: 0.7342857142857142


Optimization Progress:  50%|█████     | 60/120 [01:21<16:11, 16.18s/pipeline]

Generation 2 - Current best internal CV score: 0.7342857142857142


Optimization Progress:  68%|██████▊   | 81/120 [06:24<1:04:03, 98.55s/pipeline]

Generation 3 - Current best internal CV score: 0.7342857142857142


Optimization Progress:  84%|████████▍ | 101/120 [07:02<18:51, 59.55s/pipeline] 

Generation 4 - Current best internal CV score: 0.7342857142857142


                                                                               

Generation 5 - Current best internal CV score: 0.7342857142857142

Best pipeline: GradientBoostingClassifier(RFE(input_matrix, criterion=gini, max_features=0.9500000000000001, n_estimators=100, step=0.5), learning_rate=0.001, max_depth=10, max_features=0.35000000000000003, min_samples_leaf=16, min_samples_split=15, n_estimators=100, subsample=1.0)
0.7361111111111112


Optimization Progress:  33%|███▎      | 40/120 [03:02<52:43, 39.54s/pipeline]  

Generation 1 - Current best internal CV score: 0.6904761904761905


Optimization Progress:  50%|█████     | 60/120 [03:49<23:07, 23.12s/pipeline]

Generation 2 - Current best internal CV score: 0.7187301587301589


Optimization Progress:  67%|██████▋   | 80/120 [03:56<11:10, 16.76s/pipeline]

Generation 3 - Current best internal CV score: 0.7187301587301589


Optimization Progress:  83%|████████▎ | 100/120 [04:13<03:32, 10.60s/pipeline]

Generation 4 - Current best internal CV score: 0.7187301587301589


                                                                              

Generation 5 - Current best internal CV score: 0.7187301587301589

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=1.0, dual=True, penalty=l2)
0.6944444444444444


Optimization Progress:  33%|███▎      | 40/120 [00:31<07:48,  5.85s/pipeline]

Generation 1 - Current best internal CV score: 0.741904761904762


Optimization Progress:  50%|█████     | 60/120 [01:05<13:08, 13.14s/pipeline]

Generation 2 - Current best internal CV score: 0.741904761904762


Optimization Progress:  67%|██████▋   | 80/120 [01:12<04:55,  7.40s/pipeline]

Generation 3 - Current best internal CV score: 0.741904761904762


Optimization Progress:  83%|████████▎ | 100/120 [02:06<06:20, 19.04s/pipeline]

Generation 4 - Current best internal CV score: 0.7453968253968254


                                                                              

Generation 5 - Current best internal CV score: 0.7453968253968254

Best pipeline: RandomForestClassifier(LinearSVC(input_matrix, C=0.001, dual=True, loss=squared_hinge, penalty=l2, tol=1e-05), bootstrap=True, criterion=gini, max_features=0.15000000000000002, min_samples_leaf=5, min_samples_split=18, n_estimators=100)
0.7638888888888888


Optimization Progress:  33%|███▎      | 40/120 [00:20<02:00,  1.50s/pipeline]

Generation 1 - Current best internal CV score: 0.7273015873015873


Optimization Progress:  50%|█████     | 60/120 [00:31<01:45,  1.75s/pipeline]

Generation 2 - Current best internal CV score: 0.7273015873015873


Optimization Progress:  67%|██████▋   | 80/120 [00:43<01:43,  2.58s/pipeline]

Generation 3 - Current best internal CV score: 0.7692063492063492


Optimization Progress:  83%|████████▎ | 100/120 [00:56<00:42,  2.12s/pipeline]

Generation 4 - Current best internal CV score: 0.8050793650793651


                                                                              

Generation 5 - Current best internal CV score: 0.8146031746031746

Best pipeline: LinearSVC(ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.9000000000000001, min_samples_leaf=1, min_samples_split=7, n_estimators=100), C=10.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.01)
0.75


Optimization Progress:  34%|███▍      | 41/120 [06:20<3:25:00, 155.70s/pipeline]

Generation 1 - Current best internal CV score: 0.7733333333333332


Optimization Progress:  51%|█████     | 61/120 [06:44<1:22:03, 83.44s/pipeline] 

Generation 2 - Current best internal CV score: 0.7733333333333332


Optimization Progress:  68%|██████▊   | 81/120 [07:26<30:37, 47.10s/pipeline]  

Generation 3 - Current best internal CV score: 0.7733333333333332


Optimization Progress:  85%|████████▌ | 102/120 [12:30<23:55, 79.73s/pipeline]  

Generation 4 - Current best internal CV score: 0.7733333333333332


                                                                               

Generation 5 - Current best internal CV score: 0.7784126984126984

Best pipeline: RandomForestClassifier(CombineDFs(input_matrix, input_matrix), bootstrap=True, criterion=gini, max_features=0.35000000000000003, min_samples_leaf=2, min_samples_split=15, n_estimators=100)
0.7638888888888888


Optimization Progress:  33%|███▎      | 40/120 [01:11<34:01, 25.51s/pipeline] 

Generation 1 - Current best internal CV score: 0.7412698412698413


Optimization Progress:  50%|█████     | 60/120 [01:30<18:13, 18.23s/pipeline]

Generation 2 - Current best internal CV score: 0.7412698412698413


Optimization Progress:  67%|██████▋   | 80/120 [01:40<07:37, 11.43s/pipeline]

Generation 3 - Current best internal CV score: 0.753968253968254


Optimization Progress:  83%|████████▎ | 100/120 [01:59<03:39, 10.95s/pipeline]

Generation 4 - Current best internal CV score: 0.753968253968254


                                                                              

Generation 5 - Current best internal CV score: 0.76

Best pipeline: RandomForestClassifier(SelectPercentile(GradientBoostingClassifier(input_matrix, learning_rate=0.01, max_depth=5, max_features=0.8500000000000001, min_samples_leaf=5, min_samples_split=7, n_estimators=100, subsample=0.55), percentile=13), bootstrap=False, criterion=entropy, max_features=0.15000000000000002, min_samples_leaf=14, min_samples_split=12, n_estimators=100)
0.75


Optimization Progress:  33%|███▎      | 40/120 [01:01<12:41,  9.52s/pipeline]

Generation 1 - Current best internal CV score: 0.7412698412698413


Optimization Progress:  50%|█████     | 60/120 [01:07<03:59,  4.00s/pipeline]

Generation 2 - Current best internal CV score: 0.7412698412698413


Optimization Progress:  67%|██████▋   | 80/120 [01:51<06:36,  9.90s/pipeline]

Generation 3 - Current best internal CV score: 0.7412698412698413


Optimization Progress:  83%|████████▎ | 100/120 [02:06<02:40,  8.03s/pipeline]

Generation 4 - Current best internal CV score: 0.7504761904761905


                                                                              

Generation 5 - Current best internal CV score: 0.7631746031746032

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.3, min_samples_leaf=2, min_samples_split=18, n_estimators=100)
0.7361111111111112


Optimization Progress:  34%|███▍      | 41/120 [05:09<3:15:05, 148.18s/pipeline]

Generation 1 - Current best internal CV score: 0.7333333333333334


Optimization Progress:  51%|█████     | 61/120 [05:29<1:13:14, 74.49s/pipeline] 

Generation 2 - Current best internal CV score: 0.7368253968253968


Optimization Progress:  68%|██████▊   | 81/120 [05:57<28:52, 44.42s/pipeline]  

Generation 3 - Current best internal CV score: 0.7514285714285714


Optimization Progress:  84%|████████▍ | 101/120 [06:12<07:38, 24.11s/pipeline]

Generation 4 - Current best internal CV score: 0.7514285714285714


                                                                              

Generation 5 - Current best internal CV score: 0.7514285714285714

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.01, max_depth=9, min_child_weight=1, n_estimators=100, nthread=1, subsample=0.5)
0.7638888888888888


  if diff:


In [6]:
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)

    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'
    #=============================================================================================================
    #do the learning
    # TODO: extract method
    with open(folder + fileName) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    # content = [x.strip() for x in content] 

    #  or 'exported_pipeline = ' not in line
    cleanedContent = []
    for line in content:
        if 'tpot_data' not in line and 'training_target, testing_target' not in line:
            cleanedContent.append(line)
    #         content.remove(line)
    
    fileNameCleaned = folder + 'cleaned_' + fileName
    with open(fileNameCleaned, 'w') as filehandle:  
        for line in cleanedContent:
            filehandle.write('%s\n' % line)
            
    %run -i $fileNameCleaned
    
    # print the accuracy
    # TODO: extract method
    # TODO: print all accurracy values to one file
    num_matches = 0;
    for a, b in zip(testing_target, results):
        if a == b:
            num_matches = num_matches + 1
        else:
            unmatched.append((a,b))
    print('Number of matches:',num_matches,'(of',testing_target.size,')')

    accuracy = num_matches/testing_target.size*100
    print('Accuary: ',accuracy)
    #=============================================================================================================
    
    #=============================================================================================================
    #collect all accuracy values to plot them
    collectAccuracies.append(accuracy)

    offset = offset + 1

Number of matches: 20 (of 24 )
Accuary:  83.33333333333334
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 21 (of 24 )
Accuary:  87.5
Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 22 (of 24 )
Accuary:  91.66666666666666
Number of matches: 23 (of 24 )
Accuary:  95.83333333333334
Number of matches: 34 (of 36 )
Accuary:  94.44444444444444
Number of matches: 26 (of 36 )
Accuary:  72.22222222222221
Number of matches: 30 (of 36 )
Accuary:  83.33333333333334
Number of matches: 25 (of 36 )
Accuary:  69.44444444444444
Number of matches: 31 (of 36 )
Accuary:  86.11111111111111
Number of matches: 31 (of 36 )
Accuary:  86.11111111111111
Number of matches: 30 (of 36 )
Accuary:  83.33333333333334
Number of 

  if diff:
  if diff:


Number of matches: 29 (of 36 )
Accuary:  80.55555555555556
Number of matches: 29 (of 36 )
Accuary:  80.55555555555556
Number of matches: 41 (of 48 )
Accuary:  85.41666666666666
Number of matches: 39 (of 48 )
Accuary:  81.25
Number of matches: 35 (of 48 )
Accuary:  72.91666666666666
Number of matches: 38 (of 48 )
Accuary:  79.16666666666666
Number of matches: 38 (of 48 )
Accuary:  79.16666666666666
Number of matches: 37 (of 48 )
Accuary:  77.08333333333334
Number of matches: 37 (of 48 )
Accuary:  77.08333333333334
Number of matches: 44 (of 48 )
Accuary:  91.66666666666666
Number of matches: 37 (of 48 )
Accuary:  77.08333333333334
Number of matches: 34 (of 48 )
Accuary:  70.83333333333334
Number of matches: 50 (of 72 )
Accuary:  69.44444444444444
Number of matches: 52 (of 72 )
Accuary:  72.22222222222221
Number of matches: 54 (of 72 )
Accuary:  75.0
Number of matches: 50 (of 72 )
Accuary:  69.44444444444444
Number of matches: 55 (of 72 )
Accuary:  76.38888888888889
Number of matches: 54 

  if diff:


In [7]:
# Number of entries in `unmatched`, i.e. how many (a, b) pairs were recorded in total.
len(unmatched)

382

In [8]:
# Tally the recorded mismatches by the pair of labels involved, ignoring direction.
# Label coding follows the recoding of the 'task' column: 0 = C, 1 = R, 2 = S.
countComprehensionRest = sum(1 for pair in unmatched if pair in ((0, 1), (1, 0)))
countComprehensionSyntax = sum(1 for pair in unmatched if pair in ((0, 2), (2, 0)))
countRestSyntax = sum(1 for pair in unmatched if pair in ((1, 2), (2, 1)))

for caption, tally in (
    ('Comprehension/Rest', countComprehensionRest),
    ('Comprehension/Syntax', countComprehensionSyntax),
    ('Rest/Syntax', countRestSyntax),
):
    print(caption, tally)

Comprehension/Rest 45
Comprehension/Syntax 162
Rest/Syntax 175


In [9]:
def split_list(alist, wanted_parts=1):
    """Cut a sequence into consecutive slices of (nearly) equal length.

    alist: the sequence to split (anything supporting len() and slicing).
    wanted_parts: how many slices to produce.

    Returns a list with `wanted_parts` slices of `alist`, in order. Slice
    boundaries are computed with integer division, so slice lengths differ
    by at most one element. A value of `wanted_parts` below 1 yields an
    empty list.
    """
    total = len(alist)
    parts = []
    for part_index in range(wanted_parts):
        start = part_index * total // wanted_parts
        stop = (part_index + 1) * total // wanted_parts
        parts.append(alist[start:stop])
    return parts


In [10]:
# Cut the collected accuracy values into four consecutive chunks and store
# them side by side, one chunk per column.
frames = split_list(collectAccuracies, 4)
list1, list2, list3, list4 = frames

# Each chunk becomes a row first; transposing turns the chunks into columns.
collectedAccuracies = pd.DataFrame(frames).transpose()
collectedAccuracies.columns = [
    '2_groups_act',
    '3_groups_act',
    '4_groups_act',
    '6_groups_act',
]

collectedAccuracies.to_csv(
    'triangulate_Act.csv',
    sep=';',
    decimal='.',
    index=False,
)