In [1]:
# Pandas is used for data manipulation
import os

import numpy as np
import pandas as pd
from tpot import TPOTClassifier

# Load the semicolon-separated z-score table, then strip the columns that must
# not take part in the numeric aggregation:
#   * every column whose name matches the regex '_deact'
#   * the 'scan', 'trial' and 'response' columns (the original note calls them non-numeric)
z_data = (
    pd.read_csv('esem_act_deact_zscore_groups.csv', sep=';', decimal='.')
    .pipe(lambda frame: frame.drop(list(frame.filter(regex='_deact')), axis=1))
    .drop(['scan', 'trial', 'response'], axis=1)
)

# Layout of the experiment, used to turn participant counts into row indices.
sizeTrainSet = 13  # participants whose rows go into the training set
sizeTestSet = 4  # participants whose rows go into the testing set
numParticipants = sizeTestSet + sizeTrainSet
numLabels = 3  # number of distinct labels (original note mentions comprehension and rest)

In [2]:
def prepare(data, groupingColumns, labelLevel='task'):
    """Average the rows of each group and extract one label per group.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the grouping columns plus the columns to average.
    groupingColumns : list of str
        Column names to group by; they become the index levels of the result.
    labelLevel : str, optional
        Name of the index level whose values are returned as labels.
        Must be one of ``groupingColumns``. Defaults to ``'task'``.

    Returns
    -------
    groupedAgg : pandas.DataFrame
        One row per group holding the mean of every non-grouping column.
    labels : pandas.Index
        Value of ``labelLevel`` for each row of ``groupedAgg``, in row order.
    """
    # Use the built-in groupby mean: passing the NumPy callable through
    # aggregate() triggers a FutureWarning on recent pandas versions.
    groupedAgg = data.groupby(groupingColumns).mean()
    labels = groupedAgg.index.get_level_values(level=labelLevel)

    return groupedAgg, labels

In [3]:
#TODO: introduce variation (see file coarseAverageParticipantSplit)

def split(features, labels, low, high):
    """Split rows positionally into a training part and a testing part.

    Rows ``[0, low)`` become the training set and rows ``[low, high)`` the
    testing set (``high`` itself is excluded). The return order mirrors
    sklearn's ``train_test_split``.

    Parameters
    ----------
    features : array-like (e.g. pandas.DataFrame or numpy.ndarray)
        One row per sample.
    labels : array-like
        One label per row of ``features``.
    low : int
        Index of the first testing row.
    high : int
        Index one past the last testing row.

    Returns
    -------
    training_features, testing_features, training_target, testing_target
        Four numpy arrays.

    Raises
    ------
    ValueError
        If ``features`` and ``labels`` do not have the same number of rows.
    """
    # np.array copies, so the returned slices never alias the caller's data.
    features = np.array(features)
    labels = np.array(labels)

    if len(features) != len(labels):
        raise ValueError(
            'features and labels must have the same number of rows, got %d and %d'
            % (len(features), len(labels))
        )

    training_features = features[:low]
    testing_features = features[low:high]
    training_target = labels[:low]
    testing_target = labels[low:high]

    return training_features, testing_features, training_target, testing_target

In [4]:
def findIndices(name, numTrain=None, numTotal=None, numClasses=None):
    """Derive the train/test row boundaries from a grouping column name.

    The name must have the form ``xxx_number_xxx`` (e.g. ``aggr_13_groups0``);
    the number between the first two underscores is the number of groups.
    The boundaries assume the same number of rows,
    ``numGroups * numClasses``, for every participant.

    Parameters
    ----------
    name : str
        Grouping column name.
    numTrain : int, optional
        Number of training participants; defaults to the global ``sizeTrainSet``.
    numTotal : int, optional
        Total number of participants; defaults to the global ``numParticipants``.
    numClasses : int, optional
        Number of distinct labels; defaults to the global ``numLabels``.

    Returns
    -------
    low, high : int
        ``low`` is the first testing row, ``high`` is one past the last one.

    Raises
    ------
    ValueError
        If ``name`` does not carry an integer as its second '_'-separated field.
    """
    try:
        numGroups = int(name.split('_')[1])
    except (IndexError, ValueError):
        raise ValueError(
            "column name %r does not match the pattern 'xxx_number_xxx'" % (name,)
        )

    # Fall back to the notebook-level configuration when not given explicitly.
    if numTrain is None:
        numTrain = sizeTrainSet
    if numTotal is None:
        numTotal = numParticipants
    if numClasses is None:
        numClasses = numLabels

    rowsPerParticipant = numGroups * numClasses
    low = numTrain * rowsPerParticipant
    high = numTotal * rowsPerParticipant
    return low, high



In [5]:
#define the columns to group by
groupingColumn1 = 'proband'
groupingColumn2 = 'task'
groupingColumn3 = ''  # not used in this cell

#define how many different variants of aggregation and aggregation levels
# (not used in this cell; kept for any cell that may read them)
numVariationsPerAggLevel = 10
numAggLevels = 4
numAggregationVariants = numVariationsPerAggLevel * numAggLevels

#get the index of the starting column that contains the labels of the groupings;
#every column from this position to the end is treated as a grouping column
offset = z_data.columns.get_loc('aggr_2_groups0')

#held-out accuracy of the best pipeline, one entry per grouping column (in column order)
collectAccuracies = []

#not used in this cell
unmatched = []

#fixed seed so that the stochastic TPOT search can be repeated
randomSeed = 42

#create the export folder up front, so a missing directory cannot make
#tpot.export fail after a long optimisation run
folder = 'Act/'
os.makedirs(folder, exist_ok=True)

# NOTE(review): while grouping by one column, the remaining grouping columns stay
# in z_data and are averaged into the feature matrix by prepare() - confirm that
# this is intended.
# NOTE(review): the captured output shows internal CV scores well above the
# held-out scores; presumably the internal CV mixes rows of the same participant
# across folds - verify, and consider a group-aware CV.

#for each grouping column:
# group the data set, split it into training and testing rows,
# run the TPOT search, score it on the held-out rows and export the best pipeline
for groupingName in z_data.columns[offset:]:
    # group the data set
    columnsForGrouping = [groupingColumn1, groupingColumn2, groupingName]
    features, labels = prepare(z_data, columnsForGrouping)

    # transform to numpy.ndarray and split at the participant boundary
    low, high = findIndices(groupingName)
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)

    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + groupingName + '.py'

    # run t_pot
    # TODO: extract method
    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, n_jobs=20,
                          random_state=randomSeed)
    tpot.fit(training_features, training_target)

    accuracy = tpot.score(testing_features, testing_target)
    collectAccuracies.append(accuracy)
    print(accuracy)

    tpot.export(folder + fileName)

Optimization Progress:  33%|███▎      | 40/120 [00:18<07:04,  5.31s/pipeline]

Generation 1 - Current best internal CV score: 0.8622222222222223


Optimization Progress:  51%|█████     | 61/120 [05:21<1:31:01, 92.56s/pipeline]

Generation 2 - Current best internal CV score: 0.8822222222222222


Optimization Progress:  68%|██████▊   | 81/120 [06:17<40:08, 61.75s/pipeline]  

Generation 3 - Current best internal CV score: 0.8822222222222222


Optimization Progress:  84%|████████▍ | 101/120 [07:27<16:03, 50.72s/pipeline]

Generation 4 - Current best internal CV score: 0.8822222222222222


                                                                              

Generation 5 - Current best internal CV score: 0.9488888888888889

Best pipeline: LinearSVC(input_matrix, C=0.5, dual=True, loss=hinge, penalty=l2, tol=0.001)
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:38<16:28, 12.36s/pipeline] 

Generation 1 - Current best internal CV score: 0.868888888888889


Optimization Progress:  50%|█████     | 60/120 [00:43<06:40,  6.67s/pipeline]

Generation 2 - Current best internal CV score: 0.868888888888889


Optimization Progress:  67%|██████▋   | 80/120 [00:51<02:05,  3.14s/pipeline]

Generation 3 - Current best internal CV score: 0.8977777777777778


Optimization Progress:  83%|████████▎ | 100/120 [01:44<03:22, 10.13s/pipeline]

Generation 4 - Current best internal CV score: 0.9466666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9466666666666667

Best pipeline: RandomForestClassifier(RandomForestClassifier(GaussianNB(LinearSVC(XGBClassifier(input_matrix, learning_rate=0.001, max_depth=3, min_child_weight=17, n_estimators=100, nthread=1, subsample=0.4), C=15.0, dual=True, loss=hinge, penalty=l2, tol=1e-05)), bootstrap=False, criterion=gini, max_features=0.9000000000000001, min_samples_leaf=4, min_samples_split=13, n_estimators=100), bootstrap=False, criterion=gini, max_features=0.1, min_samples_leaf=11, min_samples_split=10, n_estimators=100)


  if diff:


0.8333333333333334


Optimization Progress:  35%|███▌      | 42/120 [10:03<5:09:23, 238.00s/pipeline]

Generation 1 - Current best internal CV score: 0.9355555555555555


Optimization Progress:  52%|█████▏    | 62/120 [10:14<1:54:12, 118.14s/pipeline]

Generation 2 - Current best internal CV score: 0.9355555555555555


Optimization Progress:  68%|██████▊   | 82/120 [10:28<27:20, 43.16s/pipeline]   

Generation 3 - Current best internal CV score: 0.9355555555555555


Optimization Progress:  85%|████████▌ | 102/120 [12:36<20:35, 68.62s/pipeline]

Generation 4 - Current best internal CV score: 0.96


                                                                              

Generation 5 - Current best internal CV score: 0.96

Best pipeline: LogisticRegression(MinMaxScaler(input_matrix), C=25.0, dual=True, penalty=l2)
0.75


Optimization Progress:  33%|███▎      | 40/120 [01:56<46:04, 34.56s/pipeline] 

Generation 1 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  50%|█████     | 60/120 [03:56<28:54, 28.91s/pipeline]

Generation 2 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  67%|██████▋   | 80/120 [05:01<15:55, 23.88s/pipeline]

Generation 3 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  83%|████████▎ | 100/120 [05:21<04:52, 14.64s/pipeline]

Generation 4 - Current best internal CV score: 0.9466666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.9466666666666667

Best pipeline: LogisticRegression(input_matrix, C=0.5, dual=True, penalty=l2)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:22<06:06,  4.58s/pipeline]

Generation 1 - Current best internal CV score: 0.9488888888888889


Optimization Progress:  50%|█████     | 60/120 [00:29<03:09,  3.15s/pipeline]

Generation 2 - Current best internal CV score: 0.9488888888888889


Optimization Progress:  67%|██████▋   | 80/120 [00:38<01:34,  2.37s/pipeline]

Generation 3 - Current best internal CV score: 0.9488888888888889


Optimization Progress:  83%|████████▎ | 100/120 [00:46<00:43,  2.16s/pipeline]

Generation 4 - Current best internal CV score: 0.9488888888888889


                                                                              

Generation 5 - Current best internal CV score: 0.9488888888888889

Best pipeline: LogisticRegression(input_matrix, C=5.0, dual=True, penalty=l2)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:14<02:12,  1.66s/pipeline]

Generation 1 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  50%|█████     | 60/120 [00:18<01:19,  1.33s/pipeline]

Generation 2 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  67%|██████▋   | 80/120 [00:22<00:36,  1.09pipeline/s]

Generation 3 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  83%|████████▎ | 100/120 [01:08<04:45, 14.28s/pipeline]

Generation 4 - Current best internal CV score: 0.9600000000000002


                                                                              

Generation 5 - Current best internal CV score: 0.9600000000000002

Best pipeline: LinearSVC(MaxAbsScaler(input_matrix), C=0.1, dual=False, loss=squared_hinge, penalty=l2, tol=0.001)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:36<10:17,  7.71s/pipeline]

Generation 1 - Current best internal CV score: 0.8733333333333334


Optimization Progress:  50%|█████     | 60/120 [02:05<17:07, 17.13s/pipeline]

Generation 2 - Current best internal CV score: 0.8733333333333334


Optimization Progress:  67%|██████▋   | 80/120 [03:45<08:30, 12.77s/pipeline]

Generation 3 - Current best internal CV score: 0.8822222222222222


Optimization Progress:  84%|████████▍ | 101/120 [08:47<31:30, 99.49s/pipeline] 

Generation 4 - Current best internal CV score: 0.9355555555555556


                                                                              

Generation 5 - Current best internal CV score: 0.9355555555555556

Best pipeline: LogisticRegression(input_matrix, C=20.0, dual=False, penalty=l2)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:38<14:48, 11.11s/pipeline]

Generation 1 - Current best internal CV score: 0.9600000000000002


Optimization Progress:  50%|█████     | 60/120 [02:06<31:34, 31.57s/pipeline]

Generation 2 - Current best internal CV score: 0.9600000000000002


Optimization Progress:  67%|██████▋   | 80/120 [03:25<26:01, 39.05s/pipeline]

Generation 3 - Current best internal CV score: 0.9600000000000002


Optimization Progress:  83%|████████▎ | 100/120 [04:14<06:08, 18.45s/pipeline]

Generation 4 - Current best internal CV score: 0.9600000000000002


                                                                              

Generation 5 - Current best internal CV score: 0.9600000000000002

Best pipeline: LinearSVC(input_matrix, C=5.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.0001)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:06<01:55,  1.44s/pipeline]

Generation 1 - Current best internal CV score: 0.9333333333333332


Optimization Progress:  50%|█████     | 60/120 [01:29<25:15, 25.27s/pipeline]

Generation 2 - Current best internal CV score: 0.9333333333333332


Optimization Progress:  67%|██████▋   | 80/120 [02:53<16:42, 25.07s/pipeline]

Generation 3 - Current best internal CV score: 0.9466666666666667


Optimization Progress:  84%|████████▍ | 101/120 [11:03<50:21, 159.04s/pipeline] 

Generation 4 - Current best internal CV score: 0.9466666666666667


                                                                               

Generation 5 - Current best internal CV score: 0.9466666666666667

Best pipeline: LogisticRegression(input_matrix, C=15.0, dual=False, penalty=l1)
0.7083333333333334


Optimization Progress:  34%|███▍      | 41/120 [05:52<1:28:09, 66.95s/pipeline] 

Generation 1 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  51%|█████     | 61/120 [06:50<28:27, 28.95s/pipeline]  

Generation 2 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  68%|██████▊   | 81/120 [08:19<30:34, 47.03s/pipeline]

Generation 3 - Current best internal CV score: 0.9622222222222222


Optimization Progress:  84%|████████▍ | 101/120 [09:07<08:18, 26.23s/pipeline]

Generation 4 - Current best internal CV score: 0.9622222222222222


                                                                              

Generation 5 - Current best internal CV score: 0.9622222222222222

Best pipeline: LinearSVC(input_matrix, C=10.0, dual=True, loss=squared_hinge, penalty=l2, tol=1e-05)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:27<14:42, 11.03s/pipeline]

Generation 1 - Current best internal CV score: 0.9042424242424243


Optimization Progress:  50%|█████     | 60/120 [00:39<08:42,  8.71s/pipeline]

Generation 2 - Current best internal CV score: 0.9054545454545455


Optimization Progress:  68%|██████▊   | 82/120 [05:45<42:26, 67.01s/pipeline]  

Generation 3 - Current best internal CV score: 0.9054545454545455


Optimization Progress:  85%|████████▌ | 102/120 [05:56<10:01, 33.40s/pipeline]

Generation 4 - Current best internal CV score: 0.9175757575757576


                                                                              

Generation 5 - Current best internal CV score: 0.9175757575757576

Best pipeline: LogisticRegression(input_matrix, C=5.0, dual=True, penalty=l2)
0.75


Optimization Progress:  35%|███▌      | 42/120 [07:19<2:25:14, 111.73s/pipeline]

Generation 1 - Current best internal CV score: 0.9224242424242425


Optimization Progress:  52%|█████▏    | 62/120 [10:05<1:40:40, 104.14s/pipeline]

Generation 2 - Current best internal CV score: 0.9224242424242425


Optimization Progress:  69%|██████▉   | 83/120 [15:07<1:01:06, 99.09s/pipeline] 

Generation 3 - Current best internal CV score: 0.9224242424242425


Optimization Progress:  86%|████████▌ | 103/120 [17:23<21:47, 76.89s/pipeline]  

Generation 4 - Current best internal CV score: 0.9224242424242425


                                                                              

Generation 5 - Current best internal CV score: 0.9224242424242425

Best pipeline: LogisticRegression(input_matrix, C=0.1, dual=False, penalty=l2)
0.7916666666666666


Optimization Progress:  35%|███▌      | 42/120 [05:14<1:06:21, 51.05s/pipeline]

Generation 1 - Current best internal CV score: 0.7945454545454546


Optimization Progress:  52%|█████▏    | 62/120 [05:28<20:42, 21.43s/pipeline]  

Generation 2 - Current best internal CV score: 0.807878787878788


Optimization Progress:  68%|██████▊   | 82/120 [05:47<09:55, 15.68s/pipeline]

Generation 3 - Current best internal CV score: 0.8654545454545456


Optimization Progress:  85%|████████▌ | 102/120 [06:02<03:32, 11.81s/pipeline]

Generation 4 - Current best internal CV score: 0.8781818181818182


                                                                              

Generation 5 - Current best internal CV score: 0.8787878787878789

Best pipeline: LogisticRegression(ExtraTreesClassifier(GaussianNB(input_matrix), bootstrap=True, criterion=entropy, max_features=0.55, min_samples_leaf=8, min_samples_split=7, n_estimators=100), C=25.0, dual=True, penalty=l2)
0.7291666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:27<07:08,  5.36s/pipeline]

Generation 1 - Current best internal CV score: 0.9290909090909091


Optimization Progress:  50%|█████     | 60/120 [01:02<09:59,  9.99s/pipeline]

Generation 2 - Current best internal CV score: 0.9303030303030303


Optimization Progress:  68%|██████▊   | 81/120 [06:04<33:58, 52.26s/pipeline]

Generation 3 - Current best internal CV score: 0.9303030303030303


Optimization Progress:  85%|████████▌ | 102/120 [11:07<21:18, 71.05s/pipeline]

Generation 4 - Current best internal CV score: 0.9315151515151514


                                                                              

Generation 5 - Current best internal CV score: 0.9315151515151514

Best pipeline: LogisticRegression(MinMaxScaler(VarianceThreshold(input_matrix, threshold=0.1)), C=0.5, dual=True, penalty=l2)
0.6875


Optimization Progress:  34%|███▍      | 41/120 [05:17<1:14:24, 56.52s/pipeline] 

Generation 1 - Current best internal CV score: 0.8781818181818182


Optimization Progress:  51%|█████     | 61/120 [05:26<29:24, 29.90s/pipeline]  

Generation 2 - Current best internal CV score: 0.9103030303030304


Optimization Progress:  68%|██████▊   | 81/120 [07:41<35:43, 54.96s/pipeline]

Generation 3 - Current best internal CV score: 0.9163636363636364


Optimization Progress:  84%|████████▍ | 101/120 [10:14<23:05, 72.93s/pipeline]

Generation 4 - Current best internal CV score: 0.9163636363636364


                                                                              

Generation 5 - Current best internal CV score: 0.9163636363636364

Best pipeline: LinearSVC(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), C=15.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.1)
0.7708333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:48<18:19, 13.74s/pipeline]

Generation 1 - Current best internal CV score: 0.9551515151515153


Optimization Progress:  50%|█████     | 60/120 [01:00<10:16, 10.27s/pipeline]

Generation 2 - Current best internal CV score: 0.9557575757575758


Optimization Progress:  67%|██████▋   | 80/120 [01:08<04:01,  6.03s/pipeline]

Generation 3 - Current best internal CV score: 0.9557575757575758


Optimization Progress:  84%|████████▍ | 101/120 [06:09<29:31, 93.26s/pipeline] 

Generation 4 - Current best internal CV score: 0.9557575757575758


                                                                              

Generation 5 - Current best internal CV score: 0.9557575757575758

Best pipeline: LinearSVC(DecisionTreeClassifier(input_matrix, criterion=gini, max_depth=1, min_samples_leaf=19, min_samples_split=10), C=10.0, dual=True, loss=hinge, penalty=l2, tol=0.01)
0.7708333333333334


Optimization Progress:  33%|███▎      | 40/120 [04:15<1:12:02, 54.03s/pipeline]

Generation 1 - Current best internal CV score: 0.8793939393939393


Optimization Progress:  50%|█████     | 60/120 [04:24<38:46, 38.77s/pipeline]  

Generation 2 - Current best internal CV score: 0.8793939393939393


Optimization Progress:  67%|██████▋   | 80/120 [09:04<40:34, 60.86s/pipeline]

Generation 3 - Current best internal CV score: 0.8793939393939393


Optimization Progress:  83%|████████▎ | 100/120 [09:18<10:35, 31.78s/pipeline]

Generation 4 - Current best internal CV score: 0.8793939393939393


                                                                              

Generation 5 - Current best internal CV score: 0.8793939393939395

Best pipeline: LogisticRegression(MinMaxScaler(input_matrix), C=15.0, dual=True, penalty=l2)
0.75


Optimization Progress:  33%|███▎      | 40/120 [03:00<43:35, 32.70s/pipeline]

Generation 1 - Current best internal CV score: 0.9048484848484849


Optimization Progress:  51%|█████     | 61/120 [08:02<31:46, 32.31s/pipeline]

Generation 2 - Current best internal CV score: 0.9048484848484849


Optimization Progress:  68%|██████▊   | 81/120 [09:09<21:15, 32.71s/pipeline]

Generation 3 - Current best internal CV score: 0.9054545454545455


Optimization Progress:  84%|████████▍ | 101/120 [09:18<05:26, 17.20s/pipeline]

Generation 4 - Current best internal CV score: 0.9054545454545455


                                                                              

Generation 5 - Current best internal CV score: 0.9115151515151515

Best pipeline: LogisticRegression(CombineDFs(input_matrix, BernoulliNB(input_matrix, alpha=100.0, fit_prior=False)), C=1.0, dual=False, penalty=l2)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:36<08:54,  6.68s/pipeline]

Generation 1 - Current best internal CV score: 0.8381818181818181


Optimization Progress:  51%|█████     | 61/120 [05:38<1:31:08, 92.69s/pipeline]

Generation 2 - Current best internal CV score: 0.8381818181818181


Optimization Progress:  68%|██████▊   | 82/120 [10:40<1:25:59, 135.78s/pipeline]

Generation 3 - Current best internal CV score: 0.9103030303030304


Optimization Progress:  85%|████████▌ | 102/120 [10:54<20:19, 67.75s/pipeline]  

Generation 4 - Current best internal CV score: 0.9103030303030304


                                                                              

Generation 5 - Current best internal CV score: 0.9103030303030304

Best pipeline: LinearSVC(RFE(input_matrix, criterion=gini, max_features=1.0, n_estimators=100, step=0.7500000000000001), C=0.01, dual=True, loss=hinge, penalty=l2, tol=1e-05)
0.75


Optimization Progress:  34%|███▍      | 41/120 [05:12<2:03:54, 94.10s/pipeline]

Generation 1 - Current best internal CV score: 0.8921212121212122


Optimization Progress:  51%|█████     | 61/120 [05:21<45:53, 46.66s/pipeline]  

Generation 2 - Current best internal CV score: 0.8921212121212122


Optimization Progress:  68%|██████▊   | 81/120 [05:35<09:07, 14.05s/pipeline]

Generation 3 - Current best internal CV score: 0.9109090909090909


Optimization Progress:  85%|████████▌ | 102/120 [10:38<09:45, 32.52s/pipeline]

Generation 4 - Current best internal CV score: 0.9109090909090909


                                                                              

Generation 5 - Current best internal CV score: 0.9109090909090909

Best pipeline: LogisticRegression(input_matrix, C=5.0, dual=False, penalty=l2)
0.7916666666666666


Optimization Progress:  36%|███▌      | 43/120 [10:04<1:45:38, 82.32s/pipeline]

Generation 1 - Current best internal CV score: 0.9333333333333333


Optimization Progress:  53%|█████▎    | 64/120 [15:05<1:50:32, 118.44s/pipeline]

Generation 2 - Current best internal CV score: 0.9333333333333333


Optimization Progress:  70%|███████   | 84/120 [16:08<46:02, 76.72s/pipeline]   

Generation 3 - Current best internal CV score: 0.9333333333333333


Optimization Progress:  87%|████████▋ | 104/120 [16:22<11:09, 41.87s/pipeline]

Generation 4 - Current best internal CV score: 0.9384615384615385


                                                                              

Generation 5 - Current best internal CV score: 0.9384615384615385

Best pipeline: LinearSVC(MinMaxScaler(input_matrix), C=0.5, dual=True, loss=hinge, penalty=l2, tol=0.001)
0.7333333333333333


Optimization Progress:  33%|███▎      | 40/120 [00:30<16:51, 12.64s/pipeline]

Generation 1 - Current best internal CV score: 0.9128205128205128


Optimization Progress:  50%|█████     | 60/120 [00:36<07:07,  7.12s/pipeline]

Generation 2 - Current best internal CV score: 0.9128205128205128


Optimization Progress:  67%|██████▋   | 80/120 [00:46<03:12,  4.81s/pipeline]

Generation 3 - Current best internal CV score: 0.9128205128205128


Optimization Progress:  83%|████████▎ | 100/120 [00:52<00:43,  2.18s/pipeline]

Generation 4 - Current best internal CV score: 0.9282051282051282


                                                                              

Generation 5 - Current best internal CV score: 0.9282051282051282

Best pipeline: LinearSVC(CombineDFs(input_matrix, SelectPercentile(input_matrix, percentile=68)), C=0.1, dual=True, loss=hinge, penalty=l2, tol=0.0001)
0.6833333333333333


Optimization Progress:  33%|███▎      | 40/120 [00:28<13:49, 10.37s/pipeline]

Generation 1 - Current best internal CV score: 0.8666666666666666


Optimization Progress:  50%|█████     | 60/120 [01:00<14:15, 14.27s/pipeline]

Generation 2 - Current best internal CV score: 0.8666666666666666


Optimization Progress:  67%|██████▋   | 80/120 [01:11<05:16,  7.90s/pipeline]

Generation 3 - Current best internal CV score: 0.8666666666666666


Optimization Progress:  83%|████████▎ | 100/120 [04:26<20:14, 60.71s/pipeline]

Generation 4 - Current best internal CV score: 0.8923076923076924


                                                                              

Generation 5 - Current best internal CV score: 0.8923076923076924

Best pipeline: LogisticRegression(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), C=20.0, dual=False, penalty=l1)
0.75


Optimization Progress:  33%|███▎      | 40/120 [01:15<31:47, 23.84s/pipeline] 

Generation 1 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  50%|█████     | 60/120 [01:53<17:25, 17.42s/pipeline]

Generation 2 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  67%|██████▋   | 80/120 [02:13<07:39, 11.50s/pipeline]

Generation 3 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  83%|████████▎ | 100/120 [02:18<02:04,  6.22s/pipeline]

Generation 4 - Current best internal CV score: 0.9025641025641026


                                                                              

Generation 5 - Current best internal CV score: 0.9025641025641026

Best pipeline: LogisticRegression(BernoulliNB(input_matrix, alpha=0.1, fit_prior=True), C=10.0, dual=False, penalty=l1)
0.7833333333333333


Optimization Progress:  33%|███▎      | 40/120 [00:33<13:43, 10.29s/pipeline]

Generation 1 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  50%|█████     | 60/120 [00:44<06:05,  6.10s/pipeline]

Generation 2 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  67%|██████▋   | 80/120 [01:50<14:57, 22.43s/pipeline]

Generation 3 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  83%|████████▎ | 100/120 [02:07<05:19, 15.96s/pipeline]

Generation 4 - Current best internal CV score: 0.8974358974358975


                                                                              

Generation 5 - Current best internal CV score: 0.9025641025641026

Best pipeline: LogisticRegression(ZeroCount(input_matrix), C=0.5, dual=False, penalty=l2)
0.7833333333333333


Optimization Progress:  33%|███▎      | 40/120 [00:58<20:14, 15.18s/pipeline]

Generation 1 - Current best internal CV score: 0.8820512820512821


Optimization Progress:  50%|█████     | 60/120 [01:37<22:33, 22.56s/pipeline]

Generation 2 - Current best internal CV score: 0.8820512820512821


Optimization Progress:  67%|██████▋   | 80/120 [01:45<08:49, 13.23s/pipeline]

Generation 3 - Current best internal CV score: 0.8820512820512821


Optimization Progress:  83%|████████▎ | 100/120 [01:53<02:32,  7.62s/pipeline]

Generation 4 - Current best internal CV score: 0.8871794871794872


                                                                              

Generation 5 - Current best internal CV score: 0.8871794871794872

Best pipeline: LogisticRegression(CombineDFs(input_matrix, input_matrix), C=20.0, dual=False, penalty=l2)
0.7333333333333333


Optimization Progress:  33%|███▎      | 40/120 [00:45<13:23, 10.04s/pipeline]

Generation 1 - Current best internal CV score: 0.876923076923077


Optimization Progress:  50%|█████     | 60/120 [00:58<06:47,  6.79s/pipeline]

Generation 2 - Current best internal CV score: 0.876923076923077


Optimization Progress:  68%|██████▊   | 81/120 [06:01<31:38, 48.67s/pipeline]

Generation 3 - Current best internal CV score: 0.876923076923077


Optimization Progress:  84%|████████▍ | 101/120 [06:11<05:30, 17.38s/pipeline]

Generation 4 - Current best internal CV score: 0.8871794871794872


                                                                              

Generation 5 - Current best internal CV score: 0.8871794871794872

Best pipeline: GradientBoostingClassifier(LogisticRegression(input_matrix, C=0.1, dual=True, penalty=l2), learning_rate=0.01, max_depth=4, max_features=0.8, min_samples_leaf=17, min_samples_split=20, n_estimators=100, subsample=0.8)
0.7


Optimization Progress:  33%|███▎      | 40/120 [00:23<11:07,  8.35s/pipeline]

Generation 1 - Current best internal CV score: 0.8974358974358975


Optimization Progress:  51%|█████     | 61/120 [05:46<51:18, 52.18s/pipeline]  

Generation 2 - Current best internal CV score: 0.8974358974358975


Optimization Progress:  68%|██████▊   | 81/120 [06:00<14:05, 21.68s/pipeline]

Generation 3 - Current best internal CV score: 0.8974358974358975


Optimization Progress:  84%|████████▍ | 101/120 [06:20<05:09, 16.30s/pipeline]

Generation 4 - Current best internal CV score: 0.8974358974358975


                                                                              

Generation 5 - Current best internal CV score: 0.9076923076923077

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=20.0, dual=False, penalty=l2)
0.7166666666666667


Optimization Progress:  33%|███▎      | 40/120 [00:29<14:58, 11.24s/pipeline]

Generation 1 - Current best internal CV score: 0.8923076923076924


Optimization Progress:  50%|█████     | 60/120 [01:34<24:42, 24.70s/pipeline]

Generation 2 - Current best internal CV score: 0.9128205128205129


Optimization Progress:  67%|██████▋   | 80/120 [03:10<24:48, 37.21s/pipeline]

Generation 3 - Current best internal CV score: 0.9128205128205129


Optimization Progress:  83%|████████▎ | 100/120 [05:52<15:29, 46.48s/pipeline]

Generation 4 - Current best internal CV score: 0.9128205128205129


                                                                              

Generation 5 - Current best internal CV score: 0.9128205128205129

Best pipeline: LogisticRegression(input_matrix, C=0.1, dual=True, penalty=l2)
0.7166666666666667


Optimization Progress:  33%|███▎      | 40/120 [00:12<04:55,  3.69s/pipeline]

Generation 1 - Current best internal CV score: 0.876923076923077


Optimization Progress:  50%|█████     | 60/120 [01:10<12:49, 12.82s/pipeline]

Generation 2 - Current best internal CV score: 0.876923076923077


Optimization Progress:  67%|██████▋   | 80/120 [01:53<10:09, 15.24s/pipeline]

Generation 3 - Current best internal CV score: 0.8871794871794872


Optimization Progress:  83%|████████▎ | 100/120 [02:42<04:11, 12.56s/pipeline]

Generation 4 - Current best internal CV score: 0.8871794871794872


                                                                              

Generation 5 - Current best internal CV score: 0.8871794871794872

Best pipeline: GaussianNB(LinearSVC(input_matrix, C=25.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.001))
0.8


Optimization Progress:  34%|███▍      | 41/120 [06:18<2:25:27, 110.47s/pipeline]

Generation 1 - Current best internal CV score: 0.8128205128205128


Optimization Progress:  51%|█████     | 61/120 [06:48<57:28, 58.46s/pipeline]   

Generation 2 - Current best internal CV score: 0.8179487179487179


Optimization Progress:  68%|██████▊   | 81/120 [07:22<21:54, 33.71s/pipeline]

Generation 3 - Current best internal CV score: 0.8179487179487179


Optimization Progress:  84%|████████▍ | 101/120 [08:17<10:19, 32.61s/pipeline]

Generation 4 - Current best internal CV score: 0.823076923076923


                                                                              

Generation 5 - Current best internal CV score: 0.8282051282051283

Best pipeline: LogisticRegression(ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.25, min_samples_leaf=12, min_samples_split=7, n_estimators=100), C=1.0, dual=False, penalty=l2)
0.7166666666666667


Optimization Progress:  34%|███▍      | 41/120 [08:51<4:04:55, 186.02s/pipeline]

Generation 1 - Current best internal CV score: 0.823076923076923


Optimization Progress:  52%|█████▏    | 62/120 [13:56<1:50:11, 113.99s/pipeline]

Generation 2 - Current best internal CV score: 0.8307692307692307


Optimization Progress:  68%|██████▊   | 82/120 [15:44<44:59, 71.05s/pipeline]   

Generation 3 - Current best internal CV score: 0.8410256410256409


Optimization Progress:  85%|████████▌ | 102/120 [16:40<12:56, 43.16s/pipeline]

Generation 4 - Current best internal CV score: 0.8410256410256409


                                                                              

Generation 5 - Current best internal CV score: 0.8410256410256409

Best pipeline: LogisticRegression(MaxAbsScaler(RobustScaler(input_matrix)), C=0.5, dual=False, penalty=l2)
0.6416666666666667


Optimization Progress:  33%|███▎      | 40/120 [06:39<2:44:35, 123.44s/pipeline]

Generation 1 - Current best internal CV score: 0.8


Optimization Progress:  50%|█████     | 60/120 [09:35<1:39:38, 99.64s/pipeline] 

Generation 2 - Current best internal CV score: 0.8


Optimization Progress:  67%|██████▋   | 80/120 [10:00<37:14, 55.86s/pipeline]  

Generation 3 - Current best internal CV score: 0.8


Optimization Progress:  83%|████████▎ | 100/120 [11:01<15:11, 45.57s/pipeline]

Generation 4 - Current best internal CV score: 0.8051282051282052


                                                                              

Generation 5 - Current best internal CV score: 0.8205128205128206

Best pipeline: LogisticRegression(KNeighborsClassifier(input_matrix, n_neighbors=28, p=2, weights=uniform), C=0.1, dual=True, penalty=l2)
0.6833333333333333


Optimization Progress:  33%|███▎      | 40/120 [02:12<56:07, 42.09s/pipeline]  

Generation 1 - Current best internal CV score: 0.7512820512820514


Optimization Progress:  50%|█████     | 60/120 [02:26<21:33, 21.56s/pipeline]

Generation 2 - Current best internal CV score: 0.817948717948718


Optimization Progress:  67%|██████▋   | 80/120 [02:48<07:01, 10.53s/pipeline]

Generation 3 - Current best internal CV score: 0.817948717948718


Optimization Progress:  84%|████████▍ | 101/120 [07:50<30:19, 95.75s/pipeline] 

Generation 4 - Current best internal CV score: 0.817948717948718


                                                                              

Generation 5 - Current best internal CV score: 0.817948717948718

Best pipeline: GradientBoostingClassifier(LogisticRegression(input_matrix, C=1.0, dual=False, penalty=l1), learning_rate=0.001, max_depth=8, max_features=0.7000000000000001, min_samples_leaf=18, min_samples_split=19, n_estimators=100, subsample=0.35000000000000003)
0.6916666666666667


Optimization Progress:  35%|███▌      | 42/120 [10:05<2:55:58, 135.37s/pipeline]

Generation 1 - Current best internal CV score: 0.8358974358974358


Optimization Progress:  52%|█████▏    | 62/120 [11:05<1:40:18, 103.78s/pipeline]

Generation 2 - Current best internal CV score: 0.8384615384615384


Optimization Progress:  68%|██████▊   | 82/120 [11:37<19:57, 31.51s/pipeline]   

Generation 3 - Current best internal CV score: 0.8461538461538461


Optimization Progress:  86%|████████▌ | 103/120 [16:50<19:32, 68.99s/pipeline]

Generation 4 - Current best internal CV score: 0.8564102564102564


                                                                              

Generation 5 - Current best internal CV score: 0.8564102564102564

Best pipeline: LogisticRegression(LogisticRegression(input_matrix, C=20.0, dual=True, penalty=l2), C=0.01, dual=True, penalty=l2)
0.7333333333333333


Optimization Progress:  33%|███▎      | 40/120 [01:22<41:10, 30.89s/pipeline] 

Generation 1 - Current best internal CV score: 0.8153846153846154


Optimization Progress:  51%|█████     | 61/120 [06:25<1:39:38, 101.32s/pipeline]

Generation 2 - Current best internal CV score: 0.8179487179487179


Optimization Progress:  68%|██████▊   | 81/120 [07:32<35:22, 54.42s/pipeline]   

Generation 3 - Current best internal CV score: 0.8307692307692308


Optimization Progress:  84%|████████▍ | 101/120 [07:46<07:02, 22.26s/pipeline]

Generation 4 - Current best internal CV score: 0.8307692307692308


                                                                              

Generation 5 - Current best internal CV score: 0.8307692307692308

Best pipeline: LinearSVC(GaussianNB(input_matrix), C=0.001, dual=False, loss=squared_hinge, penalty=l2, tol=0.01)
0.6833333333333333


Optimization Progress:  35%|███▌      | 42/120 [08:48<3:38:40, 168.21s/pipeline]

Generation 1 - Current best internal CV score: 0.8307692307692307


Optimization Progress:  52%|█████▎    | 63/120 [13:51<2:44:28, 173.13s/pipeline]

Generation 2 - Current best internal CV score: 0.8358974358974359


Optimization Progress:  69%|██████▉   | 83/120 [14:35<1:00:11, 97.61s/pipeline] 

Generation 3 - Current best internal CV score: 0.8358974358974359


Optimization Progress:  87%|████████▋ | 104/120 [19:39<21:03, 78.96s/pipeline] 

Generation 4 - Current best internal CV score: 0.8358974358974359


                                                                              

Generation 5 - Current best internal CV score: 0.8435897435897436

Best pipeline: ExtraTreesClassifier(LogisticRegression(LogisticRegression(input_matrix, C=0.5, dual=False, penalty=l2), C=0.5, dual=False, penalty=l2), bootstrap=True, criterion=gini, max_features=0.35000000000000003, min_samples_leaf=5, min_samples_split=7, n_estimators=100)
0.6666666666666666


Optimization Progress:  33%|███▎      | 40/120 [02:18<1:18:52, 59.16s/pipeline]

Generation 1 - Current best internal CV score: 0.8282051282051283


Optimization Progress:  50%|█████     | 60/120 [04:47<1:13:10, 73.17s/pipeline]

Generation 2 - Current best internal CV score: 0.8282051282051283


Optimization Progress:  68%|██████▊   | 81/120 [09:50<1:22:21, 126.71s/pipeline]

Generation 3 - Current best internal CV score: 0.8282051282051283


Optimization Progress:  85%|████████▌ | 102/120 [15:04<46:52, 156.27s/pipeline] 

Generation 4 - Current best internal CV score: 0.8282051282051283


                                                                               

Generation 5 - Current best internal CV score: 0.8307692307692307

Best pipeline: LinearSVC(GradientBoostingClassifier(input_matrix, learning_rate=0.5, max_depth=1, max_features=0.3, min_samples_leaf=14, min_samples_split=18, n_estimators=100, subsample=0.9500000000000001), C=0.001, dual=True, loss=squared_hinge, penalty=l2, tol=1e-05)
0.675


Optimization Progress:  33%|███▎      | 40/120 [03:08<1:08:04, 51.06s/pipeline]

Generation 1 - Current best internal CV score: 0.8051282051282052


Optimization Progress:  50%|█████     | 60/120 [03:47<25:39, 25.65s/pipeline]  

Generation 2 - Current best internal CV score: 0.8051282051282052


Optimization Progress:  67%|██████▋   | 80/120 [04:34<21:13, 31.83s/pipeline]

Generation 3 - Current best internal CV score: 0.8051282051282052


Optimization Progress:  83%|████████▎ | 100/120 [07:42<10:13, 30.67s/pipeline]

Generation 4 - Current best internal CV score: 0.8076923076923077


                                                                              

Generation 5 - Current best internal CV score: 0.8076923076923077

Best pipeline: XGBClassifier(LogisticRegression(GaussianNB(input_matrix), C=25.0, dual=False, penalty=l2), learning_rate=0.1, max_depth=3, min_child_weight=15, n_estimators=100, nthread=1, subsample=0.8)


  if diff:


0.6833333333333333


Optimization Progress:  37%|███▋      | 44/120 [10:06<2:00:18, 94.97s/pipeline]

Generation 1 - Current best internal CV score: 0.7794871794871794


Optimization Progress:  53%|█████▎    | 64/120 [12:08<1:17:06, 82.62s/pipeline]

Generation 2 - Current best internal CV score: 0.7974358974358974


Optimization Progress:  71%|███████   | 85/120 [17:40<36:52, 63.20s/pipeline]  

Generation 3 - Current best internal CV score: 0.8


Optimization Progress:  88%|████████▊ | 105/120 [18:33<14:59, 59.99s/pipeline]

Generation 4 - Current best internal CV score: 0.8


                                                                              

Generation 5 - Current best internal CV score: 0.8025641025641026

Best pipeline: GradientBoostingClassifier(GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=4, max_features=0.8, min_samples_leaf=17, min_samples_split=11, n_estimators=100, subsample=0.1), learning_rate=1.0, max_depth=1, max_features=0.9000000000000001, min_samples_leaf=12, min_samples_split=16, n_estimators=100, subsample=1.0)
0.6666666666666666


In [6]:
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)

    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'
    #=============================================================================================================
    #do the learning
    # TODO: extract method
    with open(folder + fileName) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    # content = [x.strip() for x in content] 

    #  or 'exported_pipeline = ' not in line
    cleanedContent = []
    for line in content:
        if 'tpot_data' not in line and 'training_target, testing_target' not in line:
            cleanedContent.append(line)
    #         content.remove(line)
    
    fileNameCleaned = folder + 'cleaned_' + fileName
    with open(fileNameCleaned, 'w') as filehandle:  
        for line in cleanedContent:
            filehandle.write('%s\n' % line)
            
    %run -i $fileNameCleaned
    
    # print the accuracy
    # TODO: extract method
    # TODO: print all accurracy values to one file
    num_matches = 0;
    for a, b in zip(testing_target, results):
        if a == b:
            num_matches = num_matches + 1
        else:
            unmatched.append((a,b))
    print('Number of matches:',num_matches,'(of',testing_target.size,')')

    accuracy = num_matches/testing_target.size*100
    print('Accuary: ',accuracy)
    #=============================================================================================================
    
    #=============================================================================================================
    #collect all accuracy values to plot them
    collectAccuracies.append(accuracy)

    offset = offset + 1

Number of matches: 18 (of 24 )
Accuary:  75.0


  if diff:
  if diff:


Number of matches: 20 (of 24 )
Accuary:  83.33333333333334
Number of matches: 18 (of 24 )
Accuary:  75.0
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 17 (of 24 )
Accuary:  70.83333333333334
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 36 (of 48 )
Accuary:  75.0
Number of matches: 38 (of 48 )
Accuary:  79.16666666666666
Number of matches: 35 (of 48 )
Accuary:  72.91666666666666
Number of matches: 33 (of 48 )
Accuary:  68.75
Number of matches: 34 (of 48 )
Accuary:  70.83333333333334
Number of matches: 37 (of 48 )
Accuary:  77.08333333333334
Number of matches: 36 (of 48 )
Accuary:  75.0
Number of matches: 38 (of 48 )
Accuary:  79.16666666666666
Number of matches: 36 (of 48 )
Accuary:  75.0
Nu

  if diff:


Number of matches: 82 (of 120 )
Accuary:  68.33333333333333
Number of matches: 71 (of 120 )
Accuary:  59.166666666666664


In [7]:
# total number of (actual, predicted) pairs collected in `unmatched`, i.e. misclassified samples
len(unmatched)

719

In [8]:
# Tally the misclassified pairs by which two labels were confused; the direction
# (which one was the actual and which the predicted label) is ignored.
# Label codes as implied by the printed names: 0 = Comprehension, 1 = Rest,
# 2 = Syntax — TODO confirm against the data.
countComprehensionRest = sum(1 for pair in unmatched if pair in ((0, 1), (1, 0)))
countComprehensionSyntax = sum(1 for pair in unmatched if pair in ((0, 2), (2, 0)))
countRestSyntax = sum(1 for pair in unmatched if pair in ((1, 2), (2, 1)))

print('Comprehension/Rest', countComprehensionRest)
print('Comprehension/Syntax', countComprehensionSyntax)
print('Rest/Syntax', countRestSyntax)

Comprehension/Rest 136
Comprehension/Syntax 152
Rest/Syntax 431


In [9]:
def split_list(alist, wanted_parts=1):
    """Split a sequence into consecutive slices of (nearly) equal length.

    alist: the sequence to split.
    wanted_parts: into how many parts it should be split.

    Returns a list with wanted_parts slices of alist, in order. When the length
    is not evenly divisible, the slice boundaries are floored, so later parts
    may be one element longer than earlier ones.
    """
    total = len(alist)
    parts = []
    for part in range(wanted_parts):
        start = part * total // wanted_parts
        stop = (part + 1) * total // wanted_parts
        parts.append(alist[start:stop])
    return parts


In [10]:
# Cut the flat list of accuracies into one chunk per aggregation level and
# store them side by side, one column per level.
# NOTE(review): the unpacking and the column names below require exactly four
# chunks, i.e. numAggLevels == 4 — confirm where numAggLevels is defined.
frames = split_list(collectAccuracies, numAggLevels)
list1, list2, list3, list4 = frames

# each chunk starts out as a row; transpose so that each chunk becomes a column
collectedAccuracies = pd.DataFrame(frames).transpose()
collectedAccuracies.columns = [
    '2_groups_act',
    '4_groups_act',
    '5_groups_act',
    '10_groups_act',
]

collectedAccuracies.to_csv('triangulate_Act.csv', index=False, sep=';', decimal='.')