In [10]:
# Pandas is used for data manipulation
import pandas as pd
import numpy as np
from tpot import TPOTClassifier

# Read in data and display first 5 rows
z_data = pd.read_csv('esem_act_deact_zscore_groups.csv',sep=';',decimal='.')
z_data.drop(list(z_data.filter(regex = '_act')),axis = 1, inplace = True)

z_data = z_data.drop('scan', axis = 1)
z_data = z_data.drop('trial', axis = 1)
z_data = z_data.drop('response', axis = 1)

numLabels = 3 # num of different labels (comprehension and rest)
sizeTrainSet = 13 # num of participants used for training
sizeTestSet = 4 # num participants used for testing
numParticipants = sizeTrainSet + sizeTestSet

In [11]:
#group the data and get the labels after the grouping
#return the grouped data and the labels
def prepare(data,groupingColumns):
    grouped = data.groupby(groupingColumns)
    groupedAgg = grouped.aggregate(np.mean)
    labels = groupedAgg.index.get_level_values(level='task')

    return groupedAgg,labels

In [12]:
#split according to participants (first 12 and last 4)
#return all four sets (much like train_test_split from sklearn)
#TODO: introduce variation (see file coarseAverageParticipantSplit)

def split(features, labels, low, high):
    #first, drop everything non-numeric ===========================================================
    
    #=== creating training and validation set ======================================================
    # Saving feature names for later use
    feature_list = list(features.columns)

    # Convert to numpy array
    features = np.array(features)
    labels = np.array(labels)

    #get the rows excluding the last indexed (i.e., row with the index 587 is the last to include)
    training_features = features[0:low]
    training_features = np.array(training_features)
    testing_features = features[low:high]
    testing_features = np.array(testing_features)

    training_target = labels[0:low]
    training_target = np.array(training_target)
    testing_target = labels[low:high]
    testing_target = np.array(testing_target)
    #=== end creating training and validation set ===================================================
    
    return training_features, testing_features, training_target, testing_target

In [13]:
def findIndices(name):
    numGroups = int(name.split('_')[1])
    low = sizeTrainSet*numGroups*numLabels
    high = numParticipants*numGroups*numLabels
    return low, high

In [14]:
#define the columns to group by
groupingColumn1 = 'proband'
groupingColumn2 = 'task'
groupingColumn3 = ''

#define how many different variants of aggregation and aggregation levels
numVariationsPerAggLevel = 10
numAggLevels = 4
numAggregationVariants = numVariationsPerAggLevel * numAggLevels

#get the index of the starting column that contains the labels of the groupings
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)
    #=============================================================================================================
    
    folder = 'Deact/'
    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'
    #=============================================================================================================
    # run t_pot
    # TODO: extract method
    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,n_jobs=20)
    tpot.fit(training_features, training_target)
    print(tpot.score(testing_features, testing_target))
    tpot.export(folder + fileName)
    #=============================================================================================================    
        
    offset = offset + 1

Optimization Progress:  33%|███▎      | 40/120 [00:11<02:48,  2.10s/pipeline]

Generation 1 - Current best internal CV score: 0.6822222222222222


Optimization Progress:  50%|█████     | 60/120 [00:15<01:23,  1.39s/pipeline]

Generation 2 - Current best internal CV score: 0.6822222222222222


Optimization Progress:  67%|██████▋   | 80/120 [00:17<00:32,  1.22pipeline/s]

Generation 3 - Current best internal CV score: 0.6888888888888889


Optimization Progress:  83%|████████▎ | 100/120 [00:19<00:13,  1.43pipeline/s]

Generation 4 - Current best internal CV score: 0.72


                                                                              

Generation 5 - Current best internal CV score: 0.72

Best pipeline: GaussianNB(LogisticRegression(input_matrix, C=10.0, dual=False, penalty=l1))
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:06<01:30,  1.13s/pipeline]

Generation 1 - Current best internal CV score: 0.7288888888888889


Optimization Progress:  50%|█████     | 60/120 [00:16<00:58,  1.02pipeline/s]

Generation 2 - Current best internal CV score: 0.7288888888888889


Optimization Progress:  67%|██████▋   | 80/120 [00:21<00:46,  1.15s/pipeline]

Generation 3 - Current best internal CV score: 0.7288888888888889


Optimization Progress:  83%|████████▎ | 100/120 [00:33<00:34,  1.74s/pipeline]

Generation 4 - Current best internal CV score: 0.7511111111111111


                                                                              

Generation 5 - Current best internal CV score: 0.7511111111111111

Best pipeline: XGBClassifier(RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.55, min_samples_leaf=17, min_samples_split=19, n_estimators=100), learning_rate=0.5, max_depth=8, min_child_weight=8, n_estimators=100, nthread=1, subsample=0.6000000000000001)


  if diff:


0.5833333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:06<01:54,  1.44s/pipeline]

Generation 1 - Current best internal CV score: 0.7155555555555556


Optimization Progress:  50%|█████     | 60/120 [00:08<00:45,  1.33pipeline/s]

Generation 2 - Current best internal CV score: 0.7155555555555556


Optimization Progress:  67%|██████▋   | 80/120 [00:43<05:13,  7.83s/pipeline]

Generation 3 - Current best internal CV score: 0.7266666666666667


Optimization Progress:  83%|████████▎ | 100/120 [00:48<01:57,  5.86s/pipeline]

Generation 4 - Current best internal CV score: 0.7266666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.74

Best pipeline: LinearSVC(CombineDFs(input_matrix, input_matrix), C=1.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.001)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:46<19:17, 14.47s/pipeline]

Generation 1 - Current best internal CV score: 0.7


Optimization Progress:  50%|█████     | 60/120 [00:50<07:43,  7.72s/pipeline]

Generation 2 - Current best internal CV score: 0.7


Optimization Progress:  67%|██████▋   | 80/120 [01:03<02:37,  3.94s/pipeline]

Generation 3 - Current best internal CV score: 0.7


Optimization Progress:  83%|████████▎ | 100/120 [01:05<00:40,  2.02s/pipeline]

Generation 4 - Current best internal CV score: 0.7


                                                                              

Generation 5 - Current best internal CV score: 0.7

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.001, max_depth=8, max_features=0.3, min_samples_leaf=15, min_samples_split=16, n_estimators=100, subsample=0.5)
0.4583333333333333


Optimization Progress:  33%|███▎      | 40/120 [00:04<01:22,  1.03s/pipeline]

Generation 1 - Current best internal CV score: 0.7511111111111111


Optimization Progress:  50%|█████     | 60/120 [00:17<01:44,  1.75s/pipeline]

Generation 2 - Current best internal CV score: 0.7533333333333333


Optimization Progress:  67%|██████▋   | 80/120 [00:20<00:53,  1.33s/pipeline]

Generation 3 - Current best internal CV score: 0.7533333333333333


Optimization Progress:  83%|████████▎ | 100/120 [00:33<00:22,  1.14s/pipeline]

Generation 4 - Current best internal CV score: 0.7666666666666667


  if diff:


Generation 5 - Current best internal CV score: 0.7666666666666667

Best pipeline: XGBClassifier(LogisticRegression(input_matrix, C=5.0, dual=False, penalty=l1), learning_rate=0.01, max_depth=1, min_child_weight=4, n_estimators=100, nthread=1, subsample=0.35000000000000003)
0.75


Optimization Progress:  33%|███▎      | 40/120 [00:06<02:51,  2.14s/pipeline]

Generation 1 - Current best internal CV score: 0.6977777777777778


Optimization Progress:  50%|█████     | 60/120 [00:14<02:03,  2.06s/pipeline]

Generation 2 - Current best internal CV score: 0.6977777777777778


Optimization Progress:  67%|██████▋   | 80/120 [00:15<00:44,  1.12s/pipeline]

Generation 3 - Current best internal CV score: 0.6977777777777778


Optimization Progress:  83%|████████▎ | 100/120 [00:23<00:25,  1.26s/pipeline]

Generation 4 - Current best internal CV score: 0.6977777777777778


                                                                              

Generation 5 - Current best internal CV score: 0.6977777777777778

Best pipeline: BernoulliNB(input_matrix, alpha=1.0, fit_prior=True)
0.5416666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:52<10:42,  8.03s/pipeline]

Generation 1 - Current best internal CV score: 0.7066666666666666


Optimization Progress:  50%|█████     | 60/120 [00:56<04:16,  4.28s/pipeline]

Generation 2 - Current best internal CV score: 0.7155555555555557


Optimization Progress:  67%|██████▋   | 80/120 [00:59<01:50,  2.76s/pipeline]

Generation 3 - Current best internal CV score: 0.7155555555555557


Optimization Progress:  83%|████████▎ | 100/120 [01:03<00:37,  1.85s/pipeline]

Generation 4 - Current best internal CV score: 0.7155555555555557


                                                                              

Generation 5 - Current best internal CV score: 0.7155555555555557

Best pipeline: ExtraTreesClassifier(LogisticRegression(input_matrix, C=5.0, dual=False, penalty=l2), bootstrap=False, criterion=gini, max_features=0.9000000000000001, min_samples_leaf=14, min_samples_split=18, n_estimators=100)
0.6666666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:26<07:45,  5.81s/pipeline]

Generation 1 - Current best internal CV score: 0.7177777777777778


Optimization Progress:  50%|█████     | 60/120 [00:30<03:10,  3.18s/pipeline]

Generation 2 - Current best internal CV score: 0.7177777777777778


Optimization Progress:  67%|██████▋   | 80/120 [00:34<01:18,  1.97s/pipeline]

Generation 3 - Current best internal CV score: 0.7177777777777778


Optimization Progress:  83%|████████▎ | 100/120 [00:42<00:30,  1.51s/pipeline]

Generation 4 - Current best internal CV score: 0.72


                                                                              

Generation 5 - Current best internal CV score: 0.72

Best pipeline: LinearSVC(CombineDFs(input_matrix, input_matrix), C=15.0, dual=False, loss=squared_hinge, penalty=l1, tol=1e-05)
0.7083333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:06<00:47,  1.68pipeline/s]

Generation 1 - Current best internal CV score: 0.7022222222222222


Optimization Progress:  50%|█████     | 60/120 [00:10<00:39,  1.52pipeline/s]

Generation 2 - Current best internal CV score: 0.7022222222222222


Optimization Progress:  67%|██████▋   | 80/120 [00:14<00:16,  2.42pipeline/s]

Generation 3 - Current best internal CV score: 0.7599999999999999


Optimization Progress:  83%|████████▎ | 100/120 [00:16<00:06,  2.90pipeline/s]

Generation 4 - Current best internal CV score: 0.7599999999999999


                                                                              

Generation 5 - Current best internal CV score: 0.7599999999999999

Best pipeline: LinearSVC(input_matrix, C=1.0, dual=False, loss=squared_hinge, penalty=l2, tol=0.0001)
0.6666666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:04<00:34,  2.30pipeline/s]

Generation 1 - Current best internal CV score: 0.7066666666666667


Optimization Progress:  50%|█████     | 60/120 [00:08<00:27,  2.14pipeline/s]

Generation 2 - Current best internal CV score: 0.7066666666666667


Optimization Progress:  67%|██████▋   | 80/120 [00:10<00:14,  2.85pipeline/s]

Generation 3 - Current best internal CV score: 0.7333333333333334


Optimization Progress:  83%|████████▎ | 100/120 [00:50<02:03,  6.18s/pipeline]

Generation 4 - Current best internal CV score: 0.7488888888888889


                                                                              

Generation 5 - Current best internal CV score: 0.7488888888888889

Best pipeline: LogisticRegression(MinMaxScaler(input_matrix), C=15.0, dual=True, penalty=l2)
0.6666666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:56<21:31, 16.14s/pipeline] 

Generation 1 - Current best internal CV score: 0.6284848484848485


Optimization Progress:  50%|█████     | 60/120 [01:56<20:13, 20.23s/pipeline]

Generation 2 - Current best internal CV score: 0.6399999999999999


Optimization Progress:  67%|██████▋   | 80/120 [02:27<12:29, 18.73s/pipeline]

Generation 3 - Current best internal CV score: 0.6533333333333333


Optimization Progress:  83%|████████▎ | 100/120 [03:13<05:19, 16.00s/pipeline]

Generation 4 - Current best internal CV score: 0.6533333333333333


                                                                              

Generation 5 - Current best internal CV score: 0.6533333333333333

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.1, min_samples_leaf=6, min_samples_split=15, n_estimators=100)
0.5


Optimization Progress:  33%|███▎      | 40/120 [01:37<52:02, 39.03s/pipeline]  

Generation 1 - Current best internal CV score: 0.6781818181818182


Optimization Progress:  50%|█████     | 60/120 [01:49<22:27, 22.46s/pipeline]

Generation 2 - Current best internal CV score: 0.6781818181818182


Optimization Progress:  67%|██████▋   | 80/120 [04:30<38:59, 58.49s/pipeline]

Generation 3 - Current best internal CV score: 0.6781818181818182


Optimization Progress:  83%|████████▎ | 100/120 [09:28<38:59, 116.96s/pipeline] 

Generation 4 - Current best internal CV score: 0.6781818181818182


                                                                               

Generation 5 - Current best internal CV score: 0.6781818181818182

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.4, min_samples_leaf=18, min_samples_split=5, n_estimators=100)
0.4583333333333333


Optimization Progress:  33%|███▎      | 40/120 [03:55<1:37:48, 73.35s/pipeline]

Generation 1 - Current best internal CV score: 0.6709090909090909


Optimization Progress:  50%|█████     | 60/120 [07:34<1:41:16, 101.27s/pipeline]

Generation 2 - Current best internal CV score: 0.6775757575757576


Optimization Progress:  68%|██████▊   | 81/120 [12:36<1:21:16, 125.04s/pipeline]

Generation 3 - Current best internal CV score: 0.6775757575757576


Optimization Progress:  84%|████████▍ | 101/120 [17:24<46:43, 147.55s/pipeline] 

Generation 4 - Current best internal CV score: 0.6775757575757576


                                                                               

Generation 5 - Current best internal CV score: 0.6781818181818181

Best pipeline: LogisticRegression(input_matrix, C=10.0, dual=False, penalty=l2)
0.6458333333333334


Optimization Progress:  33%|███▎      | 40/120 [01:39<1:00:09, 45.11s/pipeline]

Generation 1 - Current best internal CV score: 0.615151515151515


Optimization Progress:  50%|█████     | 60/120 [01:45<23:36, 23.61s/pipeline]  

Generation 2 - Current best internal CV score: 0.615151515151515


Optimization Progress:  67%|██████▋   | 80/120 [01:49<08:01, 12.03s/pipeline]

Generation 3 - Current best internal CV score: 0.615151515151515


Optimization Progress:  83%|████████▎ | 100/120 [02:20<03:30, 10.50s/pipeline]

Generation 4 - Current best internal CV score: 0.615151515151515


                                                                              

Generation 5 - Current best internal CV score: 0.615151515151515

Best pipeline: BernoulliNB(input_matrix, alpha=100.0, fit_prior=False)
0.4791666666666667


Optimization Progress:  33%|███▎      | 40/120 [00:16<05:32,  4.16s/pipeline]

Generation 1 - Current best internal CV score: 0.6757575757575758


Optimization Progress:  50%|█████     | 60/120 [00:51<12:32, 12.54s/pipeline]

Generation 2 - Current best internal CV score: 0.6757575757575758


Optimization Progress:  67%|██████▋   | 80/120 [01:02<03:35,  5.39s/pipeline]

Generation 3 - Current best internal CV score: 0.681212121212121


Optimization Progress:  83%|████████▎ | 100/120 [01:09<01:23,  4.17s/pipeline]

Generation 4 - Current best internal CV score: 0.681212121212121


                                                                              

Generation 5 - Current best internal CV score: 0.681212121212121

Best pipeline: LinearSVC(MaxAbsScaler(input_matrix), C=0.1, dual=False, loss=squared_hinge, penalty=l1, tol=0.1)
0.5625


Optimization Progress:  33%|███▎      | 40/120 [00:37<14:04, 10.56s/pipeline]

Generation 1 - Current best internal CV score: 0.6775757575757575


Optimization Progress:  50%|█████     | 60/120 [00:40<04:00,  4.01s/pipeline]

Generation 2 - Current best internal CV score: 0.6775757575757575


Optimization Progress:  67%|██████▋   | 80/120 [01:14<05:12,  7.82s/pipeline]

Generation 3 - Current best internal CV score: 0.6775757575757575


Optimization Progress:  83%|████████▎ | 100/120 [01:47<02:22,  7.14s/pipeline]

Generation 4 - Current best internal CV score: 0.6775757575757575


                                                                              

Generation 5 - Current best internal CV score: 0.6775757575757575

Best pipeline: LogisticRegression(input_matrix, C=25.0, dual=False, penalty=l1)
0.7916666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:08<03:02,  2.28s/pipeline]

Generation 1 - Current best internal CV score: 0.6575757575757576


Optimization Progress:  50%|█████     | 60/120 [00:33<07:55,  7.92s/pipeline]

Generation 2 - Current best internal CV score: 0.6575757575757576


Optimization Progress:  67%|██████▋   | 80/120 [00:47<05:08,  7.70s/pipeline]

Generation 3 - Current best internal CV score: 0.6684848484848483


Optimization Progress:  83%|████████▎ | 100/120 [01:10<03:30, 10.53s/pipeline]

Generation 4 - Current best internal CV score: 0.6684848484848483


                                                                              

Generation 5 - Current best internal CV score: 0.6684848484848483

Best pipeline: LogisticRegression(ZeroCount(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.45, min_samples_leaf=6, min_samples_split=4, n_estimators=100)), C=0.01, dual=False, penalty=l2)
0.6041666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:20<09:41,  7.27s/pipeline]

Generation 1 - Current best internal CV score: 0.6593939393939394


Optimization Progress:  50%|█████     | 60/120 [01:42<15:46, 15.77s/pipeline]

Generation 2 - Current best internal CV score: 0.66


Optimization Progress:  67%|██████▋   | 80/120 [01:50<06:23,  9.58s/pipeline]

Generation 3 - Current best internal CV score: 0.66


Optimization Progress:  83%|████████▎ | 100/120 [01:59<01:59,  5.98s/pipeline]

Generation 4 - Current best internal CV score: 0.6666666666666666


                                                                              

Generation 5 - Current best internal CV score: 0.6666666666666666

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.25, min_samples_leaf=3, min_samples_split=9, n_estimators=100)
0.4583333333333333


Optimization Progress:  33%|███▎      | 40/120 [00:18<04:30,  3.38s/pipeline]

Generation 1 - Current best internal CV score: 0.6539393939393939


Optimization Progress:  50%|█████     | 60/120 [00:50<12:00, 12.00s/pipeline]

Generation 2 - Current best internal CV score: 0.6842424242424243


Optimization Progress:  67%|██████▋   | 80/120 [00:58<04:25,  6.64s/pipeline]

Generation 3 - Current best internal CV score: 0.6842424242424243


Optimization Progress:  83%|████████▎ | 100/120 [01:31<04:17, 12.88s/pipeline]

Generation 4 - Current best internal CV score: 0.6842424242424243


                                                                              

Generation 5 - Current best internal CV score: 0.6842424242424243

Best pipeline: LinearSVC(input_matrix, C=0.1, dual=False, loss=squared_hinge, penalty=l1, tol=1e-05)
0.5625


Optimization Progress:  33%|███▎      | 40/120 [00:31<11:49,  8.87s/pipeline]

Generation 1 - Current best internal CV score: 0.6612121212121213


Optimization Progress:  50%|█████     | 60/120 [00:39<06:40,  6.67s/pipeline]

Generation 2 - Current best internal CV score: 0.6612121212121213


Optimization Progress:  67%|██████▋   | 80/120 [00:48<02:32,  3.82s/pipeline]

Generation 3 - Current best internal CV score: 0.6666666666666667


Optimization Progress:  83%|████████▎ | 100/120 [00:55<00:35,  1.75s/pipeline]

Generation 4 - Current best internal CV score: 0.6666666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.6666666666666667

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.01, max_depth=9, max_features=0.9000000000000001, min_samples_leaf=1, min_samples_split=6, n_estimators=100, subsample=0.55)
0.5208333333333334


Optimization Progress:  33%|███▎      | 40/120 [01:31<54:33, 40.92s/pipeline]  

Generation 1 - Current best internal CV score: 0.6410256410256411


Optimization Progress:  50%|█████     | 60/120 [01:51<16:07, 16.13s/pipeline]

Generation 2 - Current best internal CV score: 0.6410256410256411


Optimization Progress:  67%|██████▋   | 80/120 [01:59<07:48, 11.72s/pipeline]

Generation 3 - Current best internal CV score: 0.6564102564102564


Optimization Progress:  83%|████████▎ | 100/120 [02:05<02:03,  6.16s/pipeline]

Generation 4 - Current best internal CV score: 0.6564102564102564


                                                                              

Generation 5 - Current best internal CV score: 0.6564102564102564

Best pipeline: XGBClassifier(LinearSVC(input_matrix, C=5.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.1), learning_rate=0.1, max_depth=2, min_child_weight=13, n_estimators=100, nthread=1, subsample=0.45)


  if diff:


0.6333333333333333


Optimization Progress:  33%|███▎      | 40/120 [00:11<02:50,  2.13s/pipeline]

Generation 1 - Current best internal CV score: 0.6410256410256411


Optimization Progress:  50%|█████     | 60/120 [00:18<02:00,  2.00s/pipeline]

Generation 2 - Current best internal CV score: 0.6512820512820513


Optimization Progress:  67%|██████▋   | 80/120 [00:51<04:57,  7.45s/pipeline]

Generation 3 - Current best internal CV score: 0.6615384615384615


Optimization Progress:  83%|████████▎ | 100/120 [01:00<01:54,  5.75s/pipeline]

Generation 4 - Current best internal CV score: 0.6666666666666666


                                                                              

Generation 5 - Current best internal CV score: 0.6820512820512821

Best pipeline: GradientBoostingClassifier(StandardScaler(DecisionTreeClassifier(input_matrix, criterion=gini, max_depth=5, min_samples_leaf=20, min_samples_split=20)), learning_rate=0.01, max_depth=7, max_features=0.7000000000000001, min_samples_leaf=7, min_samples_split=18, n_estimators=100, subsample=0.6000000000000001)
0.5666666666666667


Optimization Progress:  33%|███▎      | 40/120 [00:13<01:16,  1.04pipeline/s]

Generation 1 - Current best internal CV score: 0.6358974358974359


Optimization Progress:  50%|█████     | 60/120 [00:16<00:51,  1.17pipeline/s]

Generation 2 - Current best internal CV score: 0.6461538461538462


Optimization Progress:  67%|██████▋   | 80/120 [00:17<00:25,  1.60pipeline/s]

Generation 3 - Current best internal CV score: 0.6871794871794872


Optimization Progress:  83%|████████▎ | 100/120 [00:29<00:27,  1.36s/pipeline]

Generation 4 - Current best internal CV score: 0.7025641025641026


                                                                              

Generation 5 - Current best internal CV score: 0.7025641025641026

Best pipeline: LinearSVC(SelectFwe(input_matrix, alpha=0.021), C=25.0, dual=True, loss=hinge, penalty=l2, tol=0.001)
0.6666666666666666


Optimization Progress:  33%|███▎      | 40/120 [06:35<2:22:35, 106.94s/pipeline]

Generation 1 - Current best internal CV score: 0.6410256410256411


Optimization Progress:  50%|█████     | 60/120 [07:12<47:28, 47.47s/pipeline]   

Generation 2 - Current best internal CV score: 0.6410256410256411


Optimization Progress:  67%|██████▋   | 80/120 [07:22<17:14, 25.86s/pipeline]

Generation 3 - Current best internal CV score: 0.6666666666666667


Optimization Progress:  83%|████████▎ | 100/120 [07:48<05:13, 15.69s/pipeline]

Generation 4 - Current best internal CV score: 0.6666666666666667


                                                                              

Generation 5 - Current best internal CV score: 0.6717948717948719

Best pipeline: LogisticRegression(MinMaxScaler(LogisticRegression(input_matrix, C=0.1, dual=True, penalty=l2)), C=10.0, dual=True, penalty=l2)
0.6166666666666667


Optimization Progress:  33%|███▎      | 40/120 [00:06<02:16,  1.70s/pipeline]

Generation 1 - Current best internal CV score: 0.6717948717948717


Optimization Progress:  50%|█████     | 60/120 [00:14<01:35,  1.60s/pipeline]

Generation 2 - Current best internal CV score: 0.6717948717948717


Optimization Progress:  67%|██████▋   | 80/120 [00:30<01:32,  2.32s/pipeline]

Generation 3 - Current best internal CV score: 0.676923076923077


Optimization Progress:  83%|████████▎ | 100/120 [00:36<00:39,  1.98s/pipeline]

Generation 4 - Current best internal CV score: 0.676923076923077


                                                                              

Generation 5 - Current best internal CV score: 0.676923076923077

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.15000000000000002, min_samples_leaf=1, min_samples_split=13, n_estimators=100)
0.5833333333333334


Optimization Progress:  33%|███▎      | 40/120 [00:39<11:13,  8.42s/pipeline]

Generation 1 - Current best internal CV score: 0.6564102564102564


Optimization Progress:  50%|█████     | 60/120 [00:44<03:18,  3.32s/pipeline]

Generation 2 - Current best internal CV score: 0.6717948717948719


Optimization Progress:  67%|██████▋   | 80/120 [00:48<01:45,  2.63s/pipeline]

Generation 3 - Current best internal CV score: 0.6717948717948719


Optimization Progress:  83%|████████▎ | 100/120 [00:52<00:33,  1.65s/pipeline]

Generation 4 - Current best internal CV score: 0.6717948717948719


                                                                              

Generation 5 - Current best internal CV score: 0.6769230769230768

Best pipeline: GradientBoostingClassifier(LogisticRegression(input_matrix, C=15.0, dual=False, penalty=l1), learning_rate=0.1, max_depth=6, max_features=0.4, min_samples_leaf=8, min_samples_split=11, n_estimators=100, subsample=0.15000000000000002)
0.65


Optimization Progress:  33%|███▎      | 40/120 [01:29<26:46, 20.09s/pipeline]

Generation 1 - Current best internal CV score: 0.6102564102564102


Optimization Progress:  50%|█████     | 60/120 [01:38<10:38, 10.64s/pipeline]

Generation 2 - Current best internal CV score: 0.6102564102564102


Optimization Progress:  67%|██████▋   | 80/120 [01:58<07:22, 11.07s/pipeline]

Generation 3 - Current best internal CV score: 0.6410256410256411


Optimization Progress:  83%|████████▎ | 100/120 [02:02<01:32,  4.60s/pipeline]

Generation 4 - Current best internal CV score: 0.6410256410256411


                                                                              

Generation 5 - Current best internal CV score: 0.6410256410256411

Best pipeline: LinearSVC(DecisionTreeClassifier(RFE(StandardScaler(input_matrix), criterion=gini, max_features=0.2, n_estimators=100, step=0.3), criterion=entropy, max_depth=3, min_samples_leaf=2, min_samples_split=6), C=1.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.001)
0.5


Optimization Progress:  33%|███▎      | 40/120 [00:09<03:01,  2.27s/pipeline]

Generation 1 - Current best internal CV score: 0.6461538461538462


Optimization Progress:  50%|█████     | 60/120 [00:20<02:05,  2.09s/pipeline]

Generation 2 - Current best internal CV score: 0.6461538461538462


Optimization Progress:  67%|██████▋   | 80/120 [00:33<01:57,  2.95s/pipeline]

Generation 3 - Current best internal CV score: 0.6461538461538462


Optimization Progress:  83%|████████▎ | 100/120 [00:43<01:15,  3.75s/pipeline]

Generation 4 - Current best internal CV score: 0.6615384615384615


                                                                              

Generation 5 - Current best internal CV score: 0.6820512820512821

Best pipeline: LogisticRegression(ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.1, min_samples_leaf=15, min_samples_split=14, n_estimators=100), C=10.0, dual=False, penalty=l1)
0.6666666666666666


Optimization Progress:  33%|███▎      | 40/120 [00:15<05:52,  4.41s/pipeline]

Generation 1 - Current best internal CV score: 0.6


Optimization Progress:  50%|█████     | 60/120 [01:36<26:26, 26.44s/pipeline]

Generation 2 - Current best internal CV score: 0.6000000000000001


Optimization Progress:  67%|██████▋   | 80/120 [02:52<16:38, 24.97s/pipeline]

Generation 3 - Current best internal CV score: 0.6000000000000001


Optimization Progress:  83%|████████▎ | 100/120 [02:55<04:07, 12.36s/pipeline]

Generation 4 - Current best internal CV score: 0.6051282051282051


                                                                              

Generation 5 - Current best internal CV score: 0.6051282051282051

Best pipeline: BernoulliNB(GaussianNB(RobustScaler(input_matrix)), alpha=0.1, fit_prior=False)
0.45


Optimization Progress:  33%|███▎      | 40/120 [00:42<09:35,  7.20s/pipeline]

Generation 1 - Current best internal CV score: 0.6307692307692307


Optimization Progress:  50%|█████     | 60/120 [00:51<04:49,  4.82s/pipeline]

Generation 2 - Current best internal CV score: 0.6307692307692307


Optimization Progress:  67%|██████▋   | 80/120 [01:01<02:10,  3.27s/pipeline]

Generation 3 - Current best internal CV score: 0.6410256410256411


Optimization Progress:  83%|████████▎ | 100/120 [01:12<00:43,  2.19s/pipeline]

Generation 4 - Current best internal CV score: 0.6410256410256411


                                                                              

Generation 5 - Current best internal CV score: 0.6410256410256411

Best pipeline: LinearSVC(SelectFwe(input_matrix, alpha=0.011), C=5.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.0001)
0.6333333333333333


Optimization Progress:  33%|███▎      | 40/120 [01:59<28:21, 21.26s/pipeline]

Generation 1 - Current best internal CV score: 0.5897435897435896


Optimization Progress:  50%|█████     | 60/120 [03:43<29:09, 29.16s/pipeline]

Generation 2 - Current best internal CV score: 0.5897435897435896


Optimization Progress:  67%|██████▋   | 80/120 [04:12<16:25, 24.64s/pipeline]

Generation 3 - Current best internal CV score: 0.5897435897435896


Optimization Progress:  83%|████████▎ | 100/120 [04:24<04:25, 13.27s/pipeline]

Generation 4 - Current best internal CV score: 0.5897435897435896


                                                                              

Generation 5 - Current best internal CV score: 0.5897435897435896

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.5, max_depth=2, min_child_weight=17, n_estimators=100, nthread=1, subsample=0.8500000000000001)


  if diff:


0.55


Optimization Progress:  33%|███▎      | 40/120 [00:46<27:28, 20.61s/pipeline] 

Generation 1 - Current best internal CV score: 0.5538461538461539


Optimization Progress:  50%|█████     | 60/120 [01:00<14:02, 14.04s/pipeline]

Generation 2 - Current best internal CV score: 0.5564102564102564


Optimization Progress:  67%|██████▋   | 80/120 [01:05<02:32,  3.81s/pipeline]

Generation 3 - Current best internal CV score: 0.5564102564102564


Optimization Progress:  83%|████████▎ | 100/120 [01:35<03:49, 11.48s/pipeline]

Generation 4 - Current best internal CV score: 0.5564102564102564


                                                                              

Generation 5 - Current best internal CV score: 0.5641025641025641

Best pipeline: GaussianNB(LinearSVC(ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.2, min_samples_leaf=2, min_samples_split=18, n_estimators=100), C=0.1, dual=True, loss=hinge, penalty=l2, tol=0.1))
0.48333333333333334


Optimization Progress:  33%|███▎      | 40/120 [02:18<43:44, 32.81s/pipeline]  

Generation 1 - Current best internal CV score: 0.5794871794871794


Optimization Progress:  50%|█████     | 60/120 [05:27<1:07:26, 67.43s/pipeline]

Generation 2 - Current best internal CV score: 0.5923076923076923


Optimization Progress:  67%|██████▋   | 80/120 [05:59<28:19, 42.49s/pipeline]  

Generation 3 - Current best internal CV score: 0.5923076923076923


Optimization Progress:  83%|████████▎ | 100/120 [06:07<07:20, 22.02s/pipeline]

Generation 4 - Current best internal CV score: 0.5923076923076923


                                                                              

Generation 5 - Current best internal CV score: 0.5923076923076923

Best pipeline: GradientBoostingClassifier(SelectPercentile(PCA(input_matrix, iterated_power=7, svd_solver=randomized), percentile=65), learning_rate=0.001, max_depth=1, max_features=0.5, min_samples_leaf=13, min_samples_split=20, n_estimators=100, subsample=0.4)
0.5166666666666667


Optimization Progress:  34%|███▍      | 41/120 [05:10<2:03:07, 93.52s/pipeline]

Generation 1 - Current best internal CV score: 0.5923076923076922


Optimization Progress:  51%|█████     | 61/120 [05:23<46:49, 47.62s/pipeline]  

Generation 2 - Current best internal CV score: 0.5974358974358974


Optimization Progress:  68%|██████▊   | 81/120 [05:47<19:36, 30.17s/pipeline]

Generation 3 - Current best internal CV score: 0.5974358974358974


Optimization Progress:  84%|████████▍ | 101/120 [07:05<12:03, 38.05s/pipeline]

Generation 4 - Current best internal CV score: 0.5974358974358974


                                                                              

Generation 5 - Current best internal CV score: 0.6025641025641024

Best pipeline: RandomForestClassifier(LinearSVC(LogisticRegression(input_matrix, C=25.0, dual=False, penalty=l1), C=0.5, dual=False, loss=squared_hinge, penalty=l1, tol=0.1), bootstrap=True, criterion=gini, max_features=0.05, min_samples_leaf=15, min_samples_split=6, n_estimators=100)
0.6333333333333333


Optimization Progress:  33%|███▎      | 40/120 [00:32<15:02, 11.28s/pipeline]

Generation 1 - Current best internal CV score: 0.5615384615384615


Optimization Progress:  50%|█████     | 60/120 [00:56<09:04,  9.07s/pipeline]

Generation 2 - Current best internal CV score: 0.5641025641025641


Optimization Progress:  67%|██████▋   | 80/120 [01:10<03:47,  5.70s/pipeline]

Generation 3 - Current best internal CV score: 0.582051282051282


Optimization Progress:  83%|████████▎ | 100/120 [01:43<02:28,  7.44s/pipeline]

Generation 4 - Current best internal CV score: 0.582051282051282


                                                                              

Generation 5 - Current best internal CV score: 0.582051282051282

Best pipeline: GaussianNB(Normalizer(XGBClassifier(input_matrix, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100, nthread=1, subsample=0.7500000000000001), norm=l1))


  if diff:


0.5583333333333333


Optimization Progress:  33%|███▎      | 40/120 [01:42<36:01, 27.02s/pipeline] 

Generation 1 - Current best internal CV score: 0.5666666666666667


Optimization Progress:  50%|█████     | 60/120 [02:05<15:29, 15.49s/pipeline]

Generation 2 - Current best internal CV score: 0.5666666666666667


Optimization Progress:  67%|██████▋   | 80/120 [02:30<09:49, 14.74s/pipeline]

Generation 3 - Current best internal CV score: 0.5666666666666667


Optimization Progress:  83%|████████▎ | 100/120 [02:41<02:42,  8.12s/pipeline]

Generation 4 - Current best internal CV score: 0.5769230769230769


                                                                              

Generation 5 - Current best internal CV score: 0.5769230769230769

Best pipeline: LogisticRegression(StandardScaler(Normalizer(input_matrix, norm=l1)), C=0.5, dual=False, penalty=l2)
0.6333333333333333


Optimization Progress:  33%|███▎      | 40/120 [05:40<2:48:47, 126.59s/pipeline]

Generation 1 - Current best internal CV score: 0.5743589743589743


Optimization Progress:  50%|█████     | 60/120 [08:04<1:44:48, 104.81s/pipeline]

Generation 2 - Current best internal CV score: 0.5743589743589743


Optimization Progress:  67%|██████▋   | 80/120 [08:57<44:35, 66.88s/pipeline]   

Generation 3 - Current best internal CV score: 0.5743589743589743


Optimization Progress:  84%|████████▍ | 101/120 [14:00<21:36, 68.25s/pipeline]

Generation 4 - Current best internal CV score: 0.5820512820512821


                                                                              

Generation 5 - Current best internal CV score: 0.5820512820512821

Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.35000000000000003, min_samples_leaf=1, min_samples_split=20, n_estimators=100)
0.525


Optimization Progress:  33%|███▎      | 40/120 [01:57<52:08, 39.11s/pipeline]  

Generation 1 - Current best internal CV score: 0.5692307692307692


Optimization Progress:  50%|█████     | 60/120 [02:08<16:35, 16.60s/pipeline]

Generation 2 - Current best internal CV score: 0.5692307692307692


Optimization Progress:  67%|██████▋   | 80/120 [03:07<11:12, 16.81s/pipeline]

Generation 3 - Current best internal CV score: 0.582051282051282


Optimization Progress:  83%|████████▎ | 100/120 [03:24<03:34, 10.72s/pipeline]

Generation 4 - Current best internal CV score: 0.582051282051282


                                                                              

Generation 5 - Current best internal CV score: 0.582051282051282

Best pipeline: LinearSVC(SelectPercentile(input_matrix, percentile=21), C=25.0, dual=True, loss=hinge, penalty=l2, tol=0.1)
0.5


Optimization Progress:  33%|███▎      | 40/120 [00:28<09:51,  7.39s/pipeline]

Generation 1 - Current best internal CV score: 0.5487179487179488


Optimization Progress:  50%|█████     | 60/120 [00:47<06:35,  6.59s/pipeline]

Generation 2 - Current best internal CV score: 0.558974358974359


Optimization Progress:  68%|██████▊   | 81/120 [05:48<1:00:41, 93.38s/pipeline]

Generation 3 - Current best internal CV score: 0.558974358974359


Optimization Progress:  84%|████████▍ | 101/120 [05:57<10:53, 34.41s/pipeline] 

Generation 4 - Current best internal CV score: 0.558974358974359


                                                                              

Generation 5 - Current best internal CV score: 0.5692307692307692

Best pipeline: GaussianNB(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.9000000000000001, min_samples_leaf=1, min_samples_split=19, n_estimators=100))
0.5583333333333333


Optimization Progress:  33%|███▎      | 40/120 [03:56<1:12:32, 54.41s/pipeline]

Generation 1 - Current best internal CV score: 0.5769230769230769


Optimization Progress:  50%|█████     | 60/120 [06:28<1:00:50, 60.84s/pipeline]

Generation 2 - Current best internal CV score: 0.582051282051282


Optimization Progress:  67%|██████▋   | 80/120 [07:08<27:45, 41.63s/pipeline]  

Generation 3 - Current best internal CV score: 0.582051282051282


Optimization Progress:  83%|████████▎ | 100/120 [07:20<07:21, 22.10s/pipeline]

Generation 4 - Current best internal CV score: 0.582051282051282


                                                                              

Generation 5 - Current best internal CV score: 0.6000000000000001

Best pipeline: LogisticRegression(BernoulliNB(input_matrix, alpha=1.0, fit_prior=False), C=1.0, dual=False, penalty=l1)
0.6166666666666667




In [15]:
offset = z_data.columns.get_loc('aggr_2_groups0')

collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):
    # group the data set
    #=============================================================================================================
    columnsForGrouping = [groupingColumn1,groupingColumn2,z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================

    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)
    
    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'
    #=============================================================================================================
    #do the learning
    # TODO: extract method
    with open(folder + fileName) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    # content = [x.strip() for x in content] 

    #  or 'exported_pipeline = ' not in line
    cleanedContent = []
    for line in content:
        if 'tpot_data' not in line and 'training_target, testing_target' not in line:
            cleanedContent.append(line)
    #         content.remove(line)
    
    fileNameCleaned = folder + 'cleaned_' + fileName
    with open(fileNameCleaned, 'w') as filehandle:  
        for line in cleanedContent:
            filehandle.write('%s\n' % line)
            
    %run -i $fileNameCleaned
    
    # print the accuracy
    # TODO: extract method
    # TODO: print all accurracy values to one file
    num_matches = 0;
    for a, b in zip(testing_target, results):
        if a == b:
            num_matches = num_matches + 1
        else:
            unmatched.append((a,b))
    print('Number of matches:',num_matches,'(of',testing_target.size,')')

    accuracy = num_matches/testing_target.size*100
    print('Accuary: ',accuracy)
    #=============================================================================================================
    
    #=============================================================================================================
    #collect all accuracy values to plot them
    collectAccuracies.append(accuracy)

    offset = offset + 1

Number of matches: 18 (of 24 )
Accuary:  75.0


  if diff:


Number of matches: 12 (of 24 )
Accuary:  50.0
Number of matches: 19 (of 24 )
Accuary:  79.16666666666666
Number of matches: 12 (of 24 )
Accuary:  50.0


  if diff:


Number of matches: 18 (of 24 )
Accuary:  75.0
Number of matches: 13 (of 24 )
Accuary:  54.166666666666664
Number of matches: 16 (of 24 )
Accuary:  66.66666666666666
Number of matches: 18 (of 24 )
Accuary:  75.0
Number of matches: 16 (of 24 )
Accuary:  66.66666666666666
Number of matches: 15 (of 24 )
Accuary:  62.5
Number of matches: 25 (of 48 )
Accuary:  52.083333333333336
Number of matches: 22 (of 48 )
Accuary:  45.83333333333333
Number of matches: 31 (of 48 )
Accuary:  64.58333333333334
Number of matches: 23 (of 48 )
Accuary:  47.91666666666667
Number of matches: 28 (of 48 )
Accuary:  58.333333333333336
Number of matches: 38 (of 48 )
Accuary:  79.16666666666666
Number of matches: 27 (of 48 )
Accuary:  56.25
Number of matches: 22 (of 48 )
Accuary:  45.83333333333333
Number of matches: 27 (of 48 )
Accuary:  56.25
Number of matches: 26 (of 48 )
Accuary:  54.166666666666664


  if diff:


Number of matches: 38 (of 60 )
Accuary:  63.33333333333333
Number of matches: 33 (of 60 )
Accuary:  55.00000000000001
Number of matches: 39 (of 60 )
Accuary:  65.0
Number of matches: 37 (of 60 )
Accuary:  61.66666666666667
Number of matches: 35 (of 60 )
Accuary:  58.333333333333336
Number of matches: 39 (of 60 )
Accuary:  65.0
Number of matches: 35 (of 60 )
Accuary:  58.333333333333336
Number of matches: 42 (of 60 )
Accuary:  70.0
Number of matches: 27 (of 60 )
Accuary:  45.0
Number of matches: 37 (of 60 )
Accuary:  61.66666666666667


  if diff:


Number of matches: 66 (of 120 )
Accuary:  55.00000000000001
Number of matches: 57 (of 120 )
Accuary:  47.5
Number of matches: 67 (of 120 )
Accuary:  55.833333333333336
Number of matches: 75 (of 120 )
Accuary:  62.5


  if diff:
  if diff:


Number of matches: 67 (of 120 )
Accuary:  55.833333333333336
Number of matches: 76 (of 120 )
Accuary:  63.33333333333333
Number of matches: 61 (of 120 )
Accuary:  50.83333333333333
Number of matches: 60 (of 120 )
Accuary:  50.0
Number of matches: 70 (of 120 )
Accuary:  58.333333333333336
Number of matches: 74 (of 120 )
Accuary:  61.66666666666667


In [16]:
print(len(unmatched))
countComprehensionRest = 0
countComprehensionSyntax = 0
countRestSyntax = 0
for x in unmatched:
    if x == (0,1) or x == (1,0):
        countComprehensionRest = countComprehensionRest + 1
    if x == (0,2) or x == (2,0):
        countComprehensionSyntax = countComprehensionSyntax + 1
    if x == (1,2) or x == (2,1):
        countRestSyntax = countRestSyntax + 1
print('Comprehension/Rest',countComprehensionRest)
print('Comprehension/Syntax',countComprehensionSyntax)
print('Rest/Syntax',countRestSyntax)

1059
Comprehension/Rest 217
Comprehension/Syntax 444
Rest/Syntax 398


In [17]:
#input: alist: the list to split, wanted_parts: into how many parts it should be split
#returns: all split lists
def split_list(alist, wanted_parts=1):
    length = len(alist)
    return [ alist[i*length // wanted_parts: (i+1)*length // wanted_parts] 
             for i in range(wanted_parts) ]


In [18]:
list1, list2, list3, list4 = split_list(collectAccuracies,numAggLevels)
frames = [list1, list2, list3, list4]
collectedAccuracies = pd.DataFrame(frames).T

collectedAccuracies.columns = ['2_groups_deact','4_groups_deact','5_groups_deact','10_groups_deact']

collectedAccuracies.to_csv('triangulate_Deact.csv',sep=';',decimal='.', index=False)