In [30]:
import os
import pdb

import numpy as np
# Pandas is used for data manipulation
import pandas as pd
from tpot import TPOTClassifier

# Load the z-score table (';'-separated fields, '.' as the decimal mark)
z_data = pd.read_csv('fse17_act_deact_zscore_groups.csv', sep=';', decimal='.')

# Discard every column whose name matches '_deact'
z_data = z_data.drop(list(z_data.filter(regex='_deact')), axis=1)

# Discard the 'scan', 'trial', 'response' and 'snippet' columns
# (described in the original note as the non-numeric ones)
z_data = z_data.drop(['scan', 'trial', 'response', 'snippet'], axis=1)

# Layout of the participant-wise train/test split
numLabels = 2      # number of different labels (comprehension and rest)
sizeTrainSet = 11  # number of participants used for training
sizeTestSet = 3    # number of participants used for testing
numParticipants = sizeTrainSet + sizeTestSet

In [2]:
def prepare(data, groupingColumns):
    """Average ``data`` within each group and return the result with its labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the grouping columns and the feature columns.
    groupingColumns : str or list of str
        Column(s) to group by. Must include ``'task'``, which supplies the
        labels.

    Returns
    -------
    groupedAgg : pandas.DataFrame
        Per-group mean of every numeric column, indexed by ``groupingColumns``.
    labels : pandas.Index
        The ``'task'`` level of ``groupedAgg``'s index, one entry per row.

    Raises
    ------
    KeyError
        If a grouping column is missing from ``data`` or ``'task'`` is not one
        of the grouping columns.
    """
    # numeric_only=True leaves out any remaining text columns (for instance
    # grouping-label columns that are not part of this grouping). Older pandas
    # dropped them silently; pandas 2.x raises TypeError on them, and passing
    # np.mean to aggregate() emits a FutureWarning in recent releases.
    groupedAgg = data.groupby(groupingColumns).mean(numeric_only=True)
    labels = groupedAgg.index.get_level_values(level='task')
    return groupedAgg, labels

In [3]:
#split positionally into a training part (rows before `low`) and a testing part (rows `low` up to `high`)
#return all four sets (much like train_test_split from sklearn)
#TODO: introduce variation (see file coarseAverageParticipantSplit)

def split(features, labels, low, high):
    """Split features and labels by row position into training and testing sets.

    Parameters
    ----------
    features : array-like
        Feature table with one row per sample (a DataFrame or a 2-D array).
    labels : array-like
        One label per row of ``features``, in the same order.
    low : int
        Number of leading rows that form the training set.
    high : int
        End (exclusive) of the testing rows; rows ``low`` up to ``high`` are
        used for testing. Values beyond the number of rows are truncated by
        the slice rather than raising.

    Returns
    -------
    tuple of numpy.ndarray
        ``(training_features, testing_features, training_target, testing_target)``.
    """
    # np.array copies its input, so the returned slices never alias the
    # caller's DataFrame or array.
    featureMatrix = np.array(features)
    labelVector = np.array(labels)

    training_features = featureMatrix[:low]
    testing_features = featureMatrix[low:high]
    training_target = labelVector[:low]
    testing_target = labelVector[low:high]

    return training_features, testing_features, training_target, testing_target

In [4]:
# expects column names shaped like xxx_number_xxx, e.g., aggr_13_groups0
def findIndices(name):
    """Derive the train/test row boundaries from a grouping column's name.

    The second '_'-separated token of ``name`` is parsed as the number of
    groups. Together with the notebook-level ``numLabels``, ``sizeTrainSet``
    and ``numParticipants`` it yields ``(low, high)``: the row count for the
    training participants and the row count for all participants.
    """
    groupCount = int(name.split('_')[1])
    # presumably one aggregated row per (group, label) pair for each participant
    rowsPerParticipant = groupCount * numLabels
    return sizeTrainSet * rowsPerParticipant, numParticipants * rowsPerParticipant

In [34]:
# Sanity check: group, index and split for one grouping column
colName = 'aggr_13_groups0'
columnsForGrouping = ['proband', 'task', colName]

features, labels = prepare(z_data, columnsForGrouping)
offset = z_data.columns.get_loc(colName)
low, high = findIndices(colName)
(training_features, testing_features,
 training_target, testing_target) = split(features, labels, low, high)

# Report the column position, the split boundaries and the resulting shapes
print(offset)
print(low, high)
print(testing_features.shape, features.shape, labels.shape, sep='\n')
print(columnsForGrouping)
# (make `features` the last expression of the cell to display the grouped table)

422
286 364
(78, 390)
(364, 390)
(364,)
['proband', 'task', 'aggr_13_groups0']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,68-177-121_act,68-180-121_act,68-183-121_act,68-180-124_act,68-183-124_act,71-180-118_act,71-183-118_act,71-174-121_act,71-177-121_act,71-180-121_act,...,188-177-130_act,188-165-133_act,188-168-133_act,188-171-133_act,188-174-133_act,188-168-136_act,188-171-136_act,188-174-136_act,191-168-130_act,191-168-133_act
proband,task,aggr_13_groups0,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ab87,0,eightth,-0.091628,-0.281788,-0.182527,-0.166311,-0.118958,-0.125351,-0.287746,0.101655,-0.035068,-0.184434,...,0.743780,0.477535,0.591160,0.293833,0.271066,0.260337,-0.071768,0.082850,0.208056,0.683637
ab87,0,eleventh,0.539451,0.328748,-0.143597,0.584192,0.449952,0.449797,0.252958,0.511686,0.484109,0.557964,...,0.482630,0.304288,0.409621,0.494923,0.737658,0.717590,0.706776,0.667110,0.396914,0.335798
ab87,0,fifth,0.280249,0.074099,-0.014945,0.158840,0.068058,-0.012596,-0.308264,0.421223,0.378013,0.123488,...,0.102181,0.016480,0.177819,0.194745,0.282338,0.373031,0.230366,0.175766,-0.067923,0.241907
ab87,0,first,-0.137228,-0.284458,-0.277643,-0.249412,-0.393664,-0.177276,-0.185980,-0.005965,-0.042855,-0.120884,...,-0.108709,0.069928,0.198410,0.335255,0.234429,0.134803,0.134940,0.187372,0.105219,0.179252
ab87,0,fourth,0.065869,-0.178433,-0.278951,0.006189,0.029882,-0.049537,-0.176131,0.309637,0.147574,0.031403,...,0.600929,0.389877,0.450383,0.423211,0.509404,0.433237,0.301734,0.378737,0.474401,0.445122
ab87,0,nineth,0.277469,0.095169,0.173131,0.390289,0.465048,0.166675,-0.119229,0.531553,0.352368,0.212589,...,0.881629,0.702582,0.702925,0.514728,0.571811,0.394813,0.259908,0.516236,0.357959,0.574406
ab87,0,second,-0.019306,-0.186899,-0.336380,0.031832,0.010664,-0.054544,-0.202120,0.115360,-0.036239,-0.109637,...,0.376954,0.473849,0.650774,0.680738,0.643808,0.597616,0.606674,0.560794,0.439080,0.410559
ab87,0,seventh,0.308541,0.038164,-0.214396,0.114462,-0.051343,-0.244758,-0.519731,0.274507,0.365483,0.082978,...,-0.547577,-0.236795,-0.180673,0.045885,0.161742,0.180603,0.294083,0.358398,-0.279258,-0.232351
ab87,0,sixth,0.329704,0.496373,0.275014,0.533361,0.438054,0.344873,0.277442,0.394557,0.356935,0.602842,...,0.083247,0.070122,-0.140661,0.281860,0.538419,0.101671,0.508952,0.693486,-0.256359,-0.398231
ab87,0,tenth,0.164839,-0.057519,-0.084370,0.147580,0.091959,0.019482,-0.080930,0.181972,0.183037,-0.038881,...,0.257424,0.257532,0.501515,0.385414,0.400223,0.492947,0.378458,0.379112,0.174594,0.381265


In [8]:
# Columns to group by; the third grouping column is chosen per iteration below
groupingColumn1 = 'proband'
groupingColumn2 = 'task'
groupingColumn3 = ''

# How many different variants of aggregation and aggregation levels
# (not referenced by the loop in this cell)
numVariationsPerAggLevel = 10
numAggLevels = 4
numAggregationVariants = numVariationsPerAggLevel * numAggLevels

# Fixed seed so that TPOT's stochastic pipeline search is repeatable
tpotRandomState = 42

# Exported pipelines are written here; create the directory up front so the
# export cannot fail after a long fit
folder = 'Act/'
os.makedirs(folder, exist_ok=True)

# Index of the first column that contains grouping labels; every column from
# this one to the end of the table is treated as a grouping column
firstGroupingOffset = z_data.columns.get_loc('aggr_2_groups0')
print(firstGroupingOffset)

# Test-set score of the best pipeline, one entry per grouping column, in column order
collectAccuracies = []

# NOTE(review): nothing in this cell fills this list; original note read
# "collect all instances where unmatched is not possible" - confirm intent
unmatched = []

# For each grouping column:
#   group the data set, convert to numpy arrays and split into training and
#   testing sets, run TPOT, score the best pipeline and export it
for offset in range(firstGroupingOffset, len(z_data.columns)):
    groupingColumn = z_data.columns[offset]

    # group the data set
    columnsForGrouping = [groupingColumn1, groupingColumn2, groupingColumn]
    features, labels = prepare(z_data, columnsForGrouping)

    # find the split boundaries, then convert to numpy.ndarray and split
    low, high = findIndices(groupingColumn)
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)

    print(offset)
    print(low, high)

    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + groupingColumn + '.py'

    # run TPOT
    # TODO: extract method
    tpot = TPOTClassifier(generations=5, population_size=5, verbosity=2, n_jobs=20,
                          max_eval_time_mins=1, random_state=tpotRandomState)
    tpot.fit(training_features, training_target)

    # keep the score as well as printing it, so results can be compared afterwards
    score = tpot.score(testing_features, testing_target)
    print(score)
    collectAccuracies.append(score)

    tpot.export(folder + fileName)


392
392
44 56


  return f(*args, **kwds)
                                                                                                                       

Generation 1 - Current best internal CV score: 0.95


                                                                                                                       

Generation 2 - Current best internal CV score: 0.975


                                                                                                                       

Generation 3 - Current best internal CV score: 0.975


                                                                                                                       

Generation 4 - Current best internal CV score: 0.975


                                                                                                                       

Generation 5 - Current best internal CV score: 0.975


                                                          


Best pipeline: GaussianNB(input_matrix)
1.0
393
44 56


                                                                                                                       

Generation 1 - Current best internal CV score: 1.0


                                                                                                                       

Generation 2 - Current best internal CV score: 1.0


                                                                                                                       

Generation 3 - Current best internal CV score: 1.0


                                                                                                                       

Generation 4 - Current best internal CV score: 1.0


                                                                                                                       

Generation 5 - Current best internal CV score: 1.0


                                                                                                                       


Best pipeline: GradientBoostingClassifier(PCA(input_matrix, iterated_power=3, svd_solver=randomized), learning_rate=0.5, max_depth=6, max_features=0.2, min_samples_leaf=13, min_samples_split=13, n_estimators=100, subsample=0.8500000000000001)
1.0
394
44 56


                                                                                                                       

Generation 1 - Current best internal CV score: 0.975


                                                                                                                       

Generation 2 - Current best internal CV score: 0.975


                                                                                                                       

Generation 3 - Current best internal CV score: 0.975


                                                                                                                       

Generation 4 - Current best internal CV score: 0.975


                                                                                                                       

Generation 5 - Current best internal CV score: 0.975


                                                                                                                       


Best pipeline: DecisionTreeClassifier(input_matrix, criterion=entropy, max_depth=8, min_samples_leaf=1, min_samples_split=8)
0.9166666666666666
395
44 56


                                                                                                                       

Generation 1 - Current best internal CV score: 0.975


                                                                                                                       

Generation 2 - Current best internal CV score: 0.975


                                                                                                                       

Generation 3 - Current best internal CV score: 0.975


                                                                                                                       

Generation 4 - Current best internal CV score: 0.975


                                                                                                                       

Generation 5 - Current best internal CV score: 0.975


                                                                                                                       


Best pipeline: BernoulliNB(input_matrix, alpha=1.0, fit_prior=False)
1.0
396
44 56


                                                                                                                       

Generation 1 - Current best internal CV score: 0.975


                                                                                                                       

Generation 2 - Current best internal CV score: 0.975


                                                                                                                       

Generation 3 - Current best internal CV score: 0.975


                                                                                                                       

Generation 4 - Current best internal CV score: 0.975


                                                                                                                       

Generation 5 - Current best internal CV score: 0.975


                                                                                                                       


Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.8, min_samples_leaf=10, min_samples_split=8, n_estimators=100)
0.8333333333333334
397
44 56


                                                                                                                       

Generation 1 - Current best internal CV score: 1.0


                                                                                                                       

Generation 2 - Current best internal CV score: 1.0


                                                                                                                       

Generation 3 - Current best internal CV score: 1.0


                                                                                                                       

Generation 4 - Current best internal CV score: 1.0


                                                                                                                       

Generation 5 - Current best internal CV score: 1.0


                                                                                                                       


Best pipeline: LinearSVC(input_matrix, C=1.0, dual=False, loss=squared_hinge, penalty=l2, tol=1e-05)
1.0
398
44 56


                                                                                                                       

Generation 1 - Current best internal CV score: 0.95


                                                                                                                       

Generation 2 - Current best internal CV score: 0.95


                                                                                                                       

Generation 3 - Current best internal CV score: 1.0


                                                                                                                       

Generation 4 - Current best internal CV score: 1.0


                                                                                                                       

Generation 5 - Current best internal CV score: 1.0


                                                                                                                       


Best pipeline: RandomForestClassifier(PCA(input_matrix, iterated_power=7, svd_solver=randomized), bootstrap=False, criterion=entropy, max_features=0.6000000000000001, min_samples_leaf=7, min_samples_split=7, n_estimators=100)
1.0
399
44 56


                                                                                                                       

Generation 1 - Current best internal CV score: 1.0


                                                                                                                       

Generation 2 - Current best internal CV score: 1.0


                                                                                                                       

Generation 3 - Current best internal CV score: 1.0


                                                                                                                       

Generation 4 - Current best internal CV score: 1.0


                                                                                                                       

Generation 5 - Current best internal CV score: 1.0


                                                                                                                       


Best pipeline: DecisionTreeClassifier(input_matrix, criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=16)
0.6666666666666666
400
44 56


                                                                                                                       

Generation 1 - Current best internal CV score: 0.975


                                                                                                                       

Generation 2 - Current best internal CV score: 0.975


                                                                                                                       

Generation 3 - Current best internal CV score: 0.975


                                                                                                                       

Generation 4 - Current best internal CV score: 0.975


                                                                                                                       

Generation 5 - Current best internal CV score: 0.975


                                                                                                                       


Best pipeline: GaussianNB(input_matrix)
1.0
401
44 56


                                                                                                                       

Generation 1 - Current best internal CV score: 1.0


                                                                                                                       

Generation 2 - Current best internal CV score: 1.0


                                                                                                                       

Generation 3 - Current best internal CV score: 1.0


                                                                                                                       

Generation 4 - Current best internal CV score: 1.0


                                                                                                                       

Generation 5 - Current best internal CV score: 1.0


                                                                                                                       


Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.05, min_samples_leaf=4, min_samples_split=19, n_estimators=100)
1.0
402
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9428571428571428


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9428571428571428


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9428571428571428


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9428571428571428


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9714285714285715


                                                                                                                       


Best pipeline: LinearSVC(input_matrix, C=0.001, dual=True, loss=hinge, penalty=l2, tol=0.1)
1.0
403
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9714285714285715


                                                                                                                       


Best pipeline: GaussianNB(input_matrix)
1.0
404
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9857142857142858


                                                                                                                       


Best pipeline: LinearSVC(KNeighborsClassifier(input_matrix, n_neighbors=20, p=2, weights=uniform), C=15.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.0001)
1.0
405
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9857142857142858


                                                                                                                       


Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=8, p=2, weights=distance)
1.0
406
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9857142857142858


                                                                                                                       


Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=19, p=2, weights=distance)
1.0
407
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9857142857142858


                                                                                                                       


Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=11, p=1, weights=distance)
1.0
408
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9571428571428571


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9571428571428571


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9571428571428571


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9571428571428571


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9571428571428571


                                                                                                                       


Best pipeline: BernoulliNB(input_matrix, alpha=100.0, fit_prior=True)
0.9444444444444444
409
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9714285714285715


                                                                                                                       


Best pipeline: BernoulliNB(input_matrix, alpha=10.0, fit_prior=True)
1.0
410
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9285714285714285


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9428571428571428


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9428571428571428


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9571428571428571


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9571428571428571


                                                                                                                       


Best pipeline: BernoulliNB(input_matrix, alpha=1.0, fit_prior=False)
1.0
411
66 84


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9714285714285715


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9857142857142858


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9857142857142858


                                                                                                                       


Best pipeline: ExtraTreesClassifier(BernoulliNB(RobustScaler(input_matrix), alpha=10.0, fit_prior=True), bootstrap=False, criterion=entropy, max_features=0.25, min_samples_leaf=17, min_samples_split=13, n_estimators=100)
1.0
412
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9349999999999999


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9349999999999999


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9349999999999999


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9349999999999999


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9349999999999999


                                                                                                                       


Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.05, min_samples_leaf=1, min_samples_split=2, n_estimators=100)
0.9444444444444444
413
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.8989473684210527


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9142105263157895


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9142105263157895


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9142105263157895


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9142105263157895


                                                                                                                       


Best pipeline: LinearSVC(input_matrix, C=0.001, dual=True, loss=hinge, penalty=l2, tol=0.01)
0.9444444444444444
414
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.8939473684210526


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9042105263157897


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9042105263157897


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9042105263157897


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9042105263157897


                                                                                                                       


Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=64, p=2, weights=uniform)
0.9629629629629629
415
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9144736842105263


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9144736842105263


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9347368421052632


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9347368421052632


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9347368421052632


                                                                                                                       


Best pipeline: KNeighborsClassifier(KNeighborsClassifier(input_matrix, n_neighbors=8, p=2, weights=distance), n_neighbors=43, p=2, weights=uniform)
0.9444444444444444
416
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9199999999999999


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9199999999999999


                                                                                                                       

Generation 3 - Current best internal CV score: 0.925


                                                                                                                       

Generation 4 - Current best internal CV score: 0.925


                                                                                                                       

Generation 5 - Current best internal CV score: 0.925


                                                                                                                       


Best pipeline: LogisticRegression(input_matrix, C=10.0, dual=False, penalty=l2)
1.0
417
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.8939473684210526


                                                                                                                       

Generation 2 - Current best internal CV score: 0.945


                                                                                                                       

Generation 3 - Current best internal CV score: 0.945


                                                                                                                       

Generation 4 - Current best internal CV score: 0.945


                                                                                                                       

Generation 5 - Current best internal CV score: 0.945


                                                                                                                       


Best pipeline: DecisionTreeClassifier(LogisticRegression(input_matrix, C=5.0, dual=True, penalty=l2), criterion=entropy, max_depth=3, min_samples_leaf=10, min_samples_split=10)
0.9444444444444444
418
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9197368421052632


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9197368421052632


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9197368421052632


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9199999999999999


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9199999999999999


                                                          


Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=4, p=1, weights=distance)
0.9629629629629629
419
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9347368421052632


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9347368421052632


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9347368421052632


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9347368421052632


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9347368421052632


                                                                                                                       


Best pipeline: LogisticRegression(input_matrix, C=10.0, dual=True, penalty=l2)
0.9629629629629629
420
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.8942105263157896


                                                                                                                       

Generation 2 - Current best internal CV score: 0.8942105263157896


                                                                                                                       

Generation 3 - Current best internal CV score: 0.8942105263157896


                                                                                                                       

Generation 4 - Current best internal CV score: 0.8942105263157896


                                                                                                                       

Generation 5 - Current best internal CV score: 0.8942105263157896


                                                                                                                       


Best pipeline: LinearSVC(input_matrix, C=1.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.001)
0.9629629629629629
421
198 252


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9247368421052633


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9247368421052633


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9247368421052633


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9247368421052633


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9400000000000001


                                                          


Best pipeline: LogisticRegression(CombineDFs(input_matrix, OneHotEncoder(BernoulliNB(input_matrix, alpha=0.1, fit_prior=True), minimum_fraction=0.05, sparse=False)), C=10.0, dual=False, penalty=l2)
0.9259259259259259
422
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.8852216748768473


                                                                                                                       

Generation 2 - Current best internal CV score: 0.8852216748768473


                                                                                                                       

Generation 3 - Current best internal CV score: 0.8852216748768473


                                                                                                                       

Generation 4 - Current best internal CV score: 0.8852216748768473


                                                                                                                       

Generation 5 - Current best internal CV score: 0.8852216748768473


                                                          


Best pipeline: GaussianNB(input_matrix)
0.8589743589743589
423
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.8708128078817733


                                                                                                                       

Generation 2 - Current best internal CV score: 0.8708128078817733


                                                                                                                       

Generation 3 - Current best internal CV score: 0.8708128078817733


                                                                                                                       

Generation 4 - Current best internal CV score: 0.8959359605911329


                                                                                                                       

Generation 5 - Current best internal CV score: 0.8959359605911329


                                                                                                                       


Best pipeline: LogisticRegression(input_matrix, C=1.0, dual=False, penalty=l1)
0.8846153846153846
424
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.882512315270936


                                                                                                                       

Generation 2 - Current best internal CV score: 0.8885467980295567


                                                                                                                       

Generation 3 - Current best internal CV score: 0.8885467980295567


                                                                                                                       

Generation 4 - Current best internal CV score: 0.8955665024630541


                                                                                                                       

Generation 5 - Current best internal CV score: 0.8955665024630541


                                                                                                                       


Best pipeline: LinearSVC(input_matrix, C=20.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.1)
0.9358974358974359
425
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9273399014778325


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9273399014778325


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9273399014778325


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9307881773399014


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9307881773399014


                                                                                                                       


Best pipeline: LogisticRegression(input_matrix, C=0.1, dual=False, penalty=l2)
0.9230769230769231
426
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.902832512315271


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9062807881773398


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9098522167487685


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9167487684729064


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9167487684729064


                                                                                                                       


Best pipeline: KNeighborsClassifier(RFE(SelectPercentile(input_matrix, percentile=92), criterion=entropy, max_features=0.5, n_estimators=100, step=0.7500000000000001), n_neighbors=61, p=2, weights=distance)
0.9487179487179487
427
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.8958128078817735


                                                                                                                       

Generation 2 - Current best internal CV score: 0.8958128078817735


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9064039408866995


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9064039408866995


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9064039408866995


                                                                                                                       


Best pipeline: KNeighborsClassifier(LogisticRegression(input_matrix, C=0.001, dual=False, penalty=l2), n_neighbors=6, p=1, weights=distance)
0.8974358974358975
428
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.8922413793103449


                                                                                                                       

Generation 2 - Current best internal CV score: 0.8923645320197044


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9064039408866995


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9064039408866995


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9097290640394089


                                                                                                                       


Best pipeline: LogisticRegression(GaussianNB(DecisionTreeClassifier(input_matrix, criterion=entropy, max_depth=6, min_samples_leaf=16, min_samples_split=2)), C=5.0, dual=False, penalty=l2)
0.9358974358974359
429
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9131773399014778


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9131773399014778


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9131773399014778


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9167487684729064


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9167487684729064


                                                                                                                       


Best pipeline: LogisticRegression(input_matrix, C=0.5, dual=True, penalty=l2)
0.9615384615384616
430
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.9199507389162562


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9233990147783253


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9269704433497538


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9269704433497538


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9304187192118226


                                                                                                                       


Best pipeline: LinearSVC(MaxAbsScaler(input_matrix), C=15.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.0001)
0.9230769230769231
431
286 364


                                                                                                                       

Generation 1 - Current best internal CV score: 0.8995073891625616


                                                                                                                       

Generation 2 - Current best internal CV score: 0.9066502463054187


                                                                                                                       

Generation 3 - Current best internal CV score: 0.9066502463054187


                                                                                                                       

Generation 4 - Current best internal CV score: 0.9066502463054187


                                                                                                                       

Generation 5 - Current best internal CV score: 0.9066502463054187


                                                                                                                       


Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=20, p=2, weights=uniform)
0.9743589743589743


In [9]:
#get the index of the starting column that contains the labels of the groupings
offset = z_data.columns.get_loc('aggr_2_groups0')
print(offset)
collectAccuracies = []

#collect all instances where unmatched is not possible.
unmatched = []

#loop through the columns
#for each column, group the data set
# transform it to numpy.ndarray and split into training and validation set
# run t_pot
# and do the learning
while offset < len(z_data.columns):    

    #=============================================================================================================
    columnsForGrouping = ['proband','task',z_data.columns[offset]]
    features, labels = prepare(z_data,columnsForGrouping)
    #=============================================================================================================
    #=============================================================================================================
    # transform to numpy.ndarray and do split
    # find low and high
    low, high = findIndices(z_data.columns[offset])
    training_features, testing_features, training_target, testing_target = split(features, labels, low, high)
    #=============================================================================================================
    
    fileName = 'tpot_mnist_pipeline_triangulateAggregationLevelParticipantSplit' + z_data.columns[offset] + '.py'

    #=============================================================================================================
    #do the learning
    # TODO: extract method
    with open(folder + fileName) as f:
        content = f.readlines()
    cleanedContent = []
    for line in content:
        if 'tpot_data' not in line and 'training_target, testing_target' not in line:
            cleanedContent.append(line)
    
    fileNameCleaned = folder + 'cleaned_' + fileName
    with open(fileNameCleaned, 'w') as filehandle:  
        for line in cleanedContent:
            filehandle.write('%s\n' % line)
            
    %run -i $fileNameCleaned
    
    # print the accuracy
    # TODO: extract method
    # TODO: print all accurracy values to one file
    num_matches = 0;
    for a, b in zip(testing_target, results):
        if a == b:
            num_matches = num_matches + 1
        else:
            unmatched.append((a,b))
    print('Number of matches:',num_matches,'(of',testing_target.size,')')

    accuracy = num_matches/testing_target.size*100
    print('Accuary: ',accuracy)
    #=============================================================================================================
    
    #=============================================================================================================
    #collect all accuracy values to plot them
    collectAccuracies.append(accuracy)
    
    
    offset = offset + 1
    

392
Number of matches: 12 (of 12 )
Accuary:  100.0
Number of matches: 12 (of 12 )
Accuary:  100.0
Number of matches: 11 (of 12 )
Accuary:  91.66666666666666
Number of matches: 12 (of 12 )
Accuary:  100.0
Number of matches: 9 (of 12 )
Accuary:  75.0
Number of matches: 12 (of 12 )
Accuary:  100.0
Number of matches: 12 (of 12 )
Accuary:  100.0
Number of matches: 8 (of 12 )
Accuary:  66.66666666666666
Number of matches: 12 (of 12 )
Accuary:  100.0
Number of matches: 12 (of 12 )
Accuary:  100.0
Number of matches: 18 (of 18 )
Accuary:  100.0
Number of matches: 18 (of 18 )
Accuary:  100.0
Number of matches: 18 (of 18 )
Accuary:  100.0
Number of matches: 18 (of 18 )
Accuary:  100.0
Number of matches: 18 (of 18 )
Accuary:  100.0
Number of matches: 18 (of 18 )
Accuary:  100.0
Number of matches: 17 (of 18 )
Accuary:  94.44444444444444
Number of matches: 18 (of 18 )
Accuary:  100.0
Number of matches: 18 (of 18 )
Accuary:  100.0
Number of matches: 18 (of 18 )
Accuary:  100.0
Number of matches: 51 (

In [10]:
# Number of misclassified test samples collected in `unmatched`.
len(unmatched)

92

In [11]:
# Tally the misclassified label pairs in `unmatched` by which two labels were
# mixed up, regardless of direction. Judging by the counter names the label
# codes are 0 = comprehension, 1 = rest, 2 = syntax (TODO confirm).
countComprehensionRest = countComprehensionSyntax = countRestSyntax = 0
for pair in unmatched:
    if pair in ((0, 1), (1, 0)):
        countComprehensionRest += 1
    elif pair in ((0, 2), (2, 0)):
        countComprehensionSyntax += 1
    elif pair in ((1, 2), (2, 1)):
        countRestSyntax += 1

print('Comprehension/Rest', countComprehensionRest)
print('Comprehension/Syntax', countComprehensionSyntax)
print('Rest/Syntax', countRestSyntax)

Comprehension/Rest 92
Comprehension/Syntax 0
Rest/Syntax 0


In [12]:
def split_list(alist, wanted_parts=1):
    """Cut a sequence into consecutive slices of (nearly) equal length.

    Slice number k covers the index range
    [k * len(alist) // wanted_parts, (k + 1) * len(alist) // wanted_parts),
    so the slices are contiguous, keep the original order, and together
    cover the whole sequence.

    Parameters:
        alist: any sliceable sequence with a length.
        wanted_parts: number of slices to produce; values below 1 yield
            an empty list.

    Returns:
        A list with `wanted_parts` slices of `alist`.
    """
    total = len(alist)
    parts = []
    for k in range(wanted_parts):
        start = k * total // wanted_parts
        stop = (k + 1) * total // wanted_parts
        parts.append(alist[start:stop])
    return parts

In [13]:
# Cut the collected accuracies into four consecutive chunks; each chunk becomes
# one column of the exported table.
# NOTE(review): presumably the chunks line up with the 2/3/9/13-group columns
# in the order they appear in z_data -- confirm.
frames = split_list(collectAccuracies, 4)
list1, list2, list3, list4 = frames

# The frame is built with one row per chunk, then transposed so that every
# chunk ends up as a column.
collectedAccuracies = pd.DataFrame(frames).transpose()
collectedAccuracies.columns = [
    '2_groups_act',
    '3_groups_act',
    '9_groups_act',
    '13_groups_act',
]

collectedAccuracies.to_csv('triangulate_Act.csv', sep=';', decimal='.', index=False)