In [1]:
# Pandas is used for data manipulation
import pandas as pd
import numpy as np

# Read in data and display first 5 rows
z_data = pd.read_csv('esem_act_deact_zscore_groups.csv',sep=';',decimal='.')
z_data.drop(list(z_data.filter(regex = '_act')),axis = 1, inplace = True)
z_data.drop(list(z_data.filter(regex = 'aggr')),axis = 1, inplace = True)

#first, drop everything non-numeric ===========================================================
z_data = z_data.drop('scan', axis = 1)
z_data = z_data.drop('snippet', axis = 1)
z_data = z_data.drop('response', axis = 1)

#=== now create the different groups and compute the groupwise mean ============================
grouped = z_data.groupby(['proband', 'trial','task'])
groupedAgg = grouped.aggregate(np.mean)
#=== end aggregation ===========================================================================

labels = groupedAgg.index.get_level_values(level='task')

#=== creating training and validation set ======================================================
# Saving feature names for later use
feature_list = list(groupedAgg.columns)

# Convert to numpy array
features = np.array(groupedAgg)
labels = np.array(labels)

numLabels = 3 # num of different labels (comprehension and rest)
sizeTrainSet = 13 # num of participants used for training
sizeTestSet = 4 # num participants used for testing
 
# get the number of trials per participant
numTrials = len(groupedAgg.index.get_level_values(level='trial').unique())
numParticipants = sizeTrainSet + sizeTestSet


#get the rows excluding the last indexed (i.e., row with the index 587 is the last to include)
training_features = features[0:numTrials*sizeTrainSet]
training_features = np.array(training_features)
testing_features = features[numTrials*sizeTrainSet:numTrials*numParticipants]
testing_features = np.array(testing_features)

training_target = labels[0:numTrials*sizeTrainSet]
training_target = np.array(training_target)
testing_target = labels[numTrials*sizeTrainSet:numTrials*numParticipants]
testing_target = np.array(testing_target)
#=== end creating training and validation set ===================================================


In [2]:
print(numTrials*numParticipants)

print(training_features.shape)
print(testing_features.shape)
print(training_target.shape)
print(testing_target.shape)

labels

1020
(780, 3459)
(240, 3459)
(780,)
(240,)


array([0., 2., 1., ..., 0., 2., 1.])

In [3]:
from tpot import TPOTClassifier
fileNameTPot = 'Deact/tpot_mnist_pipeline_FineAverageParticipantSplit_Deact'

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,n_jobs = 20)
tpot.fit(training_features, training_target)
print(tpot.score(testing_features, testing_target))
tpot.export(fileNameTPot + '.py')

Optimization Progress:  42%|████▏     | 50/120 [10:13<4:40:44, 240.64s/pipeline]

Generation 1 - Current best internal CV score: 0.8025641025641026


Optimization Progress:  62%|██████▎   | 75/120 [15:24<1:50:41, 147.59s/pipeline]

Generation 2 - Current best internal CV score: 0.8076923076923077


Optimization Progress:  83%|████████▎ | 100/120 [20:41<46:15, 138.79s/pipeline]  

Generation 3 - Current best internal CV score: 0.8076923076923077


Optimization Progress: 124pipeline [25:55, 191.54s/pipeline]                     

Generation 4 - Current best internal CV score: 0.8076923076923077


                                                            

Generation 5 - Current best internal CV score: 0.8141025641025641

Best pipeline: LogisticRegression(GaussianNB(input_matrix), C=10.0, dual=False, penalty=l2)
0.8


True

In [4]:
with open(fileNameTPot + '.py') as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
# content = [x.strip() for x in content] 

#      or 'exported_pipeline = ' not in line
cleanedContent = []
for line in content:
    if 'tpot_data' not in line and 'training_target, testing_target' not in line:
        cleanedContent.append(line)

fileForLearning = fileNameTPot + '_cleaned.py'
with open(fileForLearning, 'w') as filehandle:  
    for line in cleanedContent:
        filehandle.write('%s\n' % line)

In [5]:
%run -i $fileForLearning

In [6]:
num_matches = 0;
for a, b in zip(testing_target, results):
    if a == b:
        num_matches = num_matches + 1
print('Number of matches:',num_matches,'(of',testing_target.size,')')

accuracy = num_matches/testing_target.size*100
print('Accuary: ',accuracy)


Number of matches: 192 (of 240 )
Accuary:  80.0


In [7]:
with open('listOfAccuracies.txt','a+') as f:
    f.write('%s\n' % 'Fine_Deact' + str(accuracy) + '\n')
