In [15]:
# Pandas is used for data manipulation
import pandas as pd
import numpy as np

# This cell drops irrelevant rows (i.e., warm up and syntax) and columns (beacons, layout),
# combines the activated and deactivated areas into one set, and performs the z transformation
# and the outlier removal.
# It also removes all columns whose name contains 'Unnamed'; these arise from a trailing ';' in
# each row, which is interpreted as an additional column.

# Read in the activated-voxel table and the table that contains only the deactivated voxels
dataROI_in = pd.read_csv('fse17_roi_fse.csv', sep=';', decimal='.')
datadeact_in = pd.read_csv('fse17_roi_deact_fse.csv', sep=';', decimal='.')


def _drop_irrelevant(frame):
    """Return `frame` without the rows and columns that the analysis does not use.

    Rows removed: task codes 'W' and 'Sy', every scan up to and including 29
    (the first rest condition), and trials 21, 41 and 61 (trailing rest
    conditions that came after a syntax task).
    Columns removed: 'beacon' and 'layout'.
    """
    keep_row = (
        ~frame['task'].isin(['W', 'Sy'])
        & (frame['scan'] > 29)
        & ~frame['trial'].isin([21, 41, 61])
    )
    return frame[keep_row].drop(columns=['beacon', 'layout'])


def _voxel_columns(frame, suffix, extra_drop=()):
    """Return `frame` without its bookkeeping columns, every remaining column renamed with `suffix`.

    `extra_drop` names additional columns to remove before the renaming.
    """
    bookkeeping = ['scan', 'trial', 'task', 'snippet', 'response', 'session']
    bookkeeping.extend(extra_drop)
    return frame.drop(columns=bookkeeping).rename(columns=lambda name: name + suffix)


def _zscore(column):
    """Centre `column` on its mean and scale it by its standard deviation."""
    return (column - column.mean()) / column.std()


# remove useless junk data (identical treatment for both tables)
dataROI_in = _drop_irrelevant(dataROI_in)
datadeact_in = _drop_irrelevant(datadeact_in)

# keep only what should be z-transformed; proband stays in the activated set because it is
# needed for grouping. The suffixes mark activated (_act) and deactivated (_deact) voxels.
dataROI = _voxel_columns(dataROI_in, '_act')
datadeact = _voxel_columns(datadeact_in, '_deact', extra_drop=['proband'])

# combine both data sets side by side (aligned on the row index) and remove unnamed columns
data = pd.concat([dataROI, datadeact], axis=1)
data = data.drop(columns=list(data.filter(regex='Unnamed')))

#=== z transformation, separately for every proband ===========================================
z_data = data.groupby(['proband_act']).transform(_zscore)

#=== replace outliers =========================================================================
# a value counts as an outlier when it lies more than 6 standard deviations away from the mean
# of its (already z-transformed) column; outliers are blanked and then filled by interpolate()
column_deviation = (z_data - z_data.mean()) / z_data.std()
z_data = z_data.mask(column_deviation.abs() > 6).interpolate()

#=== add the bookkeeping columns again (matched on the row index) =============================
for meta_column in ['proband', 'scan', 'trial', 'task', 'snippet', 'response']:
    z_data[meta_column] = dataROI_in[meta_column]

# now recode the task (i.e., labels) to numeric values for tpot
z_data['task'] = z_data['task'].replace({'TD': 0, 'BU': 0, 'R': 1})

# pandas keeps the original row labels after rows were deleted; renumber them from 0,
# because the following cells address rows by these labels
z_data = z_data.reset_index(drop=True)

# `data` and `z_data` are two names for the same DataFrame object from here on
data = z_data


In [16]:
# Sanity checks on the prepared frame: its (rows, columns) shape, the scan number stored
# at row label 18, and the number of entries in the 'scan' column.
for checked_value in (data.shape, data.at[18, 'scan'], len(data['scan'])):
    print(checked_value)

(11340, 2194)
48
11340


In [6]:
# Show the first rows of the prepared frame (at most 17) as this cell's output.
data.head(17)

Unnamed: 0,68-177-121_act,68-180-121_act,68-183-121_act,68-180-124_act,68-183-124_act,71-180-118_act,71-183-118_act,71-174-121_act,71-177-121_act,71-180-121_act,...,152-87-97_deact,152-90-115_deact,152-93-115_deact,152-96-115_deact,proband,scan,trial,task,snippet,response
0,-0.171649,-0.037922,0.266641,-0.416293,-0.680553,-0.158427,-0.479574,0.341564,0.112387,-0.107309,...,-0.53628,1.037689,0.547177,0.701071,ab87,30,2,0,BinaryToDecimal,-1.0
1,1.435034,1.034872,0.412115,0.416794,-0.106404,0.545897,-0.623283,1.345947,1.47763,0.747486,...,-1.737083,2.162167,0.996126,0.487603,ab87,31,2,0,BinaryToDecimal,-1.0
2,2.693671,1.265673,0.234637,1.113608,0.569434,0.307906,-0.869643,2.896715,2.508636,1.261014,...,0.187733,3.612992,3.6337,2.579592,ab87,32,2,0,BinaryToDecimal,-1.0
3,4.029201,2.594912,1.67483,2.916053,2.518725,1.549319,0.772751,4.110009,4.031271,3.012857,...,1.469261,3.666201,3.026918,2.021291,ab87,33,2,0,BinaryToDecimal,-1.0
4,3.366832,2.404715,1.709744,3.324869,2.975541,1.404594,0.316987,2.975458,3.150583,2.50908,...,0.603977,0.31405,-0.035054,0.195315,ab87,34,2,0,BinaryToDecimal,-1.0
5,0.911614,0.530531,0.804895,1.8905,1.803776,-0.727676,-0.886066,1.390944,1.111561,0.562227,...,0.487933,-0.927488,-0.417362,0.431773,ab87,35,2,0,BinaryToDecimal,-1.0
6,-0.201328,-0.223844,-0.00685,0.626119,0.608545,-0.811295,-0.619178,0.626006,0.432476,0.360716,...,1.126175,-0.970055,-0.950488,-0.372839,ab87,36,2,0,BinaryToDecimal,-1.0
7,-0.21212,-0.679034,-0.463639,0.064172,0.220564,-0.991396,-1.386996,0.328708,-0.059153,-0.663088,...,-0.685119,-0.689822,-0.852281,-0.849037,ab87,37,2,0,BinaryToDecimal,-1.0
8,-0.476528,-0.736734,-0.617841,0.058553,0.599158,-0.306368,-0.857325,0.264428,-0.465897,-1.144114,...,-1.43436,0.228916,-0.070128,-0.136382,ab87,38,2,0,BinaryToDecimal,-1.0
9,-0.562865,-0.713227,-1.144457,-0.183085,-0.280057,-0.277423,-0.898384,-0.686924,-0.630362,-0.815847,...,0.659476,0.228916,0.017558,0.267566,ab87,39,2,0,BinaryToDecimal,-1.0


In [7]:
def split_list(alist, wanted_parts=1):
    """Cut `alist` into `wanted_parts` consecutive slices of nearly equal length.

    The boundary of slice number p lies at p * len(alist) // wanted_parts, so the
    slice lengths differ by at most one element and the slices, joined in order,
    give back `alist`. Returns a list holding the slices.
    """
    total = len(alist)
    pieces = []
    for part in range(wanted_parts):
        start = part * total // wanted_parts
        stop = (part + 1) * total // wanted_parts
        pieces.append(alist[start:stop])
    return pieces

In [17]:
# start with adding the aggregation columns to the data set

import numpy as np
import random

# get all comprehension trials (i.e., remove odd ones, they are rest)
trials = [trial for trial in data['trial'].unique() if trial % 2 == 0]

# The grouping cells below shuffle the trials with random.shuffle. Seeding Python's random
# number generator here makes a full "Restart & Run All" produce the same group columns
# every time instead of a different assignment on each run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# number of independently shuffled variants that are generated for every group count
numVariants = 10
# numScansPerTrial * numConditionsPerTrial is the number of consecutive rows that the
# grouping cells label as one unit
numScansPerTrial = 15
numConditionsPerTrial = 2

In [18]:
# Number of comprehension (even-numbered) trials found in the data.
len(trials)

27

In [19]:
# 2 groups
# For every variant: shuffle the comprehension (even-numbered) trials, set the last trial of
# the shuffled order aside, split the remaining trials into two halves, and label each run of
# numScansPerTrial*numConditionsPerTrial consecutive rows according to the trial found in the
# first row of that run.
rowsPerBlock = numScansPerTrial * numConditionsPerTrial
for variant in range(numVariants):
    trials = [trial for trial in data['trial'].unique() if trial % 2 == 0]
    random.shuffle(trials)
    trials = np.delete(trials, len(trials) - 1)

    # lookup table trial -> label; setdefault keeps the earlier group if a trial repeated
    labelOfTrial = {}
    for label, members in zip(['first', 'second'], split_list(trials, 2)):
        for trial in members:
            labelOfTrial.setdefault(trial, label)

    blockLabels = []
    for start in range(0, len(data['trial']), rowsPerBlock):
        # the trial that was set aside is in neither half, but it still needs a group:
        # it (like any other unmatched trial) is put into 'first'
        label = labelOfTrial.get(data.at[start, 'trial'], 'first')
        blockLabels.extend([label] * rowsPerBlock)

    colName = 'aggr_2_groups' + str(variant)
    data[colName] = pd.Series(blockLabels)

In [20]:
# 3 groups
# For every variant: shuffle the comprehension (even-numbered) trials, split them into three
# groups, and label each run of numScansPerTrial*numConditionsPerTrial consecutive rows
# according to the trial found in the first row of that run.
rowsPerBlock = numScansPerTrial * numConditionsPerTrial
groupLabels = ['first', 'second', 'third']
for i in range(numVariants):
    trials = [trial for trial in data['trial'].unique() if trial % 2 == 0]
    random.shuffle(trials)

    # lookup table trial -> label; setdefault keeps the earlier group if a trial repeated
    labelOfTrial = {}
    for label, members in zip(groupLabels, split_list(trials, len(groupLabels))):
        for trial in members:
            labelOfTrial.setdefault(trial, label)

    blockLabels = []
    for start in range(0, len(data['trial']), rowsPerBlock):
        blockTrial = data.at[start, 'trial']
        if blockTrial not in labelOfTrial:
            # Without this check nothing would be appended for this block, every following
            # label would move up by one block and the last rows of the column would be
            # left empty - a silent corruption of the whole column.
            raise ValueError(
                'row {}: trial {!r} belongs to none of the {} groups; the data is not '
                'made of {}-row blocks that each start with a comprehension trial'
                .format(start, blockTrial, len(groupLabels), rowsPerBlock)
            )
        blockLabels.extend([labelOfTrial[blockTrial]] * rowsPerBlock)

    colName = 'aggr_3_groups' + str(i)
    data[colName] = pd.Series(blockLabels)

In [21]:
# 9 groups
# For every variant: shuffle the comprehension (even-numbered) trials, split them into nine
# groups, and label each run of numScansPerTrial*numConditionsPerTrial consecutive rows
# according to the trial found in the first row of that run.
rowsPerBlock = numScansPerTrial * numConditionsPerTrial
# The labels end up as values in the data set; their spelling is kept exactly as it was.
groupLabels = ['first', 'second', 'third', 'fourth', 'fifth',
               'sixth', 'seventh', 'eightth', 'nineth']
for i in range(numVariants):
    trials = [trial for trial in data['trial'].unique() if trial % 2 == 0]
    random.shuffle(trials)

    # lookup table trial -> label; setdefault keeps the earlier group if a trial repeated
    labelOfTrial = {}
    for label, members in zip(groupLabels, split_list(trials, len(groupLabels))):
        for trial in members:
            labelOfTrial.setdefault(trial, label)

    blockLabels = []
    for start in range(0, len(data['trial']), rowsPerBlock):
        blockTrial = data.at[start, 'trial']
        if blockTrial not in labelOfTrial:
            # Without this check nothing would be appended for this block, every following
            # label would move up by one block and the last rows of the column would be
            # left empty - a silent corruption of the whole column.
            raise ValueError(
                'row {}: trial {!r} belongs to none of the {} groups; the data is not '
                'made of {}-row blocks that each start with a comprehension trial'
                .format(start, blockTrial, len(groupLabels), rowsPerBlock)
            )
        blockLabels.extend([labelOfTrial[blockTrial]] * rowsPerBlock)

    colName = 'aggr_9_groups' + str(i)
    data[colName] = pd.Series(blockLabels)

In [22]:
# 13 groups
# For every variant: shuffle the comprehension (even-numbered) trials, set the last trial of
# the shuffled order aside, split the remaining trials into thirteen groups, and label each
# run of numScansPerTrial*numConditionsPerTrial consecutive rows according to the trial found
# in the first row of that run.
rowsPerBlock = numScansPerTrial * numConditionsPerTrial
# The labels end up as values in the data set; their spelling is kept exactly as it was.
groupLabels = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh',
               'eightth', 'nineth', 'tenth', 'eleventh', 'twelveth', 'thirteenth']
for variant in range(numVariants):
    trials = [trial for trial in data['trial'].unique() if trial % 2 == 0]
    random.shuffle(trials)
    trials = np.delete(trials, len(trials) - 1)

    # lookup table trial -> label; setdefault keeps the earlier group if a trial repeated
    labelOfTrial = {}
    for label, members in zip(groupLabels, split_list(trials, 13)):
        for trial in members:
            labelOfTrial.setdefault(trial, label)

    blockLabels = []
    for start in range(0, len(data['trial']), rowsPerBlock):
        # the trial that was set aside is in none of the thirteen groups, but it still needs
        # a group: it (like any other unmatched trial) is put into 'fourth'
        label = labelOfTrial.get(data.at[start, 'trial'], 'fourth')
        blockLabels.extend([label] * rowsPerBlock)

    colName = 'aggr_13_groups' + str(variant)
    data[colName] = pd.Series(blockLabels)

In [23]:
# Write the final frame as a ';'-separated CSV file without the index column.
# `data` was bound to the very same object as `z_data`, so the columns that were added to
# `data` afterwards are part of this export.
z_data.to_csv(
    'fse17_act_deact_zscore_groups.csv',
    sep=';',
    decimal='.',
    index=False,
)