# First approach: generate test set in such a way that the percentage of test data is constant for every experiment (mri only, eeg only, mixed, mri+dti)

In [116]:
from utils import visualize, create_dataset_age, create_dataset_eeg, cv
from utils import create_dataset_mri, create_dataset_eeg
import pandas as pd

In [117]:
def find_test_split(TEST_PERC=0.15, repetitions = 200):
    # Load the datasets
    data_dti = create_dataset_mri(DTI = True)
    data_mri = create_dataset_mri()
    data_eeg = create_dataset_eeg()
    data_eeg = data_eeg.rename(columns={'id': 'ID'})
    # Mixed dataset eeg + mri
    data_mixed = pd.merge(data_mri, data_eeg, on='ID', how='inner')
    behavioral = pd.read_csv('data/Behavioral/cleaned/HBNFinalSummaries.csv')
    # Extract the IDs
    id_mri = data_mri[['ID']]
    id_dti = data_dti[['ID']]
    id_eeg = data_eeg[['ID']]
    id_mixed = data_mixed[['ID']]
    id_behavioral = behavioral[['EID']]
    # Compute the target number of test samples
    target_mri = id_mri.shape[0]*TEST_PERC
    target_dti = id_dti.shape[0]*TEST_PERC
    target_eeg = id_eeg.shape[0]*TEST_PERC
    target_mixed = id_mixed.shape[0]*TEST_PERC
    # Delete indices that are not in MRI, DTI or EEG from behavioral
    union = set(id_mri.iloc[:,0]).union(set(id_dti.iloc[:,0])).union(set(id_eeg.iloc[:,0]))
    id_behavioral = pd.DataFrame(set(id_behavioral.iloc[:,0]).intersection(union))
    # Store lowest possible error and corresponding random_state
    error = float('inf')
    seed = 0
    for i in range(repetitions):
        # Extract potential test indices
        test_indices = id_behavioral.sample(frac=TEST_PERC, random_state = i)
        # Compute intersection with the three datasets
        common_mri = len(set(test_indices.iloc[:,0]).intersection(set(id_mri.iloc[:,0])))
        common_dti = len(set(test_indices.iloc[:,0]).intersection(set(id_dti.iloc[:,0])))
        common_eeg = len(set(test_indices.iloc[:,0]).intersection(set(id_eeg.iloc[:,0])))
        common_mixed = len(set(test_indices.iloc[:,0]).intersection(set(id_mixed.iloc[:,0])))
        cur_error = (abs(target_mri-common_mri)/id_mri.shape[0] + abs(target_dti-common_dti)/id_dti.shape[0]
                     + abs(target_eeg-common_eeg)/id_eeg.shape[0] + abs(target_mixed-common_mixed)/id_mixed.shape[0])
        if cur_error < error:
            error = cur_error
            seed = i
    selected_test_indices = id_behavioral.sample(frac=TEST_PERC, random_state = seed)
    return selected_test_indices
        
        

In [118]:
indices = find_test_split()

## Verify if the intersections are good

In [119]:
TEST_PERC = 0.15
data_dti = create_dataset_mri(DTI = True)
data_mri = create_dataset_mri()
data_eeg = create_dataset_eeg()
data_eeg = data_eeg.rename(columns={'id': 'ID'})
# Mixed dataset eeg + mri
data_mixed = pd.merge(data_mri, data_eeg, on='ID', how='inner')
behavioral = pd.read_csv('data/Behavioral/cleaned/HBNFinalSummaries.csv')
# Extract the IDs
id_mri = data_mri[['ID']]
id_dti = data_dti[['ID']]
id_eeg = data_eeg[['ID']]
id_mixed = data_mixed[['ID']]
id_behavioral = behavioral[['EID']]
# Compute the target number of test samples
target_mri = id_mri.shape[0]*TEST_PERC
target_dti = id_dti.shape[0]*TEST_PERC
target_eeg = id_eeg.shape[0]*TEST_PERC
target_mixed = id_mixed.shape[0]*TEST_PERC

In [120]:
# Write to disk
indices = set(indices.iloc[:,1])
indices = pd.DataFrame(indices)
indices.columns = ['ID']
indices.to_csv('test_indices.csv')

IndexError: single positional indexer is out-of-bounds

As can be seen from the following cells, the result is very precise!

In [112]:
print(target_mri)
print(len(set(id_mri.iloc[:,0]).intersection(set(indices.iloc[:,0]))))

171.9
172


In [113]:
print(target_dti)
print(len(set(id_dti.iloc[:,0]).intersection(set(indices.iloc[:,0]))))

125.69999999999999
125


In [114]:
print(target_eeg)
print(len(set(id_eeg.iloc[:,0]).intersection(set(indices.iloc[:,0]))))

195.9
195


In [115]:
print(target_mixed)
print(len(set(id_mixed.iloc[:,0]).intersection(set(indices.iloc[:,0]))))

120.75
120


# Second approach: take a fixed test set in the intersection of MRI and EEG

In [63]:
id_mixed = pd.DataFrame(id_mixed)
indices_fixed = id_mixed.sample(frac=TEST_PERC, random_state = 70)

In [64]:
print(target_dti)
print(len(set(id_dti.iloc[:,0]).intersection(set(indices_fixed.iloc[:,0]))))

125.69999999999999
89


In [65]:
# The intersection with the DTI dataset is still reasonable with this random seed.

In [66]:
# Write to disk
indices_fixed.to_csv('test_indices_fixed.csv', index=False)