# First approach: generate test set in such a way that the percentage of test data is constant for every experiment (mri only, eeg only, mixed, mri+dti)

In [1]:
from utils import visualize, create_dataset_age, create_dataset_eeg, cv
from utils import create_dataset_mri, create_dataset_eeg
import pandas as pd

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def find_test_split(TEST_PERC=0.15, repetitions = 200):
    """Search for a test split holding about TEST_PERC of every dataset.

    Candidate splits are sampled from the behavioral subject IDs (restricted
    to subjects that appear in MRI, DTI or EEG) using ``random_state`` values
    ``0 .. repetitions - 1``.  Each candidate is scored by summing, over the
    MRI, DTI, EEG and mixed (MRI merged with EEG) datasets, the absolute gap
    between the target count (rows * TEST_PERC) and the number of that
    dataset's IDs found in the candidate, divided by the dataset's row count.
    The seed with the lowest score wins; on ties the smallest seed is kept,
    and seed 0 is used when ``repetitions`` is 0.

    The candidate IDs are kept in the row order of the behavioral CSV rather
    than in set-iteration order, so a given seed selects the same subjects on
    every run (set order of strings can differ between interpreter runs).

    Args:
        TEST_PERC: Fraction of subjects to place in the test split.
        repetitions: Number of seeds to try.

    Returns:
        A single-column DataFrame (column label ``0``) holding the IDs of the
        selected test subjects.
    """
    # Load the datasets
    data_dti = create_dataset_mri(DTI=True)
    data_mri = create_dataset_mri()
    data_eeg = create_dataset_eeg()
    data_eeg = data_eeg.rename(columns={'id': 'ID'})
    # Mixed dataset eeg + mri
    data_mixed = pd.merge(data_mri, data_eeg, on='ID', how='inner')
    behavioral = pd.read_csv('data/Behavioral/cleaned/HBNFinalSummaries.csv')

    # Row counts, ID sets and target counts never change inside the search
    # loop, so compute them once up front.
    frames = (data_mri, data_dti, data_eeg, data_mixed)
    sizes = [frame.shape[0] for frame in frames]
    id_sets = [set(frame['ID']) for frame in frames]
    targets = [size * TEST_PERC for size in sizes]

    # Delete indices that are not in MRI, DTI or EEG from behavioral.
    # drop_duplicates + isin keeps the CSV row order, which makes the
    # seeded sampling below reproducible.
    union = id_sets[0] | id_sets[1] | id_sets[2]
    eids = behavioral['EID'].drop_duplicates()
    id_behavioral = pd.DataFrame({0: eids[eids.isin(union)].tolist()})

    # Store lowest possible error and corresponding random_state
    best_error = float('inf')
    best_seed = 0
    for seed in range(repetitions):
        # Extract potential test indices
        candidate = id_behavioral.sample(frac=TEST_PERC, random_state=seed)
        candidate_ids = set(candidate.iloc[:, 0])
        # Sum of relative deviations from the target across the four datasets
        cur_error = 0.0
        for ids, size, target in zip(id_sets, sizes, targets):
            cur_error += abs(target - len(candidate_ids & ids)) / size
        if cur_error < best_error:
            best_error = cur_error
            best_seed = seed
    # Re-draw the winning sample from its seed.
    return id_behavioral.sample(frac=TEST_PERC, random_state=best_seed)
        
        

In [3]:
# Run the seed search with the function's default arguments and keep the
# returned frame of test IDs.
indices = find_test_split()

  exec(code_obj, self.user_global_ns, self.user_ns)
  if (await self.run_code(code, result,  async_=asy)):


## Verify if the intersections are good

In [2]:
# Build, at notebook scope, the per-dataset ID frames and the target
# test-set sizes that the verification cells below print.
TEST_PERC = 0.15

data_dti = create_dataset_mri(DTI=True)
data_mri = create_dataset_mri()
data_eeg = create_dataset_eeg().rename(columns={'id': 'ID'})
# Mixed dataset: inner join of MRI and EEG on the subject ID
data_mixed = data_mri.merge(data_eeg, on='ID', how='inner')
behavioral = pd.read_csv('data/Behavioral/cleaned/HBNFinalSummaries.csv')

# One-column frames containing only the identifiers
id_mri, id_dti, id_eeg, id_mixed = (
    frame[['ID']] for frame in (data_mri, data_dti, data_eeg, data_mixed)
)
id_behavioral = behavioral[['EID']]

# Target (fractional) number of test samples per dataset
target_mri, target_dti, target_eeg, target_mixed = (
    len(ids) * TEST_PERC for ids in (id_mri, id_dti, id_eeg, id_mixed)
)

  if (await self.run_code(code, result,  async_=asy)):
  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Write to disk
# De-duplicate the selected IDs and sort them so the CSV rows come out in the
# same order on every run (iterating a set of strings is not order-stable
# across interpreter runs). Assumes the IDs are mutually comparable, e.g. all
# strings -- TODO confirm.
indices = pd.DataFrame({'ID': sorted(set(indices.iloc[:, 0]))})
indices.to_csv('test_indices.csv', index=False)

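As the following cells show, the selected test set closely matches the target size for every dataset (each cell prints the target count first, then the actual number of that dataset's IDs in the test set).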

In [6]:
# Target MRI test size, then the number of MRI IDs present in the split.
mri_in_test = set(id_mri.iloc[:, 0]) & set(indices.iloc[:, 0])
print(target_mri)
print(len(mri_in_test))

171.9
172


In [7]:
# Target DTI test size, then the number of DTI IDs present in the split.
dti_in_test = set(id_dti.iloc[:, 0]) & set(indices.iloc[:, 0])
print(target_dti)
print(len(dti_in_test))

125.69999999999999
123


In [8]:
# Target EEG test size, then the number of EEG IDs present in the split.
eeg_in_test = set(id_eeg.iloc[:, 0]) & set(indices.iloc[:, 0])
print(target_eeg)
print(len(eeg_in_test))

195.9
196


In [9]:
# Target mixed (MRI+EEG) test size, then the number of mixed IDs present in
# the split.
mixed_in_test = set(id_mixed.iloc[:, 0]) & set(indices.iloc[:, 0])
print(target_mixed)
print(len(mixed_in_test))

120.75
121


# Second approach: take a fixed test set in the intersection of MRI, EEG and DTI

In [47]:
# Load the three datasets; the EEG frame's 'id' column is renamed to 'ID' so
# it lines up with the MRI frame.
data_dti = create_dataset_mri(DTI=True)
data_mri = create_dataset_mri()
data_eeg = create_dataset_eeg().rename(columns={'id': 'ID'})

# Mixed dataset eeg + mri: inner join on the ID, age and diagnosis columns
diagnosis_columns = ['DX_01_Cat', 'DX_01', 'DX_01_Sub']
data_mixed = data_mri.merge(
    data_eeg, on=['ID', 'Age'] + diagnosis_columns, how='inner'
)

# Extract the IDs (taken before the diagnosis columns are dropped below)
id_mri, id_dti, id_eeg, id_mixed = (
    frame[['ID']] for frame in (data_mri, data_dti, data_eeg, data_mixed)
)


# Remove the diagnosis columns in place from the EEG, MRI and mixed frames
for frame in (data_eeg, data_mri, data_mixed):
    frame.drop(columns=diagnosis_columns, inplace=True)

In [48]:
# Drop, in place, every row that contains a missing value.
for frame in (data_mixed, data_eeg):
    frame.dropna(axis=0, inplace=True)

In [49]:
# Rows present in both the mixed (MRI+EEG) frame and the DTI frame, matched
# on ID and Age.
merge_keys = ['ID', 'Age']
total = data_mixed.merge(data_dti, on=merge_keys, how='inner')

In [50]:
# Display the (rows, columns) shape of the merged frame.
total.shape

(355, 824)

In [51]:
# Keep only the ID column, as a one-column DataFrame.
id_total = total[['ID']]

In [52]:
# Draw the fixed test set: 48% of the IDs common to MRI, EEG and DTI, with a
# fixed seed so the draw is repeatable.
indices_test = id_total.sample(frac=0.48, random_state=70)
# The following cells read this frame as `indices_fixed`; bind that name too
# so they do not raise NameError on a clean run.
indices_fixed = indices_test

In [53]:
# Display the (rows, columns) shape of the fixed test-ID frame.
indices_fixed.shape

(170, 1)

In [54]:
# Write to disk: save the fixed test IDs as CSV without the index column.
indices_fixed.to_csv('test_IDS.csv', index=False)

In [56]:
# Total number of missing values across all columns of data_mixed.
data_mixed.isna().sum().sum()

0