In [1]:
import pandas as pd
import numpy as np

## Load full data (bivariate)

In [5]:
path = 'bivariate_study'
data = '/bivariate_data'

times = [time[0] for time in pd.read_csv(path+'/output/LongitudinalMetricModel_absolute_times.txt', header=None).values]
memory = [score[0] for score in pd.read_csv(path+data+'/Y.csv', header=None).values]
hippocampus = [score[1] for score in pd.read_csv(path+data+'/Y.csv', header=None).values]
ids = [int(idx[0]) for idx in pd.read_csv(path+'/output/LongitudinalMetricModel_subject_ids.txt', header=None).values]

In [67]:
data = pd.DataFrame(index=[ids, times], columns=['memory', 'hippocampus'])
data['memory'] = memory
data['hippocampus'] = hippocampus
data.index.set_names(['ID','TIME'], inplace=True)
data['ID'] = data.index.get_level_values(0)
data['TIME'] = data.index.get_level_values(1)
data = data.set_index('ID')
data

Unnamed: 0_level_0,memory,hippocampus,TIME
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,0.407333,0.377389,74.950188
4,0.518444,0.375055,75.416328
4,0.466667,0.364087,75.882462
4,0.540667,0.382969,76.348595
4,0.511111,0.429288,77.747002
...,...,...,...
1425,0.540667,0.242699,82.017731
1425,0.607333,0.267690,82.495964
1430,0.577778,0.745385,82.255379
1430,0.659333,0.774981,82.717415


## Keep only the cognitive baseline

In [70]:
data_baseline = data.copy()

for sub in data.index.unique():
    sub_data = data.loc[sub]
    for visit in range(len(sub_data)):
        if visit:
            data_baseline['memory'].loc[sub].iloc[visit] = np.nan
    data_baseline.loc[sub] = sub_data.copy()

In [71]:
data_baseline

Unnamed: 0_level_0,memory,hippocampus,TIME
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,0.407333,0.377389,74.950188
4,0.518444,0.375055,75.416328
4,0.466667,0.364087,75.882462
4,0.540667,0.382969,76.348595
4,0.511111,0.429288,77.747002
...,...,...,...
1425,0.540667,0.242699,82.017731
1425,0.607333,0.267690,82.495964
1430,0.577778,0.745385,82.255379
1430,0.659333,0.774981,82.717415


In [91]:
# try to evaluate individual parameters from only the imaging data with only the baseline cognitive scores

data_baseline = data.copy()

for sub in data.index.unique(): 
    data_sub = data.loc[sub].copy()
    if type(data_sub) == pd.core.series.Series:
        continue
    for ses in range(len(data_sub)):
        if ses > 0:
            data_sub['memory'].iloc[ses] = np.nan
    data_baseline.loc[sub] = data_sub


In [92]:
data_baseline

Unnamed: 0_level_0,memory,hippocampus,TIME
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,0.407333,0.377389,74.950188
4,,0.375055,75.416328
4,,0.364087,75.882462
4,,0.382969,76.348595
4,,0.429288,77.747002
...,...,...,...
1425,,0.242699,82.017731
1425,,0.267690,82.495964
1430,0.577778,0.745385,82.255379
1430,,0.774981,82.717415


In [93]:
Y = data_baseline[['hippocampus', 'memory']].values

In [86]:
np.savetxt('Y_baseline.csv', Y, delimiter=',')

In [95]:
# try to evaluate individual parameters from only the imaging data with only the baseline cognitive scores

data_baseline = data.copy()

for sub in data.index.unique(): 
    data_sub = data.loc[sub].copy()
    if type(data_sub) == pd.core.series.Series:
        continue
    for ses in range(len(data_sub)):
        if ses > 0:
            data_sub['memory'].iloc[ses] = data_sub['memory'].iloc[0]
    data_baseline.loc[sub] = data_sub


In [96]:
Y = data_baseline[['hippocampus', 'memory']].values

In [97]:
np.savetxt('Y_baseline_duplicated.csv', Y, delimiter=',')

## Load the simulated_data

In [32]:
path = 'simulated_study/'
data = 'simulated_data_5/'
output_folder = 'output_5/'

times = [time[0] for time in pd.read_csv(path+output_folder+'LongitudinalMetricModel_absolute_times.txt', header=None).values]
logistic = [score[0] for score in pd.read_csv(path+data+'/Y.csv', header=None).values]
sum_logistic = [score[1] for score in pd.read_csv(path+data+'/Y.csv', header=None).values]
ids = [int(idx[0]) for idx in pd.read_csv(path+output_folder+'/LongitudinalMetricModel_subject_ids.txt', header=None).values]

In [33]:
data = pd.DataFrame(index=[ids, times], columns=['logistic', 'sum_logistic'])
data['logistic'] = logistic
data['sum_logistic'] = sum_logistic
data.index.set_names(['ID','TIME'], inplace=True)
data['ID'] = data.index.get_level_values(0)
data['TIME'] = data.index.get_level_values(1)
data = data.set_index('ID')
data

Unnamed: 0_level_0,logistic,sum_logistic,TIME
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.825440,0.758157,96.094437
0,0.694294,0.864409,96.597565
0,0.733746,0.803578,97.100685
0,0.572616,0.974391,97.603806
0,0.807546,0.816188,98.106934
...,...,...,...
99,0.740701,0.581178,75.555489
99,0.596949,0.658479,75.777214
99,0.633617,0.734469,75.998947
99,0.524243,0.633501,76.220680


## Delete random visits

In [34]:
data_pruned = data.copy()
delete_n_sessions = 2

for sub in data.index.unique():
    sub_data = data.loc[sub]
    for i in range(delete_n_sessions):
        deleted_visit = np.random.randint(len(sub_data))
        # For some reason, condensing the two following lines doesn't work (it then copies the df)
        sub_data['logistic'].iloc[deleted_visit] = np.nan
        sub_data['sum_logistic'].iloc[deleted_visit] = np.nan
    data_pruned.loc[sub] = sub_data.copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data['logistic'].iloc[deleted_visit] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data['sum_logistic'].iloc[deleted_visit] = np.nan


In [35]:
data_pruned

Unnamed: 0_level_0,logistic,sum_logistic,TIME
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.825440,0.758157,96.094437
0,0.694294,0.864409,96.597565
0,0.733746,0.803578,97.100685
0,0.572616,0.974391,97.603806
0,,,98.106934
...,...,...,...
99,,,75.555489
99,0.596949,0.658479,75.777214
99,0.633617,0.734469,75.998947
99,,,76.220680


In [36]:
Y = data_pruned[['logistic', 'sum_logistic']].values

In [37]:
np.savetxt('Y_pruned.csv', Y, delimiter=',')