In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tabulate import tabulate

In [27]:
# Quick helper: mean and standard deviation of a handful of accuracy values.
# ndarray.std() uses NumPy's default ddof=0 (population standard deviation).
accuracies = np.array([0.6107, 0.6700, 0.6784, 0.6855])
print(accuracies.mean(), accuracies.std(), sep='\n')

0.6611499999999999
0.029639542843977868


# Data splits of the holdout (test) set

In [2]:
# Read the ADNI holdout table from its comma-separated file.
holdout_data = pd.read_csv(
    '/analysis/ritter/projects/AD/Budding_Spectral_Analysis/data/ADNI_holdout.csv',
    sep=',',
)

In [3]:
# Split the holdout set by sex and report the size of each subgroup.
# Both masks are explicit: selecting men as `~women` would also pick up rows
# whose 'Sex' is missing or holds any value other than 'F'. The other cells of
# this notebook select men with == 'M', so this cell now agrees with them.
women = holdout_data['Sex'] == 'F'
men = holdout_data['Sex'] == 'M'
holdout_data_f = holdout_data[women]
holdout_data_m = holdout_data[men]

# Subgroup sizes: men first, then women
print(len(holdout_data_m))
print(len(holdout_data_f))

# Rows with research group 'CN': women first, then men
print(len(holdout_data_f[holdout_data_f['Research Group'] == 'CN']))
print(len(holdout_data_m[holdout_data_m['Research Group'] == 'CN']))

96
88
56
56


In [4]:
# Number of rows in the male holdout subset whose research group is 'AD'
male_ad_rows = holdout_data_m[holdout_data_m['Research Group'] == 'AD']
print(len(male_ad_rows))

40


In [5]:
# Report the age range of the holdout set, then cut it into three
# half-open age bands: [60, 73), [73, 78) and [78, 90).
holdout_age = holdout_data['Age']
print('Minimum age is', holdout_age.min())
print('Maximum age is', holdout_age.max())

# Boolean row masks, one per band (lower bound inclusive, upper bound exclusive)
age_60_73 = holdout_age.ge(60) & holdout_age.lt(73)
age_73_78 = holdout_age.ge(73) & holdout_age.lt(78)
age_78_90 = holdout_age.ge(78) & holdout_age.lt(90)

# Rows of the holdout set falling into each band
data_60_73 = holdout_data[age_60_73]
data_73_78 = holdout_data[age_73_78]
data_78_90 = holdout_data[age_78_90]

Minimum age is 60.0
Maximum age is 89.8


In [6]:
# Mean and standard deviation of each score column for the rows in data_78_90
for score_column in ('MMSE Total Score', 'FAQ Total Score', 'TOTALMOD'):
    band_scores = data_78_90[score_column]
    print(band_scores.mean(), band_scores.std())

25.345454545454544 3.9497687781278294
9.232558139534884 9.901508324979307
18.18781818181818 12.125511473123453


In [7]:
# Write one CSV file per holdout subgroup (two sex groups, three age bands).
# Note: to_csv returns None when it is given a path, so the export_csv_* names
# end up holding None; they are kept only so the cell defines the same names.
holdout_splits_dir = r'/analysis/ritter/projects/AD/Budding_Spectral_Analysis/data/holdout_splits/'

# Sex subgroups
export_csv_w = holdout_data_f.to_csv(holdout_splits_dir + 'ADNI_holdout_f.csv', index=None, header=True)
export_csv_m = holdout_data_m.to_csv(holdout_splits_dir + 'ADNI_holdout_m.csv', index=None, header=True)

# Age-band subgroups
export_csv_60_73 = data_60_73.to_csv(holdout_splits_dir + 'ADNI_holdout_60_73.csv', index=None, header=True)
export_csv_73_78 = data_73_78.to_csv(holdout_splits_dir + 'ADNI_holdout_73_78.csv', index=None, header=True)
export_csv_78_90 = data_78_90.to_csv(holdout_splits_dir + 'ADNI_holdout_78_90.csv', index=None, header=True)

# AD vs HC

In [8]:
# Read the complete ADNI table from its comma-separated file
all_data = pd.read_csv(
    '/analysis/ritter/projects/AD/Budding_Spectral_Analysis/data/ADNI_all.csv',
    sep=',',
)

In [9]:
# Build row masks for the AD and HC groups; HC rows carry the label 'CN'
# in the 'Research Group' column.
research_group = all_data['Research Group']
is_AD = research_group.eq('AD')
is_HC = research_group.eq('CN')

# Rows of the full table belonging to each group
data_AD = all_data[is_AD]
data_HC = all_data[is_HC]

In [10]:
# Mean and standard deviation of the 'Age' column, computed over the rows of each group
ad_age = data_AD['Age']
hc_age = data_HC['Age']
print('Mean age for AD is', ad_age.mean(), 'and std is', ad_age.std())
print('Mean age for HC is', hc_age.mean(), 'and std is', hc_age.std())

Mean age for AD is 75.90667903525059 and std is 7.469971603270488
Mean age for HC is 76.74521126760551 and std is 5.10488561561696


In [33]:
# Number of distinct SubjectID values per sex in each group
ad_male_ids = data_AD.loc[data_AD['Sex'] == 'M', 'SubjectID'].unique()
ad_female_ids = data_AD.loc[data_AD['Sex'] == 'F', 'SubjectID'].unique()
hc_male_ids = data_HC.loc[data_HC['Sex'] == 'M', 'SubjectID'].unique()
hc_female_ids = data_HC.loc[data_HC['Sex'] == 'F', 'SubjectID'].unique()

print('Number of male for AD is', len(ad_male_ids), 'and female', len(ad_female_ids))
print('Number of male for HC is', len(hc_male_ids), 'and female', len(hc_female_ids))

Number of male for AD is 96 and female 89
Number of male for HC is 108 and female 102


In [12]:
# Summarise each score column for both groups: one inner list per score,
# ordered as AD mean, AD std, HC mean, HC std.
data_tests = [
    [data_AD[score].mean(), data_AD[score].std(), data_HC[score].mean(), data_HC[score].std()]
    for score in ('MMSE Total Score', 'FAQ Total Score', 'TOTALMOD')
]

In [13]:
# Wrap the AD/HC score summary in a labelled table and display it.
# The 'TOTALMOD' row is shown under the label 'ADAS13'.
tests = pd.DataFrame(
    data_tests,
    index=['MMSE', 'FAQ', 'ADAS13'],
    columns=['AD mean', 'AD std', 'HC mean', 'HC std'],
)
tests

Unnamed: 0,AD mean,AD std,HC mean,HC std
MMSE,21.598881,4.222051,29.139831,1.051137
FAQ,17.850704,7.121839,0.16,0.734492
ADAS13,30.601855,12.43274,8.882746,4.342299


# MCI Data

In [14]:
# Read the two MCI tables and tag every row with a 'Conversion' label.
data_mci_conv = pd.read_csv(
    '/analysis/ritter/projects/AD/Budding_Spectral_Analysis/data/data_last_version/ADNI_MCI_only_conv.csv',
    sep=',',
)
data_mci = pd.read_csv(
    '/analysis/ritter/projects/AD/Budding_Spectral_Analysis/data/data_last_version/ADNI_mci_no_conv.csv',
    sep=',',
)

# Rows from the "no_conv" file: DXCONV and Conversion are both set to 0
data_mci = data_mci.assign(DXCONV=0, Conversion=0)

# Rows from the "only_conv" file: discard rows whose DXCONV equals 2.0,
# remove the 'Unnamed: 0' column, and set Conversion to 1
data_mci_conv = (
    data_mci_conv[data_mci_conv['DXCONV'] != 2.0]
    .drop(columns='Unnamed: 0')
    .assign(Conversion=1)
)

# Row counts: data_mci first, then data_mci_conv
print(len(data_mci), len(data_mci_conv), sep='\n')

709
542


In [15]:
# Distinct SubjectID values in data_mci and in data_mci_conv,
# followed by the combined number of rows of the two tables
n_subjects_mci = len(data_mci['SubjectID'].unique())
n_subjects_mci_conv = len(data_mci_conv['SubjectID'].unique())
n_rows_combined = len(data_mci) + len(data_mci_conv)
print(n_subjects_mci, n_subjects_mci_conv, n_rows_combined, sep='\n')

211
125
1251


In [16]:
# (rows, columns) of data_mci_conv, then of data_mci
for mci_frame in (data_mci_conv, data_mci):
    print(mci_frame.shape)

(542, 22)
(709, 22)
['SubjectID' 'Phase' 'Sex' 'Weight' 'Research Group' 'Visit'
 'Archive Date' 'Study Date' 'Age' 'MMSE Total Score'
 'GDSCALE Total Score' 'Global CDR' 'FAQ Total Score' 'NPI-Q Total Score'
 'Preprocessing' 'Image ID' 'session' 'path' 'TOTAL11' 'TOTALMOD' 'DXCONV'
 'Conversion']
['SubjectID' 'Phase' 'Sex' 'Weight' 'Research Group' 'Visit'
 'Archive Date' 'Study Date' 'Age' 'MMSE Total Score'
 'GDSCALE Total Score' 'Global CDR' 'FAQ Total Score' 'NPI-Q Total Score'
 'Preprocessing' 'Image ID' 'session' 'path' 'TOTAL11' 'TOTALMOD' 'DXCONV'
 'Conversion']


In [17]:
# Determine converters: keep every row of any subject that has at least one
# row with DXCONV == 1.
converter_ids = data_mci_conv[data_mci_conv['DXCONV'] == 1]
list_ids = list(converter_ids['SubjectID'])

# Vectorised membership test on the SubjectID column. This replaces a row-wise
# DataFrame.apply that searched the Python list once per row.
converters = data_mci_conv[data_mci_conv['SubjectID'].isin(list_ids)]

# Number of selected rows and number of distinct subjects among them
print('Total scans converters in dataset', len(converters), 'parts is', len(converters['SubjectID'].unique()))

Total scans converters in dataset 511 parts is 116


In [18]:
# Create total MCI set: the rows of data_mci followed by the rows of converters.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the resulting frame is the same
# (original index labels kept, no column sorting).
total_mci = pd.concat([data_mci, converters])

# 'Conversion' is 1 on converter rows and 0 on data_mci rows, so the sum is
# the number of converter rows in the combined set
print(total_mci['Conversion'].sum())

print('Total number MCI scans:', len(total_mci))
print('Total patients MCI:', len(total_mci['SubjectID'].unique()))

511
Total number MCI scans: 1220
Total patients MCI: 327


In [19]:
# Write the combined MCI table to a CSV file (header row, no index column).
# to_csv returns None when a path is given, so export_csv_mci_no_conv holds None.
mci_total_csv_path = r'/analysis/ritter/projects/AD/Budding_Spectral_Analysis/data/' + 'ADNI_mci_total.csv'
export_csv_mci_no_conv = total_mci.to_csv(mci_total_csv_path, index=None, header=True)

In [34]:
# Age (mean and std over rows) and number of distinct subjects per sex
# for the combined MCI set
mci_age = total_mci['Age']
print('Mean age for MCI is', mci_age.mean(), 'and std is', mci_age.std())

mci_is_male = total_mci['Sex'] == 'M'
mci_is_female = total_mci['Sex'] == 'F'
mci_male_ids = total_mci[mci_is_male]['SubjectID'].unique()
mci_female_ids = total_mci[mci_is_female]['SubjectID'].unique()
print('Number of male for MCI is', len(mci_male_ids), 'and female', len(mci_female_ids))

Mean age for MCI is 75.74327868852447 and std is 7.207435938090487
Number of male for MCI is 211 and female 116


In [21]:
# Mean and standard deviation of each score column over the combined MCI set:
# one [mean, std] pair per score
data_tests = [
    [total_mci[score].mean(), total_mci[score].std()]
    for score in ('MMSE Total Score', 'FAQ Total Score', 'TOTALMOD')
]

In [22]:
# Wrap the MCI score summary in a labelled table and display it.
# The 'TOTALMOD' row is shown under the label 'ADAS13'.
tests = pd.DataFrame(
    data_tests,
    index=['MMSE', 'FAQ', 'ADAS13'],
    columns=['MCI mean', 'MCI std'],
)
tests

Unnamed: 0,MCI mean,MCI std
MMSE,26.10535,3.151908
FAQ,6.545045,6.317796
ADAS13,20.546238,8.568647


## Statistics for MCI converters vs. non-converters

In [23]:
# Shorter names for the two MCI subsets. These are plain aliases of the
# existing frames (no copy is made).
data_conv, data_no_conv = converters, data_mci

511 116
709 211


In [24]:
# Mean and standard deviation of 'Age' over the rows of the converter
# and non-converter subsets
conv_age = data_conv['Age']
no_conv_age = data_no_conv['Age']
print('Mean age for MCI conv is', conv_age.mean(), 'and std is', conv_age.std())
print('Mean age for MCI no conv is', no_conv_age.mean(), 'and std is', no_conv_age.std())

Mean age for MCI conv is 75.48062622309199 and std is 7.191042657990322
Mean age for MCI no conv is 75.93258110014095 and std is 7.218369279253879


In [35]:
# Number of distinct SubjectID values per sex among converters and non-converters
conv_male_ids = data_conv.loc[data_conv['Sex'] == 'M', 'SubjectID'].unique()
conv_female_ids = data_conv.loc[data_conv['Sex'] == 'F', 'SubjectID'].unique()
no_conv_male_ids = data_no_conv.loc[data_no_conv['Sex'] == 'M', 'SubjectID'].unique()
no_conv_female_ids = data_no_conv.loc[data_no_conv['Sex'] == 'F', 'SubjectID'].unique()

print('Number of male for MCI conv is', len(conv_male_ids), 'and female', len(conv_female_ids))
print('Number of male for MCI no conv is', len(no_conv_male_ids), 'and female', len(no_conv_female_ids))

Number of male for MCI conv is 72 and female 44
Number of male for MCI no conv is 139 and female 72


In [26]:
# Summarise each score column for converters and non-converters: one inner
# list per score, ordered as conv mean, conv std, no-conv mean, no-conv std.
data_tests = [
    [
        data_conv[score].mean(),
        data_conv[score].std(),
        data_no_conv[score].mean(),
        data_no_conv[score].std(),
    ]
    for score in ('MMSE Total Score', 'FAQ Total Score', 'TOTALMOD')
]

In [27]:
# Wrap the converter / non-converter score summary in a labelled table and display it.
# The 'TOTALMOD' row is shown under the label 'ADAS13'.
tests = pd.DataFrame(
    data_tests,
    index=['MMSE', 'FAQ', 'ADAS13'],
    columns=['conv mean', 'conv std', 'no conv mean', 'no conv std'],
)
tests

Unnamed: 0,conv mean,conv std,no conv mean,no conv std
MMSE,24.586275,3.284094,27.204255,2.540185
FAQ,10.455471,6.318799,3.440404,4.261681
ADAS13,25.112309,8.445865,17.255317,7.009778


In [28]:
# Write the non-converter and converter subsets to CSV (header row, no index column).
# to_csv returns None when a path is given, so both export_csv_* names hold None.
mci_export_dir = r'/analysis/ritter/projects/AD/Budding_Spectral_Analysis/data/'
export_csv_mci_no_conv = data_no_conv.to_csv(mci_export_dir + 'ADNI_mci_no_conv.csv', index=None, header=True)
export_csv_mci_conv = data_conv.to_csv(mci_export_dir + 'ADNI_mci_conv.csv', index=None, header=True)