-
Notifications
You must be signed in to change notification settings - Fork 1
/
ADvsHC_matching.py
320 lines (267 loc) · 14.2 KB
/
ADvsHC_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
"""
Non-Profit Open Software License 3.0 (NPOSL-3.0)
Copyright (c) 2023 Matteo Lai
@authors:
Matteo Lai, M.Sc, Ph.D. student
Dept. of Electrical, Electronic and Information Engineering – DEI "Guglielmo Marconi",
University of Bologna, Bologna, Italy.
Email address: matteo.lai3@unibo.it
Chiara Marzi, Ph.D., Junior Assistant Professor
Department of Statistics, Computer Science, Applications "G. Parenti" (DiSIA)
University of Firenze, Firenze, Italy
Email address: chiara.marzi@unifi.it
Luca Citi, Ph.D., Professor
School of Computer Science and Electronic Engineering
University of Essex, Colchester, UK
Email address: lciti@essex.ac.uk
Stefano Diciotti, Ph.D, Associate Professor
Dept. of Electrical, Electronic and Information Engineering – DEI "Guglielmo Marconi",
University of Bologna, Bologna, Italy.
Email address: stefano.diciotti@unibo.it
"""
import pandas as pd
import numpy as np
import os
from pathlib import Path
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
np.random.seed(10)
lab_path = 'data\ADNI\ADNI_T1w_axslicez127_list.txt'
path_img = 'data\ADNI\ADNI_T1w_reg2std_axslicez127.nii.gz'
path_matching = 'data/ADNI/matching/all_data/'
path_txt_img = path_matching + 'ADNI_T1w_axslicez127_matched_list.txt'
# lab_path = 'data\ADNI\ADNI_T1w_clean_axslicez127_list.txt'
# path_img = 'data\ADNI\ADNI_T1w_reg2std_axslicez127_clean.nii.gz'
# path_matching = 'data/ADNI/matching/clean_data/'
# path_txt_img = path_matching + 'ADNI_T1w_axslicez127_clean_matched_list.txt'
path_csv = 'data/ADNI/ADNI.csv'
path_removed = path_matching + 'ADNI_removed.csv'
path_matched = path_matching + 'ADNI_matched.csv'
path_count_matched = path_matching + 'ADNI_count_matched.csv'
path_txt = path_matching + 'output_matching.txt'
img_path_before = path_matching + 'Hist_Before_Matching.JPG'
img_path_after = path_matching + 'Hist_After_Matching.JPG'
p_thresholdAD = 0.05
p_thresholdCN = 0.05
itermax = 100
# ----------------------------------------------
min_age_AD = 49
max_age_AD = 60
nM_R_AD = 1
nF_R_AD = 1
min_age_CN = 73
max_age_CN = 83
nM_R_CN = 0
nF_R_CN = 2
# ----------------------------------------------
if not os.path.exists(path_matching):
os.makedirs(path_matching)
if os.path.exists(path_txt):
os.remove(path_txt)
f = open(path_txt, "a")
############################################################################################################
# Create a dataframe with the 'Image Data ID' ordered coherently with the images in the nifti file:
############################################################################################################
# The file at 'lab_path' contains the name of the nifti images from where has been extracted the slices.
# The order of the paths in this file is coherent with the order of the slices saved in 'path_img'.
with open(lab_path) as l:
labels_txt = l.readlines()
# Create a dataframe that contains the 'Image Data ID' of each image, and the associated path.
lab_df = pd.DataFrame(columns=['ID','path'], index=range(len(labels_txt)))
for i, lab_p in enumerate(labels_txt):
# Example of lab_p: 'ADNI_002_S_0295_MR_MP-RAGE__br_raw_20060418193713091_1_S13408_I13722_reg2MNI_dof9_conform_reoriented_axslicez127.nii.gz\n'
p = Path(lab_p).stem
parts = p.split('_') # ['ADNI', '002', 'S', '0295', 'MR', 'MP-RAGE', '', 'br', 'raw', '20060418193713091', '1', 'S13408', 'I13722', 'reg2MNI', ...]
lab_df['ID'][i] = parts[-6] # 'I13722'
lab_df['path'][i] = lab_p
# Load the dataframe with some metadata:
ADNI_df = pd.read_csv(path_csv)
# Merge 'lab_df' and 'ADNI_df' in order to re-order the lines with the same order of the slices saved in 'path_img'
merged_df = lab_df.merge(ADNI_df, left_on=lab_df['ID'], right_on=ADNI_df['Image Data ID'], how='left')
# Note: To verify that the condition 'merged_df['path'] == labels_txt' is verified for every line:
# sum(merged_df['path'] == labels_txt) == len(labels_txt)
############################################################################################################
# Check age and sex matching:
############################################################################################################
AD_df = merged_df[merged_df['Group']=='AD']
CN_df = merged_df[merged_df['Group']=='CN']
# Visualize the distribution of the age to decide in which range remove controls:
age_AD = AD_df['Age']
age_CN = CN_df['Age']
print('\nLegend:', file=f)
print('T: p-value of T-test - to test the AGE matching', file=f)
print('MW: p-value of Mann-Withney U test - to test the AGE matching', file=f)
print('chi: p-value of the chi-square test - to test the SEX matching', file=f)
print('---------------------------------------------------------------', file=f)
####################################################################################################################
# REMOVE AD:
####################################################################################################################
plt.close('all')
# sns.kdeplot(age_AD, shade=True, legend=True, color="red", cmap="Reds", common_norm=False)
# sns.kdeplot(age_CN, shade=True, legend=True, cmap="Blues", common_norm=False)
sns.histplot(data=age_AD, binwidth=1, kde=True, legend=True, color="red")
sns.histplot(data=age_CN, binwidth=1, kde=True, legend=True, color="blue")
handles = [mpatches.Patch(facecolor=plt.cm.Reds(100), label='AD'),
mpatches.Patch(facecolor=plt.cm.Blues(100), label='CN')]
plt.title('Age distribution before matching')
# plt.title('Decide the range from where remove AD')
plt.legend(loc='best', handles=handles)
plt.xlim(0,100)
plt.ylim(0,max(CN_df.groupby(['Age']).count()['ID'])+5)
plt.savefig(img_path_before)
#plt.show()
print('\nSet the range of age in which remove AD patients:', file=f)
# min_age = input('Min: ')
# max_age = input('Max: ')
print('[{}, {}]'.format(min_age_AD,max_age_AD), file=f)
nM_AD = sum(AD_df['Sex']=='M')
nM_CN = sum(CN_df['Sex']=='M')
nF_AD = sum(AD_df['Sex']=='F')
nF_CN = sum(CN_df['Sex']=='F')
print('\nNumber of AD males: {}\nNumber of AD females: {}'.format(nM_AD, nF_AD), file=f)
print('Number of CN males: {}\nNumber of CN females: {}'.format(nM_CN, nF_CN), file=f)
# nM_R = input('Number of AD males to remove at each iteration: ')
# nF_R = input('Number of AD females to remove at each iteration: ')
print('\nIn each iteration remove {} M and {} F from AD.'.format(nM_R_AD, nF_R_AD), file=f)
tStat, T_pValue = ttest_ind(age_AD, age_CN, equal_var = False)
MW = mannwhitneyu(age_AD, age_CN)
MW_pValue = MW.pvalue
print('\nOriginal: T = {}'.format(T_pValue), file=f)
print(' MW = {}'.format(MW_pValue), file=f)
contigency_train = pd.crosstab(merged_df['Sex'], merged_df['Group'])
c, pvalue_sex_matched, dof, expected = chi2_contingency(contigency_train)
print(' chi2 = {}\n'.format(pvalue_sex_matched), file=f)
new_df = merged_df.copy()
rm_df = pd.DataFrame(columns=merged_df.columns)
# REMOVE AD PATIENTS:
i=0
while (min(T_pValue,MW_pValue)<p_thresholdAD) & (i<itermax):
# Separate males and females:
df_M = AD_df.query("Sex=='M'")
df_F = AD_df.query("Sex=='F'")
# Extract patients with AGE in the range [min_age, max_age]:
df_M_selected = df_M.query("{} < Age < {}".format(min_age_AD, max_age_AD))
df_F_selected = df_F.query("{} < Age < {}".format(min_age_AD, max_age_AD))
# Select the patients to be removed and memorize them
df_M_removed = df_M_selected.sample(n=int(nM_R_AD))
df_F_removed = df_F_selected.sample(n=int(nF_R_AD))
rm_df = pd.concat([rm_df,df_M_removed])
rm_df = pd.concat([rm_df,df_F_removed])
# Remove the selected subject from the CONT of the GROUP of interest:
rm_id = pd.DataFrame(rm_df['ID']).rename(columns={"ID": "ID_rm"})
new_df = merged_df.merge(rm_id, left_on='ID', right_on='ID_rm', how='left', indicator=True)
new_df = new_df[new_df['_merge'] == 'left_only']
# Update p-value for the new dataframe:
age_AD = new_df[new_df['Group']=='AD']['Age']
tStat, T_pValue = ttest_ind(age_AD, age_CN, equal_var = True)
MW = mannwhitneyu(age_AD, age_CN)
MW_pValue = MW.pvalue
print('Iteration ', i, ': T = {}'.format(T_pValue), file=f)
print(' MW = {}'.format(MW_pValue), file=f)
contigency_train = pd.crosstab(new_df['Sex'], new_df['Group'])
c, pvalue_sex_matched, dof, expected = chi2_contingency(contigency_train)
print(' chi2 = {}'.format(pvalue_sex_matched), file=f)
i+=1
####################################################################################################################
# REMOVE CN:
####################################################################################################################
# sns.kdeplot(age_AD, shade=True, legend=True, color="red", cmap="Reds")
# sns.kdeplot(age_CN, shade=True, legend=True, cmap="Blues")
sns.histplot(data=age_AD, binwidth=1, kde=True, legend=True, color="red")
sns.histplot(data=age_CN, binwidth=1, kde=True, legend=True, color="blue")
handles = [mpatches.Patch(facecolor=plt.cm.Reds(100), label='AD'),
mpatches.Patch(facecolor=plt.cm.Blues(100), label='CN')]
plt.title('Decide the range in which remove CN subjects:')
plt.legend(loc='best', handles=handles)
plt.xlim(0,100)
plt.ylim(0,max(CN_df.groupby(['Age']).count()['ID'])+5)
#plt.show()
print('\nSet the range of age in which remove CN subjects:', file=f)
# min_age = input('Min: ')
# max_age = input('Max: ')
print('[{}, {}]'.format(min_age_CN, max_age_CN), file=f)
nM_AD = sum(AD_df['Sex']=='M')
nM_CN = sum(CN_df['Sex']=='M')
nF_AD = sum(AD_df['Sex']=='F')
nF_CN = sum(CN_df['Sex']=='F')
print('\nNumber of AD males: {}\nNumber of AD females: {}'.format(nM_AD, nF_AD), file=f)
print('Number of CN males: {}\nNumber of CN females: {}'.format(nM_CN, nF_CN), file=f)
# nM_R = input('Number of CN males to remove at each iteration: ')
# nF_R = input('Number of CN females to remove at each iteration: ')
print('In each iteration remove {} M and {} F from CN.\n'.format(nM_R_CN, nF_R_CN), file=f)
i=0
while (min(T_pValue,MW_pValue, pvalue_sex_matched)<p_thresholdCN) & (i<itermax):
# Separate males and females:
df_M = CN_df.query("Sex=='M'")
df_F = CN_df.query("Sex=='F'")
# Extract patients with AGE in the range [min_age, max_age]:
df_M_selected = df_M.query("{} < Age < {}".format(min_age_CN, max_age_CN))
df_F_selected = df_F.query("{} < Age < {}".format(min_age_CN, max_age_CN))
# Select the patients to be removed and memorize them
df_M_removed = df_M_selected.sample(n=int(nM_R_CN))
df_F_removed = df_F_selected.sample(n=int(nF_R_CN))
rm_df = pd.concat([rm_df,df_M_removed])
rm_df = pd.concat([rm_df,df_F_removed])
# Remove the selected subject from the CONT of the GROUP of interest:
rm_id = pd.DataFrame(rm_df['ID']).rename(columns={"ID": "ID_rm"})
new_df = merged_df.merge(rm_id, left_on='ID', right_on='ID_rm', how='left', indicator=True)
new_df = new_df[new_df['_merge'] == 'left_only']
# Update p-value for the new dataframe:
age_CN = new_df[new_df['Group']=='CN']['Age']
tStat, T_pValue = ttest_ind(age_AD, age_CN, equal_var = True)
MW = mannwhitneyu(age_AD, age_CN)
MW_pValue = MW.pvalue
print('Iteration ', i, ': T = {}'.format(T_pValue), file=f)
print(' MW = {}'.format(MW_pValue), file=f)
contigency_train = pd.crosstab(new_df['Sex'], new_df['Group'])
c, pvalue_sex_matched, dof, expected = chi2_contingency(contigency_train)
print(' chi2 = {}'.format(pvalue_sex_matched), file=f)
i+=1
plt.close('all')
# sns.kdeplot(age_AD, shade=True, legend=True, color="red", cmap="Reds")
# sns.kdeplot(age_CN, shade=True, legend=True, cmap="Blues")
sns.histplot(data=age_AD, binwidth=1, kde=True, legend=True, color="red")
sns.histplot(data=age_CN, binwidth=1, kde=True, legend=True, color="blue")
handles = [mpatches.Patch(facecolor=plt.cm.Reds(100), label='AD'),
mpatches.Patch(facecolor=plt.cm.Blues(100), label='CN')]
plt.title('Age distribution after matching')
plt.legend(loc='best', handles=handles)
plt.xlim(0,100)
plt.ylim(0,max(CN_df.groupby(['Age']).count()['ID'])+5)
#plt.show()
plt.savefig(img_path_after)
# Save themathed data in csv:
rm_df.to_csv(path_removed)
new_df.to_csv(path_matched)
# SUBJECTS COUNT:
images_xSubject = new_df.groupby(by=['Subject']).count()['Image Data ID']
images_xSubject.to_csv(path_count_matched)
n_subjects = len(images_xSubject)
n_AD = len(new_df[new_df['Group']=='AD'].groupby(by=['Subject']).count())
n_AD_images = len(new_df[new_df['Group']=='AD'])
n_CN = len(new_df[new_df['Group']=='CN'].groupby(by=['Subject']).count())
n_CN_images = len(new_df[new_df['Group']=='CN'])
n_AD_rm_img = len(rm_df[rm_df['Group']=='AD'])
n_AD_rm = len(rm_df[rm_df['Group']=='AD'].groupby(by=['Subject']).count())
n_CN_rm_img = len(rm_df[rm_df['Group']=='CN'])
n_CN_rm = len(rm_df[rm_df['Group']=='CN'].groupby(by=['Subject']).count())
print('\n', file=f)
print('Total number of subjects before matching: {}'.format(len(merged_df.groupby(by=['Subject']).count())), file=f)
print('Total number of subjects after matching: {} di cui AD patients: {} e CN subjects: {}'.format(len(new_df.groupby(by=['Subject']).count()), n_AD,n_CN), file=f)
print('\n', file=f)
print('Total number of images before matching: {}'.format(len(merged_df)), file=f)
print('Total number of images after matching: {} di cui AD images: {} e CN images: {}'.format(len(new_df), n_AD_images, n_CN_images), file=f)
print('The {} images ({} AD, {} CN) removed for age- and sex-matching between CN subjects and AD patients belong to {} subjects ({} AD patients and {} CN subjects)'.format(len(rm_df), n_AD_rm_img, n_CN_rm_img, len(rm_df.groupby(by=['Subject']).count()), n_AD_rm, n_CN_rm), file=f)
f.close()
# -----------------------------------
# Save the paths pf the image to concatenate:
if os.path.exists(path_txt_img):
os.remove(path_txt_img)
img_paths = open(path_txt_img, "a")
for i in new_df['path'].index:
print(new_df['path'][i][:-1], file=img_paths)
img_paths.close()