In [None]:
import os

from sklearn.preprocessing import minmax_scale
import datasets
import numpy as np
import pandas as pd
import meta_roi
from mask import NiiMask
from scipy.special import softmax
from draw_results import plot_correlation_joint

def get_weights(mask, csv_prefix, gmv=True):
    """Derive softmax ROI weights from meta-analysis effect sizes.

    Runs ``meta_roi.meta_gmv`` (when ``gmv`` is true) or ``meta_roi.meta_ct``
    with the leading arguments ``(2, 0, mask)`` and collects the
    ``total_effect_size`` of every returned ROI model.
    NOTE(review): ``2`` and ``0`` are presumably the two diagnostic labels
    being contrasted -- confirm against ``meta_roi``.

    Args:
        mask: Atlas mask object forwarded to the ``meta_roi`` routine.
        csv_prefix: Forwarded as ``csv_prefix`` to the ``meta_roi`` routine.
        gmv: Use the GMV meta-analysis when true, the CT one otherwise.

    Returns:
        tuple: ``(softmax_ess, es_df)``. ``es_df`` is a DataFrame with the
        columns ``roi`` and ``es`` sorted by ``roi``; ``softmax_ess`` is the
        softmax of the absolute effect sizes in that same order.
    """
    if gmv:
        roi_models = meta_roi.meta_gmv(2, 0, mask, csv_prefix=csv_prefix, save_nii=False)
    else:
        roi_models = meta_roi.meta_ct(2, 0, mask, csv_prefix=csv_prefix,
                                      save_nii=False, save_gii=False)

    # One row per ROI, ordered by ROI key so weights line up with sorted ROIs.
    es_df = pd.DataFrame({
        'roi': list(roi_models.keys()),
        'es': [roi_model.total_effect_size for roi_model in roi_models.values()],
    }).sort_values(by=['roi'])

    abs_effect_sizes = np.abs(es_df['es'].values)
    return softmax(abs_effect_sizes), es_df

def append_infos(center, target_label, center_names,
                person_names, MMSEs, ages, tivs,
                genders, origin_label):
    """Append the metadata of one centre's subjects with a given label.

    Every list argument is extended in place and also returned, so callers
    may either rely on the mutation or rebind the returned values.

    Args:
        center: Object exposing ``name``, ``get_by_label``, ``get_MMSEs``,
            ``get_ages``, ``get_tivs`` and ``get_males``.
        target_label: Label passed to each of the ``center`` accessors.
        center_names, person_names, MMSEs, ages, tivs, genders, origin_label:
            Running lists to extend.

    Returns:
        tuple: The seven lists, in the same order as the parameters.
    """
    persons = center.get_by_label(target_label)
    if not persons:
        # Nothing to add for this centre/label combination.
        return center_names, person_names, MMSEs, ages, tivs, genders, origin_label

    center_names.extend(center.name for _ in persons)
    person_names.extend(person.filename for person in persons)
    # Each accessor returns a sequence whose first element supports .tolist().
    MMSEs.extend(center.get_MMSEs(target_label)[0].tolist())
    ages.extend(center.get_ages(target_label)[0].tolist())
    tivs.extend(center.get_tivs(target_label)[0].tolist())
    genders.extend(center.get_males(target_label)[0].tolist())
    origin_label.extend(target_label for _ in persons)

    return center_names, person_names, MMSEs, ages, tivs, genders, origin_label

def get_features(all_features, prefix, center, target_label, pss=None, get_total=True, weights=None, avg='mean'):
    """Stack one centre's per-subject features and summarise them per subject.

    Args:
        all_features: Array of previously collected feature rows, or ``None``
            when nothing has been collected yet.
        prefix: Passed as ``prefix`` to ``center.get_csv_values``.
        center: Object exposing ``get_by_label`` and ``get_csv_values``.
        target_label: Label used to select the centre's subjects.
        pss: Running list of per-subject summary scores. It is extended in
            place when given; a fresh list is created when omitted.
        get_total: When true, summary scores are computed and returned.
        weights: Weight vector used when ``avg == 'weighted'``.
        avg: ``'mean'`` for the plain row mean, ``'weighted'`` for the dot
            product of each row with ``weights``. Any other combination
            (including ``'weighted'`` without weights) adds no scores.

    Returns:
        ``(all_features, pss)`` when ``get_total`` is true, otherwise only
        ``all_features``.
    """
    # A list default would be created once and shared by every call that
    # omits `pss`, silently accumulating scores across calls.
    if pss is None:
        pss = []

    persons = center.get_by_label(target_label)
    if persons:
        # Only the first returned value (the feature rows) is used here.
        features, *_ = center.get_csv_values(persons=persons, prefix=prefix, flatten=True)
        if get_total:
            if avg == 'mean':
                pss.extend(np.mean(features, axis=1).tolist())
            elif avg == 'weighted' and weights is not None:
                pss.extend(np.dot(feature, weights) for feature in features)

        if all_features is None:
            all_features = features
        else:
            all_features = np.vstack((all_features, features))

    if get_total:
        return all_features, pss
    return all_features

In [None]:
centers = datasets.load_centers_all()

# Pick the CSV locations for the chosen modality (GMV when `gmv` is true,
# otherwise the CT variants).
gmv = True
if gmv:
    origin_feature_prefix, ps_prefix, meta_csv_prefix = (
        'neurocombat_gmv2/{}.csv', 'ps_g_agt/{}.csv', 'roi_gmv_removed')
else:
    origin_feature_prefix, ps_prefix, meta_csv_prefix = (
        'neurocombat_ct2/{}.csv', 'ps_c_ag/{}.csv', 'roi_ct_removed')


# Per-subject metadata accumulators, filled centre by centre below.
center_names, person_names = [], []
MMSEs, ages, tivs = [], [], []
genders, origin_label = [], []

mask = NiiMask('./data/mask/rBN_Atlas_246_1mm.nii')
weights, es_df = get_weights(mask, meta_csv_prefix, gmv=gmv)

# Row-stacked feature matrices and their weighted per-subject summaries.
all_ps, pss = None, []
all_gmvs, gmv_ps = None, []

target_labels = [0, 1, 2]
for center in centers:
    for target_label in target_labels:
        (center_names, person_names, MMSEs, ages,
         tivs, genders, origin_label) = append_infos(
            center, target_label, center_names, person_names,
            MMSEs, ages, tivs, genders, origin_label)
        all_ps, pss = get_features(
            all_ps, ps_prefix, center, target_label, pss,
            get_total=True, weights=weights, avg='weighted')
        all_gmvs, gmv_ps = get_features(
            all_gmvs, origin_feature_prefix, center, target_label, gmv_ps,
            get_total=True, weights=weights, avg='weighted')

# One row per subject; the rows are in the same order as `all_ps`/`all_gmvs`.
# The 'Mean_*' columns hold the weighted sums computed above (avg='weighted').
data = {
    'Center_name': center_names,
    'Person_name': person_names,
    'MMSE': MMSEs,
    'Age': ages,
    'TIV': tivs,
    'gender': genders,
    'origin_label': origin_label,
    'Mean_PS': pss,
    'Mean_GMV': gmv_ps,
}

# Convert the dictionary into DataFrame
df = pd.DataFrame(data)

In [None]:
from scipy.stats import pearsonr
import seaborn as sns

a = es_df['es'].values.tolist()

x, y, hue = [], [], []

# For every label, correlate the group-mean ROI profile with the
# meta-analysis effect sizes and collect the points for a joint scatter plot.
for i in range(3):
    mean_nc_ps = np.mean(all_ps[df['origin_label']==i], axis=0)
    print(pearsonr(mean_nc_ps,a))

    x.extend(mean_nc_ps.tolist())
    y.extend(a)
    hue.extend([i] * len(mean_nc_ps))

data = {'x': x, "y": y, 'hue': hue}
data_df = pd.DataFrame(data)

import matplotlib.pyplot as plt
g = sns.lmplot(data=data_df,
               x='x', y='y', hue='hue',
               height=5,
               scatter_kws={'alpha':0.4},
               palette='Set2')
plt.show()

In [None]:
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram of the per-subject score, colored by diagnostic label.
sns.set_style("white")
sns.displot(
    df,
    x='Mean_PS',
    hue='origin_label',
    palette='Set2',
    height=5,
    aspect=2,
)
plt.show()

In [None]:
# Mean and standard deviation of the per-subject score for each group.
for group_name, group_label in (('NC', 0), ('MCI', 1), ('AD', 2)):
    group_scores = df['Mean_PS'][df['origin_label'].isin([group_label])]
    print(group_name)
    print(np.mean(group_scores))
    print(np.std(group_scores))

In [None]:
# Mean ROI profile of the label-2 subjects (the variable keeps its historical
# "nc" name); print the index and value of its minimum, then of its maximum.
mean_nc_ps = np.mean(all_ps[df['origin_label']==2], axis=0)
for extreme_index in (np.argmin(mean_nc_ps), np.argmax(mean_nc_ps)):
    print(extreme_index, mean_nc_ps[extreme_index])

In [None]:
# Display the value at column index 192 of the mean profile computed in the
# previous cell. NOTE(review): why index 192 is of interest is not recorded.
mean_nc_ps[192]

In [None]:
# Distribution of feature column 87 across subjects, split by label.
x = all_ps[:, 87]
hue = df['origin_label']
sns.displot(
    df,
    x=x,
    hue=hue,
    palette='Set2',
    bins=30,
    aspect=2,
)
plt.show()

In [None]:
# Distribution of feature column 57 across subjects, split by label.
x = all_ps[:, 57]
hue = df['origin_label']
sns.displot(
    df,
    x=x,
    hue=hue,
    palette='Set2',
    bins=30,
    aspect=2,
)
plt.show()

In [None]:
# Joint correlation plot of the per-subject score against MMSE.
# The axis label previously read 'Peasonal Score' (typo).
plot_correlation_joint(df['Mean_PS'], df['MMSE'], x_label='Personal Score', y_label='MMSE')

In [None]:
from scipy.stats import pearsonr
# Pearson correlation of every feature column with age. The column count is
# read from the array instead of hard-coding the atlas size (246).
for i in range(all_ps.shape[1]):
    r, p = pearsonr(all_ps[:, i], df['Age'])
    print(i, r, p)

In [None]:
# TIV against the weighted GMV sum.
plot_correlation_joint(
    df['TIV'],
    df['Mean_GMV'],
    x_label='TIV',
    y_label='Weighted Sum GMV',
)

In [None]:
# TIV against the per-subject score.
plot_correlation_joint(
    df['TIV'],
    df['Mean_PS'],
    x_label='TIV',
    y_label='Individual Score',
)

In [None]:
# Age (multiplied by 100 for display) against the per-subject score.
# NOTE(review): the factor suggests 'Age' is stored divided by 100 -- confirm.
plot_correlation_joint(
    df['Age']*100,
    df['Mean_PS'],
    x_label='Age',
    y_label='Individual Score',
)

In [None]:
# Age (multiplied by 100 for display) against the weighted GMV sum.
# NOTE(review): the factor suggests 'Age' is stored divided by 100 -- confirm.
plot_correlation_joint(
    df['Age']*100,
    df['Mean_GMV'],
    x_label='Age',
    y_label='Weighted Sum GMV',
)

# Classification

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import copy

k_fold = KFold(10)

X = copy.deepcopy(all_ps)
y = df['origin_label'].values
a = df['Center_name'].values
b = df['Person_name'].values
c = df['Mean_PS'].values

# shuffle: one permutation applied to features, labels and identifiers alike
p = np.random.permutation(len(X))
X, y, a, b, c = X[p], y[p], a[p], b[p], c[p]

f1s = []
new_center_names, new_person_names = [], []
new_labels, new_person_scores = [], []
for k, (train, test) in enumerate(k_fold.split(X, y)):
    # The scaler is fitted on the training fold only, then applied to the
    # held-out fold.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(X[train])
    x_test = scaler.transform(X[test])

    model = RandomForestClassifier()
    model.fit(x_train, y[train])
    y_pred = model.predict(x_test)

    f1 = f1_score(y[test], y_pred, average='macro')
    cm = confusion_matrix(y[test], y_pred)
    print(f'fold:{k}, f1:{f1}')
    print(cm)
    f1s.append(f1)

    # Keep the identifiers of the held-out subjects next to their predictions.
    new_center_names.extend(a[test].tolist())
    new_person_names.extend(b[test].tolist())
    new_labels.extend(y_pred.tolist())
    new_person_scores.extend(c[test].tolist())

predict_dict = {
    'Center_name': new_center_names,
    'Person_name': new_person_names,
    'Predict_label': new_labels,
}
# Attach the predictions to the subject table via (centre, person).
predict_df = pd.DataFrame(predict_dict).merge(df, on=['Center_name', 'Person_name'])
print(np.mean(f1s), np.std(f1s))

### Check Misclassified Personal Score

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Score distribution of subjects labelled 1 that were predicted as 0.
sub_df = predict_df.query('origin_label==1 & Predict_label==0')
sns.displot(
    data=sub_df,
    x='Mean_PS',
    palette='Set2',
)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Score distribution of subjects labelled 1 that were predicted as 2.
sub_df = predict_df.query('origin_label==1 & Predict_label==2')
sns.displot(
    data=sub_df,
    x='Mean_PS',
    palette='Set2',
)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Score distribution of subjects labelled 2 that were predicted as 0.
sub_df = predict_df.query('origin_label==2 & Predict_label==0')
sns.displot(
    data=sub_df,
    x='Mean_PS',
    palette='Set2',
)
plt.show()

In [None]:
# Per-subject score grouped by predicted label, over all subjects.
sns.boxenplot(
    data=predict_df,
    x='Predict_label',
    y='Mean_PS',
    palette='Set2',
)
plt.show()

In [None]:
# Per-subject score grouped by predicted label, label-1 subjects only.
sub_df = predict_df.query('origin_label==1')
sns.boxenplot(
    data=sub_df,
    x='Predict_label',
    y='Mean_PS',
    palette='Set2',
)
plt.show()

### Check Misclassified Age, gender, tiv, MMSE

In [None]:
# Age grouped by predicted label, label-1 subjects only.
sub_df = predict_df.query('origin_label==1')
sns.boxenplot(
    data=sub_df,
    x='Predict_label',
    y='Age',
    palette='Set2',
)
plt.show()

In [None]:
# TIV grouped by predicted label, label-1 subjects only.
sub_df = predict_df.query('origin_label==1')
sns.boxenplot(
    data=sub_df,
    x='Predict_label',
    y='TIV',
    palette='Set2',
)
plt.show()

In [None]:
# MMSE grouped by predicted label, label-1 subjects only.
sub_df = predict_df.query('origin_label==1')
sns.boxenplot(
    data=sub_df,
    x='Predict_label',
    y='MMSE',
    palette='Set2',
)
plt.show()

### Check Misclassified ADNI clinical features

In [None]:
# Use seaborn's "white" style for the figures drawn from here on.
sns.set_style("white")

def get_clinical_value(info_df, df, clinical_name):
    """Collect one clinical measure for the subjects listed in ``df``.

    Args:
        info_df: Clinical table whose index holds the subject identifiers.
        df: Table with a ``Person_name`` column naming the subjects to look up.
        clinical_name: Column of ``info_df`` to read.

    Returns:
        list[float]: Values in ``df`` row order. Subjects (or columns) that
        are missing from ``info_df``, string entries and NaN entries are
        skipped.
    """
    collected = []
    for _, subject in df.iterrows():
        try:
            raw = info_df.loc[subject['Person_name']][clinical_name]
        except KeyError:
            # Subject or column not present in the clinical table.
            continue
        if isinstance(raw, str) or np.isnan(raw):
            continue
        collected.append(float(raw))
    return collected

info_df = pd.read_csv('./data/center_info/ADNI/ADNIMERGE_BL.csv', index_col=0)
column_names = ['FAQ', 'FDG', 'ABETA', 'TAU', 'PTAU', 'ADAS11', 'ADAS13', 'ADASQ4',
                'RAVLT_immediate', 'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting']

# Label-1 subjects split by the label the classifier assigned to them.
sub_df1 = predict_df.query('origin_label==1 & Predict_label==0')
sub_df2 = predict_df.query('origin_label==1 & Predict_label==1')
sub_df3 = predict_df.query('origin_label==1 & Predict_label==2')

# One boxen plot per clinical measure, grouped by predicted label.
for column_name in column_names:
    values1 = get_clinical_value(info_df, sub_df1, column_name)
    values2 = get_clinical_value(info_df, sub_df2, column_name)
    values3 = get_clinical_value(info_df, sub_df3, column_name)
    values = values1 + values2 + values3
    predict_labels = [0] * len(values1) + [1] * len(values2) + [2] * len(values3)
    ax = sns.boxenplot(x=predict_labels, y=values, palette='Set2')
    ax.set_title(column_name)
    plt.show()

In [None]:
# Join the clinical table with the predictions and plot ABETA per label.
tmp_df = info_df.merge(
    predict_df,
    left_on='PTID',
    right_on='Person_name',
)
ax = sns.boxenplot(
    data=tmp_df,
    x='origin_label',
    y='ABETA',
    palette='Set2',
)
plt.show()

# Subtype

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans, AgglomerativeClustering, kmeans_plusplus
from sklearn.decomposition import NMF
from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA

tmpx, tmpy1, tmpy2, tmpy3 = [], [], [], []

# Label-1 and label-2 subjects only; each subject's row is min-max scaled.
input_features = all_ps[df['origin_label'].isin([1, 2])]
input_features = minmax_scale(input_features, axis=1)

# Score agglomerative clusterings for k = 2..9 with three internal indices.
for k in range(2, 10):
    method = AgglomerativeClustering(k)
    clustering = method.fit(input_features)

    labels = method.labels_
    sil = silhouette_score(input_features, labels)
    cal = calinski_harabasz_score(input_features, labels)
    dav = davies_bouldin_score(input_features, labels)
    tmpx.append(k)
    tmpy1.append(sil)
    tmpy2.append(cal)
    tmpy3.append(dav)

# One figure per index, plotted against the number of clusters.
for scores, curve_label, line_kwargs in (
        (tmpy1, 'silhouette', {'linewidth': 3}),
        (tmpy2, 'calinski', {}),
        (tmpy3, 'davies', {})):
    plt.plot(tmpx, scores, label=curve_label, **line_kwargs)
    plt.show()

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
import pickle

cluster = 4

# Features performed Clustering
# `.copy()` makes `sub_df` an independent frame; assigning a column to a
# filtered slice of `df` triggers SettingWithCopyWarning and is not guaranteed
# to behave consistently across pandas versions.
sub_df = df[df['origin_label'].isin([1,2])].copy()
all_features = all_ps
all_features = all_features[df['origin_label'].isin([1,2])]

# NOTE(review): no random_state is set, so cluster numbering and assignment
# can differ between runs.
method = KMeans(n_clusters=cluster)
clustering = method.fit(all_features)
# Rows of `sub_df` and `all_features` come from the same mask, so the labels
# line up positionally.
sub_df['Subtype_label'] = clustering.labels_.tolist()

In [None]:
out_dir = f'./results_0401/subtype/g_agt{cluster}'
# Create the result directory on first use; writing below fails otherwise.
os.makedirs(out_dir, exist_ok=True)
sub_df.to_csv(os.path.join(out_dir, 'subtype.csv'))

# Persist the fitted clustering object next to the labels.
with open(os.path.join(out_dir, 'clustering.pkl'), 'wb') as f:
    pickle.dump(clustering, f)

In [None]:
from scipy.stats import ttest_ind
from mask import NiiMask
from gene_analysis import plsr
import pickle

mask_path = './data/mask/rBN_Atlas_246_1mm.nii'
mask = NiiMask(mask_path)

sub_df = pd.read_csv(os.path.join(out_dir, 'subtype.csv'))
all_labels = sub_df['Subtype_label'].values

# Features performed T-test using existing subtype labels
all_features = all_ps
all_nc_features = all_features[df['origin_label'].values==0]
all_features = all_features[df['origin_label'].isin([1,2])]

# The per-subtype gene outputs are written into a 'gene' sub-directory.
os.makedirs(os.path.join(out_dir, 'gene'), exist_ok=True)

ls = np.unique(all_labels)
for l in ls:
    # Boolean selection replaces the row-by-row np.vstack accumulation, which
    # is quadratic. It also raises if the saved labels and the feature rows
    # differ in length instead of silently truncating.
    all_features_label = all_features[all_labels == l]
    ts, ps = ttest_ind(all_features_label, all_nc_features, axis=0)

    # Regions are keyed 1..n in feature-column order.
    all_ts = dict(zip(range(1, len(ts)+1), ts))
    model = plsr(all_ts, n_components=5,
                n_perm=1, n_boot=1,
                gene_path='./data/gene/expression.csv',
                out_path=os.path.join(out_dir,f'gene/{l}.csv'))
    with open(os.path.join(out_dir, f'gene/model_{l}.pkl'), 'wb') as f:
        pickle.dump(model, f)

    # Keep only t values whose p value passes 0.001 divided by the number of
    # regions (Bonferroni-style threshold); the rest are zeroed in the map.
    ts = [t if p<0.001/len(ts) else 0 for t, p in zip(ts, ps)]
    ts = dict(zip(range(1, len(ts)+1), ts))
    mask.save_values(ts, os.path.join(out_dir, f'subtype{l}.nii'))

In [None]:
# Reload each subtype's saved PLSR model and print its `varexp` and
# permutation p-values.
for l in ls:
    model_path = os.path.join(out_dir, f'gene/model_{l}.pkl')
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    print(model.varexp)
    print(model.permres.pvals)

# subtype plot

In [None]:
import pandas as pd
import pickle
import seaborn as sns

load_dir = f'./results_0401/subtype/g_agt4'
# Reload the saved subtype table; 'Age' is multiplied by 100 for display.
sub_df = (
    pd.read_csv(os.path.join(load_dir, 'subtype.csv'))
    .assign(Age=lambda frame: frame['Age']*100)
)
clustering_path = os.path.join(load_dir, 'clustering.pkl')
with open(clustering_path, 'rb') as f:
    method = pickle.load(f)

print(sub_df.head())

In [None]:
# Hex colors used for the subtype palettes and pie wedges in the plots below.
color = ['#3caf77', '#4d5aaf', '#ffd900', '#d15d55', '#ef7b1b', "#34a0b1"]

In [None]:
# Reset subtype values for visual display
# Renumber the raw 4-cluster labels 0..3 to display labels 1..4.
# The remap runs only while the column still holds exactly the raw labels, so
# re-running this cell (or running it on a table with another cluster count)
# leaves already-renumbered labels untouched.
subtype_display_map = {0: 4, 1: 3, 2: 1, 3: 2}
if set(sub_df['Subtype_label'].unique()) == set(subtype_display_map):
    sub_df['Subtype_label'] = sub_df['Subtype_label'].map(subtype_display_map)

In [None]:
# Reset subtype values for visual display
# Renumber the raw 4-cluster labels 0..3 to display labels 1..4.
# The remap runs only while the column still holds exactly the raw labels, so
# running this cell on an already-renumbered table is a no-op instead of
# shuffling the display labels a second time.
subtype_display_map = {0: 4, 1: 3, 2: 1, 3: 2}
if set(sub_df['Subtype_label'].unique()) == set(subtype_display_map):
    sub_df['Subtype_label'] = sub_df['Subtype_label'].map(subtype_display_map)

In [None]:
# Reset subtype values for visual display
# Renumber the raw 2-cluster labels 0..1 to display labels 1..2.
# Applied only when the column holds exactly the raw labels {0, 1}; on a
# table with more clusters the remap would merge distinct subtypes.
subtype_display_map = {0: 2, 1: 1}
if set(sub_df['Subtype_label'].unique()) == set(subtype_display_map):
    sub_df['Subtype_label'] = sub_df['Subtype_label'].map(subtype_display_map)

In [None]:
# Reset subtype values for visual display
# Renumber the raw 3-cluster labels 0..2 to display labels 1..3.
# Applied only when the column holds exactly the raw labels {0, 1, 2}, so the
# cell is a no-op on tables with another cluster count or on labels that were
# already renumbered.
subtype_display_map = {0: 3, 1: 2, 2: 1}
if set(sub_df['Subtype_label'].unique()) == set(subtype_display_map):
    sub_df['Subtype_label'] = sub_df['Subtype_label'].map(subtype_display_map)

In [None]:
# Reset subtype values for visual display
# Renumber the raw 6-cluster labels 0..5 to display labels 1..6.
# Applied only when the column holds exactly the raw labels {0, ..., 5}, so
# the cell is a no-op on tables with another cluster count or on labels that
# were already renumbered.
subtype_display_map = {0: 4, 1: 6, 2: 5, 3: 3, 4: 2, 5: 1}
if set(sub_df['Subtype_label'].unique()) == set(subtype_display_map):
    sub_df['Subtype_label'] = sub_df['Subtype_label'].map(subtype_display_map)

#### TSNE

In [None]:
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

# 2-D t-SNE embedding of the label-1 and label-2 subjects.
# NOTE(review): no random_state is set, so the embedding differs between runs.
tsne = TSNE(n_components=2)
input_features = np.array(all_ps[df['origin_label'].isin([1,2])])
embeded_features = tsne.fit_transform(input_features)
x, y = embeded_features.T[0], embeded_features.T[1]

In [None]:
# Persist the two embedding coordinates as separate .npy files.
np.save('./results_0401/GMV_tsne_x.npy', x)
np.save('./results_0401/GMV_tsne_y.npy', y)

In [None]:
# Pickle the fitted TSNE object to ./results_0401/tsne.pkl.
with open(os.path.join('./results_0401', 'tsne.pkl'), 'wb') as f:
    pickle.dump(tsne, f)


In [None]:
# Reload the pickled TSNE object. Only `tsne` is restored here; `x` and `y`
# are not read back by this cell.
# NOTE: pickle.load executes arbitrary code -- only load files you produced.
with open(os.path.join('./results_0401', 'tsne.pkl'), 'rb') as f:
    tsne = pickle.load(f)

In [None]:
# Embedding colored by original label.
hue = df['origin_label'][df['origin_label'].isin([1,2])]
label_palette = sns.color_palette(color, len(np.unique(hue)))
sns.relplot(x=x, y=y, hue=hue, palette=label_palette)
plt.show()

# Same embedding colored by subtype label.
hue = sub_df['Subtype_label']
subtype_palette = sns.color_palette(color, len(np.unique(hue)))
sns.relplot(x=x, y=y, hue=hue, palette=subtype_palette)
plt.show()

In [None]:
# Use TSNE xy to cluster
tmp = np.vstack([x,y]).T
method = AgglomerativeClustering(n_clusters=4)
clustering = method.fit(tmp)
tmp_label = clustering.labels_.tolist()

# Scatter of the embedding colored by the clusters found on it.
embedding_palette = sns.color_palette(color, len(np.unique(tmp_label)))
sns.relplot(x=x, y=y, hue=tmp_label, palette=embedding_palette)
plt.show()

#sub_df['Subtype_label'] = clustering.labels_.tolist()
#sub_df.to_csv(os.path.join(out_dir, 'subtype.csv'))

#### center pie plot

In [None]:
import numpy as np

def f(row):
    """Return the first four characters of the row's centre name."""
    center_name = row['Center_name']
    return center_name[:4]

# Tag each subject with its dataset, then count subjects per
# (dataset, subtype) and show the counts as percentages of the total.
sub_df['Dataset'] = sub_df.apply(f, axis=1)
array = sub_df.groupby(["Dataset","Subtype_label"]).size()
print(array)
print(array/np.sum(array)*100)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
radius = 1.5
size = 0.7

def f(row):
    """Return the first four characters of the row's centre name."""
    return row['Center_name'][:4]
sub_df['Dataset'] = sub_df.apply(f, axis=1)

array = sub_df.groupby(["Dataset","Subtype_label"]).size()

# Assign colors by the rank of the subtype label rather than by its value.
# Indexing `color` with the label itself skips the first color when labels
# start at 1, and disagrees with the plots that use
# sns.color_palette(color, k), which give the smallest label the first color.
subtype_colors = dict(zip(sorted(sub_df['Subtype_label'].unique()), color))

labels = []
cs = []
for i in array.index:
    a, b = i
    labels.append(f'{a}_{b}')
    cs.append(subtype_colors[b])

fig, ax = plt.subplots()
# Outer ring: one wedge per (dataset, subtype).
ax.pie(array,
        labels=labels,
        pctdistance=0.82,
        radius=radius, colors=cs,
        wedgeprops=dict(width=size, edgecolor='w'))
# Inner ring: the same groups further split by original label; the two greys
# are cycled over the wedges.
ax.pie(sub_df.groupby(["Dataset","Subtype_label",'origin_label']).size(),
        radius=radius-size, colors=['#dddddd', '#aaaaaa'],
        wedgeprops=dict(width=0.2, edgecolor='w'))
plt.show()

#### Gender Count

In [None]:
gender_color = ['#9dd5fe', '#fec69d']
edgecolor = ['#3cacfd', '#fd8d3c']

x = np.unique(sub_df['Subtype_label'])
width = 0.4
linewidth = 2
height = sub_df.groupby(["Subtype_label", 'origin_label', "gender"]).size()
# groupby().size() omits combinations with no subjects. Fill them with 0 so
# every subtype contributes the same number of entries; otherwise the
# fixed-stride slicing below silently shifts counts between groups.
height = height.reindex(
    pd.MultiIndex.from_product(height.index.levels, names=height.index.names),
    fill_value=0)
print(height)

# Entries are ordered by (subtype, origin_label, gender). With two labels and
# two gender values each subtype spans four consecutive entries.
# NOTE(review): the male/female naming assumes gender sorts female first
# (0 = female, 1 = male) -- confirm against the data loader.
mci_female_count = height[::4]
mci_male_count = height[1::4]
ad_female_count = height[2::4]
ad_male_count = height[3::4]

# Left bar of each pair: label-1 subjects (negative width grows leftwards).
plt.bar(x, mci_male_count, width=-width, align='edge',
            label='MCI_Male', color=gender_color[0], edgecolor=edgecolor[0],
            linewidth=linewidth)
plt.bar(x, mci_female_count, width=-width,
            bottom=mci_male_count, align='edge',
            label='MCI_Female', color=gender_color[1], edgecolor=edgecolor[1],
            linewidth=linewidth)
# Right bar of each pair: label-2 subjects.
plt.bar(x, ad_male_count, width=width, align='edge',
            label='AD_Male', color=gender_color[0], edgecolor=edgecolor[0],
            linewidth=linewidth)
plt.bar(x, ad_female_count, width=width,
            bottom=ad_male_count,align='edge',
            label='AD_Female', color=gender_color[1], edgecolor=edgecolor[1],
            linewidth=linewidth)

plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
#plt.savefig(os.path.join(out_dir, f'gender.jpg'))
plt.close()

#### All dataset Stats

In [None]:
from scipy.stats import ttest_ind

def plot_subtype_stat(df, name, out_dir, palette, boxen_width=0.6):
    """Plot one variable per subtype and append pairwise t-tests to a log.

    Saves a boxen plot to ``<out_dir>/<name>.jpg`` and appends one line per
    pair of subtypes to ``<out_dir>/t.txt``.

    Args:
        df: Table with a ``Subtype_label`` column and a numeric ``name`` column.
        name: Column to compare across subtypes.
        out_dir: Existing directory receiving the figure and ``t.txt``.
        palette: Palette forwarded to ``sns.boxenplot``.
        boxen_width: Box width forwarded to ``sns.boxenplot``.
    """
    ls = np.unique(df['Subtype_label'])
    # Values are keyed by the label itself. Indexing a list with `label - 1`
    # silently wraps to the last group when labels start at 0.
    clinical_features = {
        l: df.loc[df['Subtype_label'] == l, name].to_numpy(dtype=float)
        for l in ls
    }
    x = df['Subtype_label'].tolist()
    y = df[name].tolist()

    fig = plt.figure(figsize=(4,4))
    ax = fig.add_subplot()
    ax = sns.boxenplot(x=x, y=y, palette=palette, width=boxen_width, saturation=1, ax=ax)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    fig.savefig(os.path.join(out_dir, f'{name}.jpg'))
    plt.close(fig)

    with open(os.path.join(out_dir, "t.txt"), "a") as f:
        print(f'----------{name}--------------', file=f)
        # Every unordered pair of subtypes, in ascending label order.
        for position, l in enumerate(ls):
            for ll in ls[position + 1:]:
                array_1 = clinical_features[l]
                array_1 = array_1[~np.isnan(array_1)]
                array_2 = clinical_features[ll]
                array_2 = array_2[~np.isnan(array_2)]
                t, p = ttest_ind(array_1, array_2)
                print(name, l, ll, t, p, array_1.shape[0], array_2.shape[0], file=f)
        print(f'----------------------------', file=f)

out_dir = './results_0401/subtype/g_agt4'
# Same plot and test log for each of the three variables.
for stat_name in ('MMSE', 'Mean_PS', 'Age'):
    plot_subtype_stat(sub_df, stat_name, out_dir, sns.color_palette(color, 4))

#### ADNI Stats

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "white" style for the figures drawn from here on.
sns.set_style("white")

def get_clinical_value(info_df, df, clinical_name):
    """Collect one clinical measure for the subjects listed in ``df``.

    Args:
        info_df: Clinical table whose index holds the subject identifiers.
        df: Table with a ``Person_name`` column naming the subjects to look up.
        clinical_name: Column of ``info_df`` to read.

    Returns:
        list[float]: Values in ``df`` row order. Subjects (or columns) that
        are missing from ``info_df``, string entries and NaN entries are
        skipped.
    """
    kept_values = []
    for _, person_row in df.iterrows():
        try:
            entry = info_df.loc[person_row['Person_name']][clinical_name]
        except KeyError:
            # Subject or column not present in the clinical table.
            continue
        if isinstance(entry, str) or np.isnan(entry):
            continue
        kept_values.append(float(entry))
    return kept_values

info_df = pd.read_csv('./data/center_info/ADNI/ADNIMERGE_BL.csv', index_col=0)
column_names = ['FAQ', 'FDG', 'ABETA', 'TAU', 'PTAU', 'ADAS11', 'ADAS13', 'ADASQ4',
                'RAVLT_immediate', 'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting']

# Only subtypes 1, 2 and 3 are compared in this cell.
sub_df1 = sub_df.query('Subtype_label==1')
sub_df2 = sub_df.query('Subtype_label==2')
sub_df3 = sub_df.query('Subtype_label==3')

with open(os.path.join(out_dir, "t.txt"), "a") as f:
    for column_name in column_names:
        values1 = get_clinical_value(info_df, sub_df1, column_name)
        values2 = get_clinical_value(info_df, sub_df2, column_name)
        values3 = get_clinical_value(info_df, sub_df3, column_name)
        clinical_features = [values1, values2, values3]
        values = values1 + values2 + values3
        predict_labels = [1] * len(values1) + [2] * len(values2) + [3] * len(values3)

        # One boxen plot per clinical measure, saved rather than shown.
        fig = plt.figure(figsize=(4,4))
        ax = fig.add_subplot()
        ax = sns.boxenplot(x=predict_labels, y=values, palette=sns.color_palette(color, 3))
        ax.set_title(column_name)
        plt.savefig(f'./results_0401/tmp/{column_name}.png')
        plt.close()
        #plt.show()

        print(f'-------{column_name}----------', file=f)
        ls = [1, 2, 3]
        # Zero-based index pairs (i, j) with i < j into `clinical_features`.
        label_pairs = [[l-1, ll-1] for l in ls for ll in ls if ll > l]
        for label_pair in label_pairs:
            first_index, second_index = label_pair
            array_1 = np.array(clinical_features[first_index])
            array_1 = array_1[~np.isnan(array_1)]
            array_2 = np.array(clinical_features[second_index])
            array_2 = array_2[~np.isnan(array_2)]
            t, p = ttest_ind(array_1, array_2)

            print(column_name, first_index+1, second_index+1, t, p,
                  array_1.shape[0], array_2.shape[0], file=f)
        print(f'----------------------------', file=f)

In [None]:
# ADNI clinical statistics for every subtype present in `sub_df`.
# Fixes relative to the previous version of this cell:
#   * subtype labels live on `sub_df`; `df` has no 'Subtype_label' column;
#   * `palette` and `boxen_width` were never defined at notebook level;
#   * `fig.add_axes()` was called without its required rect (add_subplot is used);
#   * values are keyed by label instead of indexing a list with the label,
#     which breaks for labels that do not start at 0;
#   * the clinical table is read once instead of once per column.
info_df = pd.read_csv('./data/center_info/ADNI/ADNIMERGE_BL.csv', index_col=0)
ls = np.unique(sub_df['Subtype_label'])
palette = sns.color_palette(color, len(ls))
boxen_width = 0.6

with open(os.path.join(out_dir, "t.txt"), "a") as f:
    for column_name in column_names:
        clinical_features = {l: [] for l in ls}
        x = []
        y = []
        for label, row in sub_df.iterrows():
            try:
                series = info_df.loc[row['Person_name']]
                value = series[column_name]
            except KeyError:
                # Subject (or column) not present in the clinical table.
                continue
            if isinstance(value, str) or np.isnan(value):
                continue
            x.append(row['Subtype_label'])
            y.append(float(value))
            clinical_features[row['Subtype_label']].append(float(value))
        print([len(clinical_features[l]) for l in ls])
        fig = plt.figure(figsize=(3,4))
        ax = fig.add_subplot()
        ax = sns.boxenplot(x=x, y=y, palette=palette, width=boxen_width, saturation=1)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        plt.savefig(os.path.join(out_dir, f'{column_name}.jpg'))
        plt.close()

        print(f'-------{column_name}----------', file=f)
        # Every unordered pair of subtype labels.
        label_pairs = []
        for l in ls:
            for ll in ls:
                if ll > l:
                    label_pairs.append([l, ll])
        for label_pair in label_pairs:
            array_1 = np.array(clinical_features[label_pair[0]])
            array_1 = array_1[~np.isnan(array_1)]
            array_2 = np.array(clinical_features[label_pair[1]])
            array_2 = array_2[~np.isnan(array_2)]
            t, p = ttest_ind(array_1, array_2)

            print(label_pair, t, p, array_1.shape, array_2.shape, file=f)
        print(f'----------------------------', file=f)

# Personal scores vs ADNI PET

In [None]:
import abeta_pet
import pet_fdg
# First return value of each PET module's ttest_by_label(2, 0); the second is
# discarded. Later cells read these via `.values()`, so they are mappings.
# NOTE(review): presumably label 2 vs label 0 t statistics per region -- confirm.
abeta_t, _ = abeta_pet.ttest_by_label(2, 0)
fdg_t, _ =  pet_fdg.ttest_by_label(2, 0)

In [None]:
from scipy.stats import ttest_ind, pearsonr
from draw_results import plot_correlation_joint

load_dir = f'./results_0401/subtype/g_agt4'
all_features = all_ps
all_nc_features = all_features[df['origin_label'].values==0]
all_features = all_features[df['origin_label'].isin([1,2])]

sub_df = pd.read_csv(os.path.join(load_dir, 'subtype.csv'))

# Renumber the raw cluster labels 0..3 to the display labels 1..4 in a single
# pass (values outside the mapping are left as they are). This replaces the
# two-step detour through temporary labels 11..14.
subtype_display_map = {0: 4, 1: 3, 2: 1, 3: 2}
sub_df['Subtype_label'] = sub_df['Subtype_label'].map(
    lambda raw_label: subtype_display_map.get(raw_label, raw_label))

all_labels = sub_df['Subtype_label'].values
print(all_features.shape)
print(all_labels.shape)

ls = np.unique(all_labels)
for l in ls:
    # Boolean selection replaces the row-by-row np.vstack accumulation, which
    # is quadratic. It also raises if the saved labels and the feature rows
    # differ in length instead of silently truncating.
    all_features_label = all_features[all_labels == l]
    ts, ps = ttest_ind(all_features_label, all_nc_features, axis=0)
    # Compare the subtype-vs-label-0 t map with each PET t map.
    plot_correlation_joint(ts, list(abeta_t.values()),
                          x_label=f'Subtype{l}-NC ROI t-values', y_label='AD-NC Abeta ROI t-values',
                          save=True,
                          out_path=f'./results_0401/tmp/Subtype{l}_abeta.png')
    plot_correlation_joint(ts, list(fdg_t.values()),
                            x_label=f'Subtype{l}-NC ROI t-values', y_label='AD-NC FDG ROI t-values',
                            save=True,
                          out_path=f'./results_0401/tmp/Subtype{l}_FDG.png')

In [None]:
from abeta_pet import create_subject_df
# Build the subject table from abeta_pet and index it by the 'Name' column so
# subjects can be looked up by name; the last line displays the table.
subject_df = create_subject_df()
subject_df = subject_df.set_index('Name')
subject_df

In [None]:
from scipy.stats import pearsonr
rs, ps = [], []
i = 0
# Walk `df` and `all_ps` in lockstep (they share row order) and correlate each
# subject's profile with that subject's PET values.
for k, row in df.iterrows():
    pss = all_ps[i]
    i += 1
    try:
        pet_row = subject_df.loc[row['Person_name']]
        #if pet_row['Label'] == 1:
        # Everything from position 4 onwards is used as the PET values.
        pet_values = pet_row[4:].values
        r, p = pearsonr(pss, pet_values)
        rs.append(r)
        ps.append(p)
    except KeyError:
        # Subject has no entry in the PET table.
        continue

import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the per-subject correlation coefficients.
sns.displot(rs)
plt.show()

In [None]:
# Correlate each feature column with the matching PET column across subjects.
# Only subjects present in the PET table are used, and PET rows are fetched by
# subject name so both vectors refer to the same people in the same order.
# Correlating the full arrays pairs unrelated subjects (or fails outright)
# whenever the PET table does not cover the whole cohort in `df` order.
in_pet = df['Person_name'].isin(subject_df.index).values
pet_names = df['Person_name'].values[in_pet]
for i in range(all_ps.shape[1]):
    a = all_ps[in_pet, i]
    # PET columns are named '1'..'n', one-based.
    b = subject_df.loc[pet_names, f'{i+1}'].values.astype(float)
    r, p = pearsonr(a, b)
    print(r)

#### FDG

In [None]:
from pet_fdg import create_subject_df
# Build the subject table from pet_fdg (this rebinds `subject_df`) and index
# it by the 'Name' column; the last line displays the table.
subject_df = create_subject_df()
subject_df = subject_df.set_index('Name')
subject_df

In [None]:
from scipy.stats import pearsonr
rs, ps = [], []
i = 0
# Walk `df` and `all_ps` in lockstep (they share row order) and correlate each
# subject's profile with the values read from that subject's FDG CSV.
for k, row in df.iterrows():
    pss = all_ps[i]
    i += 1
    try:
        pet_row = subject_df.loc[row['Person_name']]
        fdg_df = pd.read_csv(pet_row['sum_path'])
        # The per-region values are read from the CSV's 'GMV' column.
        pet_values = fdg_df['GMV'].values
        r, p = pearsonr(pss, pet_values)
        rs.append(r)
        ps.append(p)
    except KeyError:
        # Subject has no entry in the FDG table.
        continue

import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the per-subject correlation coefficients.
sns.displot(rs)
plt.show()