In [1]:
import numpy as np
import pandas as pd
import pathlib
from sklearn.decomposition import PCA

import h5io
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib qt


In [2]:
# Derivatives directory; the feature files are read from this same location.
DERIV_ROOT = pathlib.Path('/storage/store3/derivatives/biomag_hokuto_bids')
FEATURES_ROOT = DERIV_ROOT

# Both data locations below live under one shared directory.
_BIOMAG_DATA = pathlib.Path('/storage/store/data/biomag_challenge/Biomag2022')
BIDS_ROOT = _BIOMAG_DATA / 'biomag_hokuto_bids'
# ROOT holds 'hokuto_profile.xlsx', the workbook the helper functions read.
ROOT = _BIOMAG_DATA / 'biomag_hokuto'

RANDOM_STATE = 42

# Band name -> (lower edge, upper edge); units presumably Hz -- TODO confirm.
frequency_bands = dict(
    low=(0.1, 1),
    delta=(1, 4),
    theta=(4.0, 8.0),
    alpha=(8.0, 15.0),
    beta_low=(15.0, 26.0),
    beta_mid=(26.0, 35.0),
    beta_high=(35.0, 49),
)
# frequency_bands = {
#     "beta_mid": (26.0, 35.0)
# }


def get_subjects_labels(all_subjects):
    """Keep the subjects whose id names a known group and return their labels.

    A subject is kept when one of ``'control'``, ``'mci'`` or ``'dementia'``
    is first found at character index 4 of its id (i.e. right after a
    four-character prefix such as ``'sub-'``). Subjects matching none of the
    groups are silently dropped, so the outputs may be shorter than the input.

    Parameters
    ----------
    all_subjects : iterable of str
        Subject identifiers to classify.

    Returns
    -------
    train_subjects : list of str
        The retained identifiers, in input order.
    train_labels : list of str
        The group name of each retained identifier, aligned with
        ``train_subjects``.
    """
    groups = ('control', 'mci', 'dementia')
    kept, kept_labels = [], []
    for subject_id in all_subjects:
        for group in groups:
            # Same positional test for every group; stop at the first hit.
            if subject_id.find(group) == 4:
                kept.append(subject_id)
                kept_labels.append(group)
                break
    return kept, kept_labels


def get_site(labels, subjects):
    """Split the requested subjects by recording site and collect their ages.

    For every sheet of ``hokuto_profile.xlsx`` (located under ``ROOT``) named
    in ``labels``, each row's ``ID`` is turned into a ``'sub-...'`` identifier
    by dropping its first seven characters. Rows whose identifier is contained
    in ``subjects`` are filed under site ``'A'`` or ``'B'`` according to the
    ``Site`` column; rows with any other site value are skipped.

    Parameters
    ----------
    labels : iterable of str
        Sheet names to read from the workbook.
    subjects : container of str
        Identifiers to keep (tested with ``in``).

    Returns
    -------
    subjects_A, subjects_B : list of str
        Identifiers recorded at site A and site B, in sheet/row order.
    age_A, age_B : list
        ``Age`` values aligned with ``subjects_A`` and ``subjects_B``.
    """
    # site value -> (identifiers, ages) collected for that site
    per_site = {'A': ([], []), 'B': ([], [])}
    for sheet in labels:
        profile = pd.read_excel(ROOT / 'hokuto_profile.xlsx', sheet_name=sheet)
        for row in range(len(profile)):
            subject = 'sub-' + profile['ID'].iloc[row][7:]
            bucket = per_site.get(profile['Site'].iloc[row])
            if bucket is None or subject not in subjects:
                continue
            names, ages = bucket
            names.append(subject)
            ages.append(profile['Age'].iloc[row])
    (subjects_A, age_A), (subjects_B, age_B) = per_site['A'], per_site['B']
    return subjects_A, subjects_B, age_A, age_B


def get_subjects_age(age, labels):
    """List the subjects whose recorded age is at least ``age``.

    Reads each sheet of ``hokuto_profile.xlsx`` (located under ``ROOT``)
    named in ``labels`` and keeps the rows whose ``Age`` is ``>= age``. The
    kept ``ID`` values are converted to ``'sub-...'`` identifiers by dropping
    their first seven characters. The number of selected subjects is printed
    before returning.

    Parameters
    ----------
    age : number
        Inclusive lower bound on the ``Age`` column.
    labels : iterable of str
        Sheet names to read from the workbook.

    Returns
    -------
    list of str
        Selected identifiers, in sheet/row order.
    """
    selected = []
    for sheet in labels:
        profile = pd.read_excel(ROOT / 'hokuto_profile.xlsx', sheet_name=sheet)
        selected.extend(
            'sub-' + profile['ID'].iloc[row][7:]
            for row in range(len(profile))
            if profile['Age'].iloc[row] >= age
        )
    print(len(selected))
    return selected


In [3]:
# Sheets of the profile workbook to read, and the inclusive minimum age.
GROUPS = ['control', 'dementia', 'mci']
MIN_AGE = 50

all_subjects = get_subjects_age(MIN_AGE, GROUPS)
subjects_A, subjects_B, age_A, age_B = get_site(GROUPS, all_subjects)
# Site A subjects come first, then site B; later cells rely on this order
# when pairing labels with the site and age lists.
train_subjects, y = get_subjects_labels(subjects_A + subjects_B)

101


Looking at PSD (power spectral density) features

In [4]:
# `features` is indexed by subject id; stack the selected subjects' entries
# along a new leading axis so that row i belongs to train_subjects[i].
features = h5io.read_hdf5(FEATURES_ROOT / 'features_features_psd.h5')
X_psd = np.stack([features[sub] for sub in train_subjects], axis=0)

In [5]:
# Sanity check: axis 0 has one entry per subject in train_subjects.
X_psd.shape

(101, 321)

In [6]:
# Project the PSD features onto their first three principal components.
# random_state is only consulted by the randomized/arpack SVD solvers; it is
# passed so the projection stays reproducible if one of them gets selected.
pca = PCA(n_components=3, random_state=RANDOM_STATE)
X_PCA = pca.fit_transform(X_psd)

# Plotting frame for the 2-D view (first two components only).
# The 'site' and 'age' columns assume the rows of X_PCA follow the order
# subjects_A then subjects_B, which is how train_subjects / y were built.
df = pd.DataFrame({
    'x': X_PCA[:, 0],
    'y': X_PCA[:, 1],
    'label': y,
    'site': ['A']*len(subjects_A) + ['B']*len(subjects_B),
    'age': age_A + age_B
})

In [7]:
# 2-D view of the first two components: colour = diagnostic label,
# marker = recording site, marker area = age.
plt.figure(2)
scatter_ax = sns.scatterplot(
    data=df,
    x='x',
    y='y',
    hue='label',
    style='site',
    size='age',
    sizes=(40, 400),
    alpha=0.5,
)
scatter_ax.set_title('PCA of psd features')
scatter_ax.set_xlabel('1st component')
scatter_ax.set_ylabel('2nd component')
plt.show()

In [8]:
# 3-D view of the first three principal components.
fig = plt.figure(3)
ax = fig.add_subplot(projection='3d')

x_pca = X_PCA[:, 0]
y_pca = X_PCA[:, 1]
z_pca = X_PCA[:, 2]

# One colour per diagnostic label. A label missing from this table raises a
# KeyError here rather than a colour/point count mismatch inside scatter().
label_colors = {'control': 'b', 'dementia': 'r', 'mci': 'g'}
c = [label_colors[label] for label in y]

# Marker area grows quadratically with age between the two bounds.
# 50 matches the minimum age used when selecting subjects; 93 is presumably
# the oldest age in the cohort -- TODO confirm.
age_lo, age_hi = 50, 93
s = [100 * ((a - age_lo) / (age_hi - age_lo)) ** 2 for a in age_A + age_B]

ax.scatter(x_pca, y_pca, z_pca, c=c, s=s)
# Label the figure like the 2-D plot so it can be read on its own.
ax.set_title('PCA of psd features')
ax.set_xlabel('1st component')
ax.set_ylabel('2nd component')
ax.set_zlabel('3rd component')
plt.show()