```
This file is part of Estimation of Causal Effects in the Alzheimer's Continuum (Causal-AD).

Causal-AD is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Causal-AD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Causal-AD. If not, see <https://www.gnu.org/licenses/>.
```

# Prepare ADNI Data

In [None]:
import logging
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

from causalad.adni.data import AdniDataLoader, apply_transform
from causalad.ukb.data import combine_by_group, get_lobes_map

# Use seaborn's "whitegrid" style for the plots created below.
sns.set(style="whitegrid")

# Configure the root logger with level INFO.
logging.basicConfig(level=logging.INFO)

In [None]:
# Parameters
# Path of the ADNI CSV file handed to AdniDataLoader.
adni_csv_file: str = "adni-data.csv"
# Directory in which the output file "adni_data_t.h5" is created.
output_dir: str = "."

In [None]:
# Build the loader with drop_outliers=False and read the FreeSurfer-based
# features together with the outcome.
loader = AdniDataLoader(adni_csv_file, drop_outliers=False)
features, outcome = loader.load_freesurfer()

# Remove the Total_Ventricular_CSF column from the volume table in place.
features["volumes"].drop(columns=["Total_Ventricular_CSF"], inplace=True)

outcome.shape

In [None]:

# NOTE(review): this cell repeats the load-and-drop steps of the preceding
# cell; confirm whether that is intentional or a leftover duplicate.
features, outcome = loader.load_freesurfer()
features["volumes"].drop('Total_Ventricular_CSF', axis=1, inplace=True)

outcome.shape

## Outcome

In [None]:
# Density-normalized histogram of the outcome; bin edges are the integers 0..54.
_, ax = plt.subplots(figsize=(10, 6))
outcome.plot.hist(bins=np.arange(55), density=True, ax=ax)

# Remove the temporary name from the notebook namespace.
del ax

## Clinical

In [None]:
# Preview the first rows of the clinical table.
features["clinical"].head()

In [None]:
# Bar chart of the relative frequency of each PTGENDER value.
ax = features["clinical"].PTGENDER.value_counts(normalize=True).plot.bar(title="PTGENDER")

# Remove the temporary name from the notebook namespace.
del ax

In [None]:
_, axs = plt.subplots(1, 2, figsize=(9, 4))

# Left panel: density-normalized histogram of AGE with automatic binning.
axs[0].hist(features["clinical"].AGE.values, bins="auto", density=True)
axs[0].set_title("AGE")

# Right panel: probability plot of AGE (scipy's probplot compares against a
# normal distribution by default).
stats.probplot(features["clinical"].AGE.values, plot=axs[1])

# Remove the temporary name from the notebook namespace.
del axs

In [None]:
# Bar chart of the relative frequency of each ATN_status value.
ax = features["clinical"].ATN_status.value_counts(normalize=True).plot.bar(title="ATN_status")

# Remove the temporary name from the notebook namespace.
del ax

In [None]:
_, axs = plt.subplots(1, 2, figsize=(9, 4))

# Left panel: relative frequencies of PTEDUCAT, ordered by the PTEDUCAT value.
features["clinical"].PTEDUCAT.value_counts(normalize=True).sort_index().plot.bar(
    title="PTEDUCAT", ax=axs[0]
)
# Right panel: relative frequencies of EDU-ATTAIN, in value_counts() order.
# .loc is used because the column name contains a hyphen and so cannot be
# reached by attribute access.
features["clinical"].loc[:, "EDU-ATTAIN"].value_counts(normalize=True).plot.bar(
    title="EDU-ATTAIN", ax=axs[1]
)

# Remove the temporary name from the notebook namespace.
del axs

In [None]:
# One column of panels per biomarker (ABETA, TAU, PTAU):
#   row 0 - density-normalized histogram,
#   row 1 - probability plot of the raw values,
#   row 2 - probability plot of the log1p-transformed values.
_, axs = plt.subplots(3, 3, figsize=(14, 12), gridspec_kw={"hspace": 0.3})

# DataFrame.items() is used because DataFrame.iteritems() was removed in
# pandas 2.0; both yield (column name, Series) pairs.
biomarkers = features["clinical"].loc[:, ["ABETA", "TAU", "PTAU"]]
for i, (name, col) in enumerate(biomarkers.items()):
    axs[0, i].hist(col.values, bins="auto", density=True)
    axs[0, i].set_title(name)

    stats.probplot(col.values, plot=axs[1, i])

    stats.probplot(np.log1p(col.values), plot=axs[2, i])

# Remove the temporaries from the notebook namespace.
del i, axs, name, col, biomarkers

## Volume

In [None]:
# Spearman rank correlation between all volume measurements.
cor_mat = features["volumes"].corr(method="spearman")
# Zero the diagonal (self-correlation) before clustering. DataFrame.mask()
# returns a new frame, so this does not depend on `.values` being a writable
# view of the frame, which is not the case under pandas Copy-on-Write.
cor_mat = cor_mat.mask(np.eye(len(cor_mat), dtype=bool), 0.0)
sns.clustermap(
    cor_mat, method="ward", metric="euclidean", annot=True, figsize=(12, 12), cmap="RdBu_r",
)

# Remove the temporary name from the notebook namespace.
del cor_mat

## Thickness

References:
- https://radiopaedia.org/articles/cingulate-gyrus
- http://braininfo.rprc.washington.edu/centraldirectory.aspx?ID=159

In [None]:
# Lookup built from the thickness table by get_lobes_map; it is later passed to
# combine_by_group to group the thickness measurements.
lobes_map = get_lobes_map(features["thickness"])

# Put the "bankssts_thickness" entry in the "Temporal" group.
# NOTE(review): if lobes_map is a DataFrame, this assigns "Temporal" to every
# column of that row, not only to the lobe column - confirm this is intended.
lobes_map.loc["bankssts_thickness"] = "Temporal"

In [None]:
# Spearman rank correlation between all thickness measurements.
cor_mat = features["thickness"].corr(method="spearman")
# Zero the diagonal (self-correlation) before clustering. DataFrame.mask()
# returns a new frame, so this does not depend on `.values` being a writable
# view of the frame, which is not the case under pandas Copy-on-Write.
cor_mat = cor_mat.mask(np.eye(len(cor_mat), dtype=bool), 0.0)
sns.clustermap(cor_mat, method="ward", metric="euclidean",
               row_cluster=False,
#                col_colors=lobes_map.loc[:, "color"],
               square=True, annot=True, figsize=(19, 19), cmap="RdBu_r")

# Remove the temporary name from the notebook namespace.
del cor_mat

## Prune redundant measurements

In [None]:
# Reduce the thickness table using the grouping in lobes_map.
# NOTE(review): presumably measurements of the same group are merged into one
# column - confirm against causalad.ukb.data.combine_by_group.
thicks_pruned = combine_by_group(features["thickness"], lobes_map)

In [None]:
# Spearman rank correlation between the pruned thickness measurements.
cor_mat = thicks_pruned.corr(method="spearman")
# Zero the diagonal (self-correlation) before clustering. DataFrame.mask()
# returns a new frame, so this does not depend on `.values` being a writable
# view of the frame, which is not the case under pandas Copy-on-Write.
cor_mat = cor_mat.mask(np.eye(len(cor_mat), dtype=bool), 0.0)
sns.clustermap(cor_mat, method="ward", metric="euclidean",
               row_cluster=False,
#                col_colors=lobes_map.loc[:, "color"],
               square=True, annot=True, figsize=(19, 19), cmap="RdBu_r")

# Remove the temporary name from the notebook namespace.
del cor_mat

In [None]:
# Work on a copy of the feature collection so that `features` keeps the
# original thickness table.
# NOTE(review): presumably `features` is a dict, making this a shallow copy
# whose other tables are shared with `features` - confirm.
features_t = features.copy()
# Replace the full thickness table with the pruned one.
features_t["thickness"] = thicks_pruned

## Transform Data

Transform the data such that it is normally distributed.

In [None]:
# apply_transform returns the transformed tables and the transforms; both are
# written to disk at the end of the notebook.
adni_data_t, adni_transforms = apply_transform(features_t)

In [None]:
def plot_normality_check(data_dict):
    """Draw a probability plot for every float column of every table.

    For each entry of ``data_dict`` the entry name is printed and one figure
    is created, holding a grid of ``scipy.stats.probplot`` panels (five per
    row), one panel per float column and titled with the column name.

    Parameters
    ----------
    data_dict : mapping of str to pandas.DataFrame
        Tables to inspect. Columns that are not of float dtype are ignored;
        a table without any float column is skipped after its name is
        printed.
    """
    n_cols = 5
    for name, data in data_dict.items():
        print("===>", name)
        data = data.select_dtypes(include=[float])
        n_features = data.shape[1]
        if n_features == 0:
            # plt.subplots() rejects a grid with zero rows, so there is
            # nothing to draw for this table.
            continue

        n_rows = int(np.ceil(n_features / n_cols))
        # squeeze=False keeps `axs` two-dimensional even for a single row.
        _, axs = plt.subplots(
            n_rows, n_cols, figsize=(n_cols * 3, n_rows * 3),
            sharex=True, squeeze=False,
        )
        # DataFrame.items() is used because DataFrame.iteritems() was removed
        # in pandas 2.0; both yield (column name, Series) pairs.
        for (col_name, values), ax in zip(data.items(), axs.flat):
            stats.probplot(values, plot=ax)
            ax.set_title(col_name)

In [None]:
# Visually check the float columns of the transformed tables for normality.
plot_normality_check(adni_data_t)

## Write data

In [None]:
def write_data(data, transforms, outcome, filename):
    """Store the feature tables, the transforms and the outcome in an HDF5 file.

    Parameters
    ----------
    data : mapping
        Must provide the keys "volumes", "thickness", "tiv" and "clinical";
        each value is stored under its own key.
    transforms
        Object stored under the key "transforms".
    outcome
        Object with a ``to_frame()`` method; the resulting frame is stored
        under the key "outcome".
    filename
        Path of the HDF5 file opened with ``pandas.HDFStore``.
    """
    # NOTE(review): complib is passed without complevel; per the pandas docs a
    # complevel of None disables compression - confirm whether compression
    # was intended here.
    with pd.HDFStore(filename, complib="lzo") as store:
        # These tables are stored with the store's default format.
        for key in ("volumes", "thickness", "tiv"):
            store.put(key, data[key])

        # The clinical table is the only one stored in "table" format.
        store.put("clinical", data["clinical"], format="table")
        store.put("transforms", transforms)
        store.put("outcome", outcome.to_frame())

In [None]:
# Write the transformed tables, the transforms and the outcome to
# "adni_data_t.h5" inside output_dir.
write_data(
    adni_data_t,
    adni_transforms,
    outcome,
    Path(output_dir) / "adni_data_t.h5",
)