```
This file is part of Estimation of Causal Effects in the Alzheimer's Continuum (Causal-AD).

Causal-AD is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Causal-AD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Causal-AD. If not, see <https://www.gnu.org/licenses/>.
```

# Prepare UKB Data

- Load volume and thickness measurements
- Merge measurements of certain areas
- Divide volumes by TIV
- Apply Box-Cox transform to each measurement
- Standardize each measurement to zero mean and unit variance

In [None]:
import logging
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

from causalad.ukb import data
from causalad.ukb.estimate import fit_regress_out


logging.basicConfig(level=logging.INFO)

In [None]:
# define the parameters

# Path of the CSV file handed to data.UKBDataLoader.
csv_file: str = "ukb-data.csv"
# Directory into which the resulting "ukb_data_t.h5" file is written.
output_dir: str = "."
# Number of KMeans clusters used when generating the unobserved confounder.
num_sites: int = 3
# Random state passed to both the TSNE embedding and the KMeans clustering.
seed: int = 21012171

In [None]:
# Read the measurements from `csv_file`; drop_outliers=False presumably keeps
# all rows (confirm against causalad.ukb.data.UKBDataLoader).
loader = data.UKBDataLoader(csv_file, drop_outliers=False)
# load_freesurfer() returns three objects, unpacked here as the volume table,
# the thickness table, and the demographics table.
vols, thicks, demos = loader.load_freesurfer()

# Show the dimensions of both measurement tables.
vols.shape, thicks.shape

## Volume

### FreeSurfer

In [None]:
# Spearman rank correlation between every pair of volume measurements.
cor_mat = vols.corr(method="spearman")
# Zero the diagonal (self-correlation) before clustering. DataFrame.mask is
# used instead of assigning through `cor_mat.values[...]`, because that only
# works while `.values` is a writable view of the frame, which is not the case
# when pandas Copy-on-Write is active (the array is read-only there).
cor_mat = cor_mat.mask(np.eye(len(cor_mat), dtype=bool), 0.0)
sns.clustermap(
    cor_mat, method="ward", metric="euclidean", annot=True, figsize=(12, 12), cmap="RdBu_r",
)

# Free the temporary matrix; later cells reuse the name.
del cor_mat

## Thickness

References:
- https://radiopaedia.org/articles/cingulate-gyrus
- http://braininfo.rprc.washington.edu/centraldirectory.aspx?ID=159

In [None]:
# Grouping information for the thickness columns, consumed by
# data.prune_by_group below (presumably a region-to-lobe mapping, going by the
# name; confirm in causalad.ukb.data).
lobes_map = data.get_lobes_map(thicks)

In [None]:
# Spearman rank correlation between every pair of thickness measurements.
cor_mat = thicks.corr(method="spearman")
# Zero the diagonal (self-correlation) before clustering. DataFrame.mask is
# used instead of assigning through `cor_mat.values[...]`, because that only
# works while `.values` is a writable view of the frame, which is not the case
# when pandas Copy-on-Write is active (the array is read-only there).
cor_mat = cor_mat.mask(np.eye(len(cor_mat), dtype=bool), 0.0)
sns.clustermap(cor_mat, method="ward", metric="euclidean",
               row_cluster=False,
               # Optional colour bar per column, currently disabled:
               # col_colors=lobes_map.loc[:, "color"],
               square=True, annot=True, figsize=(19, 19), cmap="RdBu_r")

# Free the temporary matrix; later cells reuse the name.
del cor_mat

## Prune redundant measurements

In [None]:
# Reduce the thickness table using the groups in `lobes_map`; the exact
# pruning rule is defined in data.prune_by_group (not visible here).
thicks_pruned = data.prune_by_group(thicks, lobes_map)

In [None]:
# Spearman rank correlation between the thickness measurements left after pruning.
cor_mat = thicks_pruned.corr(method="spearman")
# Zero the diagonal (self-correlation) before clustering. DataFrame.mask is
# used instead of assigning through `cor_mat.values[...]`, because that only
# works while `.values` is a writable view of the frame, which is not the case
# when pandas Copy-on-Write is active (the array is read-only there).
cor_mat = cor_mat.mask(np.eye(len(cor_mat), dtype=bool), 0.0)
sns.clustermap(cor_mat, method="ward", metric="euclidean",
               row_cluster=False,
               # Optional colour bar per column, currently disabled:
               # col_colors=lobes_map.loc[:, "color"],
               square=True, annot=True, figsize=(19, 19), cmap="RdBu_r")

# Free the temporary matrix once the figure has been drawn.
del cor_mat

Combine Volume and Thickness Measures.

In [None]:
# Place the volume columns and the pruned thickness columns side by side
# (axis=1 concatenates column-wise, aligning rows on the index).
ukb_data = pd.concat((vols, thicks_pruned), axis=1)

# Show the dimensions of the combined table.
ukb_data.shape

## Transform Data

Transform the data so that each measurement is approximately normally distributed.

In [None]:
def plot_normality_check(data):
    """Draw a probability plot for every column of ``data``.

    The panels are arranged on a grid with five columns and as many rows as
    needed; all panels share their x- and y-axis. Each panel is produced by
    ``scipy.stats.probplot`` with its default settings and is titled with the
    column name.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are checked one by one. Inside this function the
        name shadows the imported ``causalad.ukb.data`` module.
    """
    n_features = data.shape[1]
    n_cols = 5
    n_rows = int(np.ceil(n_features / n_cols))
    _, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 3,  n_rows * 3),
                          sharex=True, sharey=True)
    # DataFrame.items() yields (column name, column) pairs; it replaces
    # iteritems(), which no longer exists in pandas >= 2.0.
    for (name, values), ax in zip(data.items(), axs.flat):
        stats.probplot(values, plot=ax)
        ax.set_title(name)

In [None]:
# normalize volumes by dividing by eTIV
# Keep the eTIV column aside, then remove it from the feature table.
tiv = ukb_data.loc[:, "eTIV"]
ukb_data_t = ukb_data.drop("eTIV", axis=1)
# Every column whose name does not end in "_thickness" is treated as a volume.
vols_mask = ~ukb_data_t.columns.str.endswith("_thickness")
# axis=0 aligns `tiv` on the row index, so each row's volume columns are
# divided by that row's eTIV value; thickness columns are left untouched.
ukb_data_t.loc[:, vols_mask] = ukb_data_t.loc[:, vols_mask].div(tiv, axis=0)

# The mask is only needed for the division above.
del vols_mask

In [None]:
# Replace the table by its transformed version. The second return value is
# kept so it can be stored alongside the data (presumably the fitted
# per-column transforms; confirm in causalad.ukb.data.apply_transform).
ukb_data_t, ukb_transforms = data.apply_transform(ukb_data_t)

In [None]:
# Visual check: one probability plot per transformed column.
plot_normality_check(ukb_data_t)

## Confounders

### Unobserved Confounder

In [None]:
def generate_unobserved_confounder(
    vol_thick_data: pd.DataFrame, obs_conv: pd.DataFrame,
) -> pd.DataFrame:
    """Derive a cluster-based confounder from the measurements.

    The observed confounders are first regressed out of ``vol_thick_data``
    with ``fit_regress_out``. The result is embedded into two dimensions with
    t-SNE, the embedding is min-max scaled, and KMeans partitions it into
    ``num_sites`` clusters. Cluster ``k`` is mapped to the value ``k + 1``.
    As a side effect, a KDE joint plot of the scaled embedding, coloured by
    cluster, is drawn.

    The function reads the notebook-level variables ``seed`` and
    ``num_sites``.

    Parameters
    ----------
    vol_thick_data : pd.DataFrame
        Measurements to cluster; its index is reused for the returned frame.
    obs_conv : pd.DataFrame
        Observed confounders passed on to ``fit_regress_out``.

    Returns
    -------
    pd.DataFrame
        A single column named ``"unobserved_confounder"`` holding values from
        ``1`` to ``num_sites``.
    """
    # Remove the contribution of the observed confounders first.
    residuals = fit_regress_out(vol_thick_data, obs_conv)

    tsne = TSNE(
        n_components=2,
        learning_rate=10.0,
        perplexity=30,
        init="pca",
        random_state=seed,
    )
    tsne.fit(residuals.values)
    scaled_embedding = MinMaxScaler().fit_transform(tsne.embedding_)

    # NOTE(review): recent scikit-learn releases renamed algorithm="full" to
    # "lloyd"; confirm the value against the scikit-learn version in use.
    kmeans = KMeans(
        n_clusters=num_sites,
        init="k-means++",
        n_init=10,
        max_iter=1000,
        tol=1e-6,
        algorithm="full",
        random_state=seed,
    )
    kmeans.fit(scaled_embedding)
    cluster_ids = kmeans.predict(scaled_embedding)  # cluster by all features

    # Cluster index 0..num_sites-1 is translated to the value 1..num_sites.
    site_values = np.arange(1, num_sites + 1)
    result = pd.DataFrame(
        site_values[cluster_ids],
        index=vol_thick_data.index,
        columns=["unobserved_confounder"],
    )

    # Diagnostic figure of the embedding, one density per cluster.
    embedding_df = pd.DataFrame(scaled_embedding, columns=["Dim1", "Dim2"])
    embedding_df.loc[:, "site"] = [f"S{k}" for k in cluster_ids]
    sns.jointplot(data=embedding_df, x="Dim1", y="Dim2", hue="site", kind="kde")

    return result

In [None]:
# Build the unobserved confounder from the transformed measurements, using
# the output of data.get_volume_causes(demos) as the observed confounders.
conf_unobs = generate_unobserved_confounder(ukb_data_t, data.get_volume_causes(demos))

In [None]:
# Relative frequency of each confounder value (fractions sum to 1).
conf_unobs.value_counts(normalize=True)

## Write data

In [None]:
def write_data(data, tiv, transforms, confounders, filename):
    """Store the prepared tables in an HDF5 file.

    ``data`` is split by column name: columns ending in ``"_thickness"`` are
    written under the key ``"thickness"``, all remaining columns under
    ``"volumes"``. The other keys written are ``"tiv"``, ``"demographics"``,
    ``"transforms"`` and ``"confounders"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Combined volume and thickness table. Inside this function the name
        shadows the imported ``causalad.ukb.data`` module.
    tiv
        Values stored under ``"tiv"`` after being wrapped in a DataFrame.
    transforms
        Object stored under ``"transforms"``.
    confounders
        Object stored under ``"confounders"``.
    filename
        Path of the HDF5 file to open.

    Notes
    -----
    The ``"demographics"`` entry is not a parameter: it is read from the
    notebook-level variable ``demos``.

    NOTE(review): ``complib`` is given without ``complevel``; check the pandas
    ``HDFStore`` documentation on whether compression is applied in that case.
    """
    with pd.HDFStore(filename, complib="lzo") as h5:
        is_thickness = data.columns.str.endswith("_thickness")
        thickness_part = data.loc[:, is_thickness]
        volume_part = data.drop(columns=thickness_part.columns)

        h5.put("volumes", volume_part)
        h5.put("thickness", thickness_part)
        h5.put("tiv", pd.DataFrame(tiv))
        h5.put("demographics", demos)
        h5.put("transforms", transforms)
        h5.put("confounders", confounders)

In [None]:
# Write the transformed measurements, eTIV, the transforms and the generated
# confounder to "ukb_data_t.h5" inside `output_dir`.
write_data(
    ukb_data_t,
    tiv,
    ukb_transforms,
    conf_unobs,
    Path(output_dir) / "ukb_data_t.h5",
)