# Initialization

## Packages

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

from itertools import product

from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import roc_auc_score, roc_curve

## Environment

In [2]:
# Environment variables
PROJECTPATH = os.getenv('PROJECTPATH')

## Functions

In [3]:
def build_cluster_pairs(k):

    pairs = []
    for pair in product(k, k):
        pair = tuple(sorted(pair))
        if pair[0] != pair[1]:
            if pair not in pairs:
                pairs.append(pair)
                
    return pairs


def prepare_data(clusters, scores, demographics, nk = 2, threshold = 0.5, return_features = False):
    
    nk_col = 'nk{}'.format(nk)
    
    # Compute completion rate for the different assessments
    nparticipants = scores.shape[0]
    completion = dict()
    for col, vals in scores.items():
        if col != 'Subject_ID':
            completion[col] = vals.notna().sum()/nparticipants
            
    # Filter scales above completion threshold
    scales = [col for col, val in completion.items() if val > threshold]        
    
    # Get the scores for the subset of scales
    scores_subset = scores[scales].copy()
    scores_subset['Subject_ID'] = scores['Subject_ID']
    
    # Join scores and cluster information
    clusters_scores = clusters.copy()
    clusters_scores = (clusters_scores
     .rename(columns = {'ID':'file'})
     .loc[:, ['file', nk_col]]
     .merge(demographics, how = 'left', on = 'file'))

    # Filter for POND
    clusters_scores = clusters_scores.loc[clusters_scores['Dataset'] == 'POND']

    # Clean up IDs for merging
    clusters_scores['Subject_ID'] = (clusters_scores['Subject_ID']
                                     .str.replace('sub-', '')
                                     .astype(int))

    # Merge clusters to scores
    clusters_scores = (clusters_scores
                       .loc[:, ['Subject_ID', nk_col]]
                       .merge(scores_subset, on = 'Subject_ID', how = 'left'))

    # Keep only complete observations
    clusters_scores = clusters_scores.dropna()
    
    # Create the input matrix and binary targets
    X = clusters_scores.drop(['Subject_ID', nk_col], axis = 1)
    features = X.columns.to_list()
    X = X.to_numpy()
    y = np.array(clusters_scores[nk_col], dtype = int)-1
    
    if return_features:
        return X,y,features
    else:
        return X,y
    
    
def run_plsda(X, y):
    
    # Maximum number of possible components
    max_components = X.shape[1]-1
    
    # Range of model components
    component_range = range(2, max_components)

    # Iterate over model components
    auc = []
    for nc in component_range:

        # Initiatlize the PLSR module
        plsr = PLSRegression(n_components = nc, scale = True)

        # Fit the model to the data
        plsr.fit(X, y)

        # Predict cluster labels
        y_pred = plsr.predict(X)

        # Clamp the interval since this isn't a proper classifier
        y_pred[y_pred > 1.0] = 1.0
        y_pred[y_pred < 0] = 0

        # Compute AUC
        auc.append(roc_auc_score(y, y_pred))

    # Store AUC information in data frame
    df_auc = pd.DataFrame({'components':component_range, 'auc':auc})

    return df_auc
    
    
def export_scores_loadings(X, y, features, labels):    

    labels = pair

    # Number of scales
    nscales = X.shape[1]

    # Initialize the PLSR
    plsr = PLSRegression(n_components = 2, scale = True)

    # Fit the model
    plsr.fit(X, y)

    # Get scores and loadings
    X_pls = plsr._x_scores
    loadings = plsr.x_loadings_

    X_PLS_norm = np.sqrt(np.sum(X_pls**2, axis = 1))
    loadings_norm = np.sqrt(np.sum(loadings**2, axis = 1))

    X_PLS_norm_max = np.max(X_PLS_norm)
    loadings_norm_max = np.max(loadings_norm)
    scale_factor = X_PLS_norm_max/loadings_norm_max

    loadings = loadings*scale_factor

    df_pls = pd.DataFrame(X_pls, columns=['x', 'y'])
    df_pls['cluster'] = [labels[i] for i in y]

    outfile = 'plsda_{}_scores.csv'.format('_'.join(pair))
    outfile = os.path.join(output_dir, outfile)
    df_pls.to_csv(outfile, index = False)

    df_loadings = pd.DataFrame(loadings, columns=['x', 'y'])
    df_loadings['features'] = features

    outfile = 'plsda_{}_loadings.csv'.format('_'.join(pair))
    outfile = os.path.join(output_dir, outfile)
    df_loadings.to_csv(outfile, index = False)

---
# POND analysis

## Data preparation

In [4]:
# Parameter set ID
params_id = 700

# Output directory
output_dir = 'figure_supplementary'
output_dir = os.path.join(PROJECTPATH, 'figures', 'v3', output_dir)

# Input directories
registration_dir = 'data/human/registration/v3/'
pipeline_dir = 'data/human/derivatives/v3/'

registration_dir = os.path.join(PROJECTPATH, registration_dir)
pipeline_dir = os.path.join(PROJECTPATH, pipeline_dir, str(params_id))

# Demographics file
demographics = os.path.join(registration_dir, 'subject_info', 'demographics.csv')
demographics = pd.read_csv(demographics)

# POND clinical scores
scores = os.path.join(registration_dir, 'subject_info', 'POND', 'POND_clinical_scores_20230915.csv')
scores = pd.read_csv(scores)

# Cluster solutions
cluster_dir = os.path.join(pipeline_dir, 'clusters', 'resolution_3.0')
cluster_file = os.path.join(cluster_dir, 'clusters.csv')
clusters = pd.read_csv(cluster_file)

# Drop columns
cols_to_drop = ['Unnamed: 0', 'site', 'SUB_ID', 
                'DOB', 'PRIMARY_DIAGNOSIS', 
                'RESEARCH_CONFIRM_DIAG', 
                'HSHLD_INCOME_STD', 
                'PRMY_CGVR_STD',
               'SWANPDOC', 'TPOCSPDOC']
scores = scores.drop(cols_to_drop, axis = 1)

# Drop columns containing the following strings
strings_to_drop = ['NSI', 'ETHNCTY', 'EDUC']
for s in strings_to_drop:
    scores = scores.loc[:, ~scores.columns.str.contains(s)]

# Rename the subject ID column for merging
scores = scores.rename(columns = {'subject':'Subject_ID'})

# Assign NaN to missing values 999 code
for col, vals in scores.items():
    x = vals.copy()
    x[x == 999] = np.nan
    scores[col] = x

## Pairwise PLS-DA across cluster solutions

In [5]:
# Maximum number of cluster solutions
nk_max = 10

# Cluster solutions
nk_list = list(range(2, nk_max+1))

# Completion thresholds
thresholds = [0.6, 0.8]

# Iterate over cluster solutions
df_results = pd.DataFrame()
for nk in nk_list:
    
    # Number of clusters
    klist = list(range(1, nk+1))

    # Pairs of clusters
    kpairs = build_cluster_pairs(k = klist)

    # Iterate over cluster pairs
    for k in kpairs:

        # Get cluster IDs
        cluster_ids = ['{}-{}'.format(nk, ki) for ki in k]

        # Decrement cluster labels by 1
        k = [x-1 for x in k]

        # Iterate over completion thresholds
        for threshold in thresholds:

            # Get inputs and labels
            X,y = prepare_data(clusters = clusters, 
                               scores = scores, 
                               demographics = demographics,
                               nk = nk,
                               threshold = threshold, 
                               return_features = False)

            # Filter for participants in the clusters being compared
            ind_subset = np.isin(y, k)
            X = X[ind_subset,:]
            y = y[ind_subset]

            # Binarize labels
            y[y == k[0]] = 0
            y[y == k[1]] = 1
            
            # Run PLS-DA and catch instances that throw warnings
            with warnings.catch_warnings(record = True) as w:
                df_results_k = run_plsda(X = X, y = y)
                
            # Populate data frame instance
            df_results_k['nk'] = nk
            df_results_k['cluster_id_1'] = cluster_ids[0]
            df_results_k['cluster_id_2'] = cluster_ids[1]
            df_results_k['threshold'] = threshold
            df_results_k['participants'] = X.shape[0]
            df_results_k['participants_1'] = sum(y == 0)
            df_results_k['participants_2'] = sum(y == 1)
            df_results_k['features'] = X.shape[1]
            df_results_k['warning'] = True if len(w) > 0 else False
            
            # Concatenate data frames
            df_results = pd.concat([df_results, df_results_k])

# Reset index
df_results = df_results.reset_index(drop = True)

# Export results
outfile = 'plsda_results.csv'
outfile = os.path.join(output_dir, outfile)
df_results.to_csv(outfile, index = False)

## Loadings plots

7-4 (green) vs 7-7 (none)
8-7 (green) vs 8-4 (orange)
9-4 (green) vs 9-5 (orange)
10-4 (green but no mouse match) vs 10-6 (orange but no mouse match)

In [9]:
cluster_pairs = [('7-4', '7-7'),
('8-7', '8-4'),
('9-4', '9-5'),
('10-4', '10-6')]

threshold = 0.6

palette = ['#4EEE94', '#DF7F4F']

xlims = [(-5, 4),
        (-4, 4),
        (-4, 5),
        (-4, 4)]
ylims = [(-3, 4),
        (-3, 4),
        (-3, 4),
        (-3, 4)]

In [11]:
for i, pair in enumerate(cluster_pairs):
# pair = cluster_pairs[0]

    nk = int(pair[0].split('-')[0])

    k = [int(x.split('-')[1]) for x in pair]
    k = [x-1 for x in k]

    # Get inputs and labels
    X,y,features = prepare_data(clusters = clusters, 
                       scores = scores, 
                       demographics = demographics,
                       nk = nk,
                       threshold = threshold, 
                       return_features = True)

    # Filter for participants in the clusters being compared
    ind_subset = np.isin(y, k)
    X = X[ind_subset,:]
    y = y[ind_subset]

    # Binarize labels
    y[y == k[0]] = 0
    y[y == k[1]] = 1

    export_scores_loadings(X = X, y = y, 
                           features = features, 
                           labels = pair)
    
#     outfile = 'plsda_{}_scores.png'.format('_'.join(pair))
#     outfile = os.path.join(output_dir, outfile)
#     plot_scores_loadings(X = X, y = y, features = features, labels = pair, palette = palette, 
#                      xlims = [-4, 4], ylims = [-3, 4],
#                      outfile = outfile)


---
# HBN analysis

## Data preparation

In [83]:
# Parameter set ID
params_id = '013'

# Output directory
output_dir = 'figure_supplementary'
output_dir = os.path.join(PROJECTPATH, 'figures', 'v3', output_dir)

# Input directories
registration_dir = 'data/human/registration/v3/'
pipeline_dir = 'data/human/derivatives/v3/'

registration_dir = os.path.join(PROJECTPATH, registration_dir)
pipeline_dir = os.path.join(PROJECTPATH, pipeline_dir, str(params_id))

# Demographics file
demographics = os.path.join(registration_dir, 'subject_info', 'demographics.csv')
demographics = pd.read_csv(demographics)

# Cluster solutions
cluster_dir = os.path.join(pipeline_dir, 'clusters', 'resolution_3.0')
cluster_file = os.path.join(cluster_dir, 'clusters.csv')
clusters = pd.read_csv(cluster_file)

# Behavioural scores directory 
behaviour_dir = os.path.join(registration_dir, 'subject_info', 'HBN', 'assessment_data')

What are the files that I care about?

In [70]:
# Dictionary of files to use
dict_files = dict(
    ACE = '9994_ACE_20220728.csv', 
     ARI_P = '9994_ARI_P_20220728.csv',
     ARI_S = '9994_ARI_S_20220728.csv',
     ASR = '9994_ASR_20220728.csv',
     ASSQ = '9994_ASSQ_20220728.csv',
     C3SR = '9994_C3SR_20220728.csv',
     CAARS = '9994_CAARS_20220728.csv',
     CBCL = '9994_CBCL_20220728.csv',
     CBCL_Pre = '9994_CBCL_Pre_20220728.csv',
     CDI2_P = '9994_CDI2_P_20220728.csv',
     CDI2_SR = '9994_CDI2_SR_20220728.csv',
     CIS_P = '9994_CIS_P_20220728.csv',
     CIS_SR = '9994_CIS_SR_20220728.csv',
     DTS = '9994_DTS_20220728.csv',
     EVT = '9994_EVT_20220728.csv',
     ICU_P = '9994_ICU_P_20220728.csv',
     ICU_SR = '9994_ICU_SR_20220728.csv',
     KBIT = '9994_KBIT_20220728.csv',
     MFQ_P = '9994_MFQ_P_20220728.csv',
     MFQ_SR = '9994_MFQ_SR_20220728.csv',
     PANAS = '9994_PANAS_20220728.csv',
     Quotient = '9994_Quotient_20220728.csv',
     RBS = '9994_RBS_20220728.csv',
     SAS = '9994_SAS_20220728.csv',
     SCARED_P = '9994_SCARED_P_20220728.csv',
     SCARED_SR = '9994_SCARED_SR_20220728.csv',
     SCQ = '9994_SCQ_20220728.csv',
     SDQ = '9994_SDQ_20220728.csv',
     SDS = '9994_SDS_20220728.csv',
     SRS = '9994_SRS_20220728.csv',
     SRS_Pre = '9994_SRS_Pre_20220728.csv',
     STAI = '9994_STAI_20220728.csv',
     SWAN = '9994_SWAN_20220728.csv',
     TRF = '9994_TRF_20220728.csv',
     TFR_Pre = '9994_TRF_Pre_20220728.csv',
     Vineland = '9994_Vineland_20220728.csv',
     WASI = '9994_WASI_20220728.csv',
     WHODAS_P = '9994_WHODAS_P_20220728.csv',
     WHODAS_SR = '9994_WHODAS_SR_20220728.csv',
     WISC = '9994_WISC_20220728.csv',
     YSR = '9994_YSR_20220728.csv'
)

# Dictionary of variables to use in each file
dict_vars = dict(ACE = ['ACE_Score'],
                  ARI_P = ['ARI_P_Total_Score'],
                  ARI_S = ['ARI_S_Total_Score'],
                  ASR = ['ASR_Int_T', 'ASR_Ext_T'], # Additional subscales 
                  ASSQ = ['ASSQ_Total'],
                  C3SR = ['C3SR_HY_T', 'C3SR_IN_T'], # Additional subscales
                  CAARS = ['CAARS_IM_T', 'CAARS_HR_T', 'CAARS_IE_T', 'CAARS_SC_T'])

dict_vars

{'ACE': ['ACE_Score'],
 'ARI_P': ['ARI_P_Total_Score'],
 'ARI_S': ['ARI_S_Total_Score'],
 'ASR': ['ASR_Int_T', 'ASR_Ext_T'],
 'ASSQ': ['ASSQ_Total'],
 'C3SR': ['C3SR_HY_T', 'C3SR_IN_T'],
 'CAARS': ['CAARS_IM_T', 'CAARS_HR_T', 'CAARS_IE_T', 'CAARS_SC_T']}

In [81]:
# Iterate over files and import variables
clusters_behaviours = clusters.copy()
for key, val in dict_files.items():
# key = 'ACE'
# val = dict_files[key]
    scores = pd.read_csv(os.path.join(behaviour_dir, val))
    scores = scores.iloc[1:]
    scores = scores[['EID'] + dict_vars[key]]
    clusters_behaviours = pd.merge(clusters_behaviours, scores, on = 'EID', how = 'left')

clusters_behaviours

KeyError: 'CBCL'

In [None]:
# POND clinical scores
scores = os.path.join(registration_dir, 'subject_info', 'POND', 'POND_clinical_scores_20230915.csv')
scores = pd.read_csv(scores)

# Cluster solutions
cluster_dir = os.path.join(pipeline_dir, 'clusters', 'resolution_3.0')
cluster_file = os.path.join(cluster_dir, 'clusters.csv')
clusters = pd.read_csv(cluster_file)

# Drop columns
cols_to_drop = ['Unnamed: 0', 'site', 'SUB_ID', 
                'DOB', 'PRIMARY_DIAGNOSIS', 
                'RESEARCH_CONFIRM_DIAG', 
                'HSHLD_INCOME_STD', 
                'PRMY_CGVR_STD',
               'SWANPDOC', 'TPOCSPDOC']
scores = scores.drop(cols_to_drop, axis = 1)

# Drop columns containing the following strings
strings_to_drop = ['NSI', 'ETHNCTY', 'EDUC']
for s in strings_to_drop:
    scores = scores.loc[:, ~scores.columns.str.contains(s)]

# Rename the subject ID column for merging
scores = scores.rename(columns = {'subject':'Subject_ID'})

# Assign NaN to missing values 999 code
for col, vals in scores.items():
    x = vals.copy()
    x[x == 999] = np.nan
    scores[col] = x