In [None]:
import pandas as pd
import numpy as np
import os
import BPt as bp

from abcd_tools.utils.io import load_tabular
from abcd_tools.utils.ConfigLoader import load_yaml

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import ElasticNet
# import missingno as msno


In [None]:
# Load the notebook-wide settings (data paths, covariates, plot label maps) from the YAML file
# one directory above this notebook.
params = load_yaml("../parameters.yaml")

In [None]:
# Phenotype table, read from the location configured under `phenotype_path`.
phenotype = load_tabular(params["phenotype_path"])
phenotype

In [None]:
# Predictor table, read from the location configured under `behavioral_path`.
predictors = load_tabular(params["behavioral_path"])
predictors

In [None]:
# Inner join on the index: keep only rows present in both the predictor and phenotype tables.
df = predictors.join(phenotype, how='inner')
df.head()

In [None]:
# msno.matrix(df)

The BPM has too much missingness to be useful here, so its columns are dropped below.

In [None]:
# Drop every phenotype column whose name starts with 'bpm', then rebuild the joined frame.
# NOTE: this rebinds `phenotype` and `df`; the filter is idempotent, so re-running the cell is harmless.
phenotype = phenotype.loc[:,~phenotype.columns.str.startswith('bpm')]
df = predictors.join(phenotype, how='inner')

In [None]:
def get_phenotype_scopes(predictors: pd.DataFrame, fpath: str, params: dict) -> dict:
    """Build the named column groups ("scopes") used for phenotype prediction.

    Args:
        predictors (pd.DataFrame): Predictor table; only its column names are used.
        fpath (str): Directory in which to pickle the scopes as
            'phenotype_prediction_scopes.pkl'. A falsy value (``None`` or ``''``)
            skips saving. A trailing path separator is optional.
        params (dict): Parameters; must provide 'covariates' and 'categorical'.

    Returns:
        dict: Scope name -> column names. 'category' and 'covariates' are taken
            verbatim from ``params``; 'empirical' is a fixed list; 'rdex' is every
            non-covariate predictor column not in the empirical list; and
            'rdex + empirical' is every non-covariate predictor column.
    """

    empirical_predictors = ['correct_go_mrt', 'correct_go_stdrt', 'issrt']
    covariates = params['covariates']
    category = params['categorical']

    # Covariates get their own scope, so they are excluded from the predictor scopes.
    predictor_cols = predictors.columns[~predictors.columns.isin(covariates)]
    rdex_predictors = predictor_cols[~predictor_cols.isin(empirical_predictors)].tolist()

    scopes = {
        'category'  : category,
        'covariates': covariates,
        'empirical': empirical_predictors,
        'rdex': rdex_predictors,
        'rdex + empirical': predictor_cols.tolist()
    }

    if fpath:
        # Join instead of concatenating so the file lands inside `fpath`
        # whether or not the directory was given with a trailing separator.
        pd.to_pickle(scopes, os.path.join(fpath, 'phenotype_prediction_scopes.pkl'))

    return scopes

# scopes = get_phenotype_scopes(predictors, params['phenotype_input_dir'], params)

In [None]:
# rdex_prediction_ds = pd.read_pickle(params["rdex_prediction_dataset_path"])
# rdex_prediction_ds

In [None]:
def prepare_phenotype_dataset(rdex_ds: bp.Dataset, predictors: pd.DataFrame,
    phenotype: pd.DataFrame, params: dict, fpath: str) -> bp.Dataset:
    """Assemble the BPt dataset used to predict phenotype measures.

    Args:
        rdex_ds (bp.Dataset): Existing dataset whose ``train_subjects`` and
            ``test_subjects`` define which subjects are kept and how they are split.
        predictors (pd.DataFrame): Predictor table.
        phenotype (pd.DataFrame): Phenotype table; columns starting with 'bpm'
            are dropped before joining.
        params (dict): Parameters; 'covariates' is read here, and the dict is
            also passed on to ``get_phenotype_scopes``.
        fpath (str): Directory in which to pickle the dataset as
            'phenotype_prediction_dataset.pkl'. A falsy value skips saving.
            A trailing path separator is optional.

    Returns:
        bp.Dataset: Dataset with targets, train split and scopes set, the
            'category' scope ordinalized and missing values dropped.
    """

    # Reuse the subject split of the rdex dataset.
    # NOTE(review): assumes `train_subjects` is a pandas Index (supports `.append`) -- confirm.
    train_subjects = rdex_ds.train_subjects
    all_subjects = train_subjects.append(rdex_ds.test_subjects)

    # exclude bpm columns from phenotype -- too much missingness
    phenotype = phenotype.loc[:, ~phenotype.columns.str.startswith('bpm')]
    df = predictors.join(phenotype, how='inner')

    # limit to subjects in rdex dataset
    df = df.loc[df.index.isin(all_subjects)]

    scopes = get_phenotype_scopes(predictors, fpath, params)

    # Every phenotype column that is not a covariate is a prediction target.
    targets = phenotype.loc[:, ~phenotype.columns.isin(params['covariates'])].columns

    ds = bp.Dataset(df, targets=targets)
    ds = ds.set_train_split(subjects=train_subjects)

    for scope_name, scope_cols in scopes.items():
        ds.add_scope(scope_cols, scope_name, inplace=True)

    ds = ds.ordinalize(scope='category')
    ds = ds.dropna()

    if fpath:
        # Join instead of concatenating so the file lands inside `fpath`
        # whether or not the directory was given with a trailing separator.
        ds.to_pickle(os.path.join(fpath, 'phenotype_prediction_dataset.pkl'))

    return ds

# ds = prepare_phenotype_dataset(rdex_prediction_ds, predictors, phenotype, params, fpath=params['phenotype_input_dir'])
# ds

In [None]:
# ds['category']

In [None]:
def define_phenotype_prediction_pipeline() -> bp.Pipeline:
    """Build the scaling + ElasticNet pipeline used for phenotype prediction.

    Returns:
        bp.Pipeline: Two scalers restricted to the 'float' scope ('robust',
            then 'normalize') followed by an ElasticNet model whose `alpha`
            and `l1_ratio` are tuned by a 100-iteration 'HammersleySearch'.
    """

    # Scaling steps only touch float-typed features.
    float_steps = [bp.Scaler(kind, scope='float') for kind in ('robust', 'normalize')]

    # Hyperparameter distributions explored by the search.
    search_space = {
        'alpha': bp.p.Log(lower=1e-5, upper=1e5),
        'l1_ratio': bp.p.Scalar(lower=0.001, upper=1).set_mutation(sigma=0.165),
    }

    elastic_net = bp.Model(
        obj=ElasticNet(),
        params=search_space,
        param_search=bp.ParamSearch('HammersleySearch', n_iter=100, cv='default'),
    )

    return bp.Pipeline([*float_steps, elastic_net])

def fit_phenotype_prediction_model(ds: bp.Dataset, scopes: dict, n_cores=1, random_state=42) -> bp.CompareDict:
    """Evaluate the prediction pipeline for every scope / target combination.

    Args:
        ds (bp.Dataset): Dataset to evaluate on.
        scopes (dict): Scope definitions; only the keys are used. Each key is
            compared as the pair ['covariates', key].
        n_cores (int, optional): Passed to the problem spec as `n_jobs`. Defaults to 1.
        random_state (int, optional): Seed passed to the problem spec. Defaults to 42.

    Returns:
        bp.CompareDict: Whatever `bp.evaluate` returns for the scope x target comparison.
    """

    # One comparison option per scope name, always combined with the covariates.
    scope_options = [bp.Option(['covariates', scope_name], name=scope_name)
                     for scope_name in scopes]

    return bp.evaluate(
        pipeline=define_phenotype_prediction_pipeline(),
        dataset=ds,
        problem_spec=bp.ProblemSpec(n_jobs=n_cores, random_state=random_state),
        scope=bp.Compare(scope_options),
        target=bp.Compare(ds.get_cols('target')),
        mute_warnings=True,
        cv=bp.CV(splits=5, n_repeats=1),
    )

def save_model_results(res: bp.CompareDict, name: str, model: str, path: str) -> None:
    """Write model results and their summary table to disk.

    Two files are created under `path`: a pickle of `res` and a CSV of
    `res.summary()`, both prefixed with `{name}_{model}`.

    Args:
        res (bp.CompareDict): Model results.
        name (str): Model name (first part of the file prefix).
        model (str): Model type (second part of the file prefix).
        path (str): Directory to save results in.
    """
    results_file = f'{path}/{name}_{model}_results.pkl'
    summary_file = f'{path}/{name}_{model}_summary.csv'

    pd.to_pickle(res, results_file)
    res.summary().to_csv(summary_file)

    print(f"Results saved to {path}")


# n_cores = os.cpu_count() - 2
# res = fit_phenotype_prediction_model(ds, scopes, n_cores=n_cores, random_state=42)

In [None]:
# save_model_results(res, 'phenotype_prediction', 'elastic_net', params['phenotype_output_dir'])

Model fitting (the commented-out calls above) is run on HPC; the cells below load the results it saved.

In [None]:
def get_full_summary(res: bp.CompareDict) -> pd.DataFrame:
    """Helper to get full (per-score) summary information from BPt models.

    Builds one row per individual score rather than one aggregated row per
    comparison: each entry of `res` contributes its option names plus the
    per-metric score lists found in its `scores` attribute, and the score
    lists are then exploded into separate rows.

    Args:
        res (bp.CompareDict): Model results. Keys must expose `.options`
            (items with `.key` and `.name`); values must expose a `scores`
            mapping of metric name -> list of scores.

    Returns:
        pd.DataFrame: One column per option key plus one 'scores_<metric>'
            column per metric. Empty if `res` has no entries.
    """

    compare_keys = list(res.keys())
    if not compare_keys:
        return pd.DataFrame()

    # The first key is taken as representative of the option layout.
    cols = {option.key: [] for option in compare_keys[0].options}

    # Score columns in first-seen order; collected across all results so the
    # final explode does not depend on which result happened to come last.
    score_col_names = []

    for compare_key in compare_keys:
        for option in compare_key.options:
            cols[option.key].append(option.name)

        scores = res[compare_key].scores
        for metric in scores:
            col_name = 'scores' + '_' + metric
            if col_name not in cols:
                cols[col_name] = []
                score_col_names.append(col_name)
            cols[col_name].append(scores[metric])

    summary = pd.DataFrame.from_dict(cols)

    # Each score cell holds a list; explode all score columns together so the
    # metrics stay aligned row by row.
    return summary.explode(score_col_names)

In [None]:
# Load the pickled model results from the phenotype output directory.
# NOTE(review): unpickling can execute arbitrary code -- only load files you produced yourself.
# NOTE(review): this file name does not follow the `{name}_{model}_results.pkl` pattern that
# save_model_results would write for the commented-out call above -- confirm which run produced it.
res = pd.read_pickle(params['phenotype_output_dir'] + '/phenotype_elastic_results.pkl')
# summary = pd.read_csv(params['phenotype_output_dir'] + '/phenotype_elastic_summary.csv')


def make_phenotype_plot_df(res: bp.CompareDict, params: dict) -> pd.DataFrame:
    """Turn model results into the long-format frame used for plotting.

    The full summary is relabelled with the 'phenotype_plot_name_keyed' and
    'grouping_map' mappings from `params`, the relabelled 'target' is split on
    ':' into 'grouping' and 'item' columns, and the rows are sorted in
    descending order by grouping, scope and R^2.

    Args:
        res (bp.CompareDict): Model results.
        params (dict): Parameters.

    Returns:
        pd.DataFrame: Phenotype plot dataframe.
    """

    relabelled = (
        get_full_summary(res)
        .replace(params['phenotype_plot_name_keyed'])
        .replace(params['grouping_map'])
    )

    # Targets are expected to look like '<grouping>:<item>' after relabelling.
    target_parts = relabelled['target'].str.split(':', expand=True)
    target_parts.columns = ['grouping', 'item']

    combined = pd.concat([relabelled, target_parts], axis=1)
    return combined.sort_values(['grouping', 'scope', 'scores_r2'], ascending=False)

# Build the plotting frame from the loaded results.
plot_df = make_phenotype_plot_df(res, params)

In [None]:
def make_phenotype_effectsize_plot(plot_df: pd.DataFrame, params: dict) -> None:
    """Make phenotype effectsize plot.

    Draws one bar-plot facet per 'grouping' (items on x, R^2 on y, bars colored
    by 'scope') and saves it as 'phenotype_effectsize_plot.png' in
    `params['phenotype_plot_output_dir']`. Also sets the global seaborn style.

    Args:
        plot_df (pd.DataFrame): Plot dataframe with 'grouping', 'item',
            'scores_r2' and 'scope' columns.
        params (dict): Parameters.
    """

    sns.set(style='whitegrid', font_scale=2)

    # FacetGrid.map draws each facet independently, so without a fixed hue order
    # a facet that lacks one scope would shift the colors of the others and
    # disagree with the shared legend. Order of first appearance matches what
    # the first facet would pick on its own when it contains every scope.
    hue_order = list(pd.unique(plot_df['scope']))

    # sharex=False: every grouping has its own set of items on the x axis.
    g = sns.FacetGrid(plot_df, col='grouping', height=10, sharex=False)
    g.map(sns.barplot, 'item', 'scores_r2', 'scope', palette='viridis', hue_order=hue_order)
    g.set_xticklabels(rotation=45, ha='right')
    g.set_titles('{col_name}')
    g.set_axis_labels('', '$R^2$')
    g.add_legend(title='')

    plt.savefig(params['phenotype_plot_output_dir'] + '/phenotype_effectsize_plot.png', bbox_inches='tight', dpi=300)
    # plt.show()
    # plt.show()

# Render the effect-size figure and write it to the plot output directory.
make_phenotype_effectsize_plot(plot_df, params)

In [None]:
def gather_phenotype_fis(res: bp.CompareDict, params: dict) -> pd.DataFrame:
    """Gather phenotype feature importance scores.

    Stacks the `get_fis()` table of every result, tags each with its scope and
    target, relabels values with the 'phenotype_plot_name_keyed' and
    'grouping_map' mappings, and splits the relabelled target on ':' into
    'grouping' and 'item' columns.

    Args:
        res (bp.CompareDict): Model results.
        params (dict): Parameters.

    Returns:
        pd.DataFrame: Phenotype feature importance scores.
    """

    item_map = params['phenotype_plot_name_keyed']
    grouping_map = params['grouping_map']

    frames = []
    for key in res.keys():
        key_fis = res[key].get_fis()

        # NOTE(review): relies on options[0] being the scope and options[1] the
        # target, and on their string form being 'scope=...' / 'target=...' -- confirm.
        scope = str(key.options[0]).replace('scope=', '')
        target = str(key.options[1]).replace('target=', '')

        key_fis.insert(0, 'scope', scope)
        key_fis.insert(1, 'target', target)
        frames.append(key_fis)

    # Concatenate once instead of growing the frame inside the loop (quadratic copying).
    fis = pd.concat(frames) if frames else pd.DataFrame()

    fis = fis.replace(item_map)
    fis = fis.replace(grouping_map)

    target_parts = fis['target'].str.split(':', expand=True)
    target_parts.columns = ['grouping', 'item']
    fis = pd.concat([fis, target_parts], axis=1)

    return fis
# Collect the feature importances of every fitted scope/target combination.
fis = gather_phenotype_fis(res, params)


In [None]:
def make_average_fis(fis: pd.DataFrame, params: dict) -> pd.DataFrame:
    """Make average feature importance scores.

    Drops the covariate columns, averages every numeric column per
    ('grouping', 'scope') pair, reshapes to long format and relabels values
    with `params['target_map']`.

    Args:
        fis (pd.DataFrame): Feature importance scores; must contain 'grouping'
            and 'scope' columns and every column listed in `params['covariates']`.
        params (dict): Parameters; 'covariates' and 'target_map' are read.

    Returns:
        pd.DataFrame: Average feature importance scores with columns
            'grouping', 'scope', 'feature' and 'importance'.
    """

    covars = params['covariates']
    target_map = params['target_map']

    fis = fis.drop(columns=covars)

    # numeric_only=True: the frame may still carry string columns (e.g. 'target',
    # 'item'). pandas >= 2.0 raises a TypeError when asked to average those,
    # whereas older versions silently dropped them -- make the drop explicit.
    fisummary = (fis.groupby(['grouping', 'scope'])
                    .mean(numeric_only=True)
                    .reset_index())
    fisummary = fisummary.melt(id_vars=['grouping','scope'],
                                var_name='feature',
                                value_name='importance')

    fisummary = fisummary.replace(target_map)
    return fisummary

# Average the importances per grouping and scope, then display the long-format result.
avg_fis = make_average_fis(fis, params)
avg_fis

In [None]:
# Which groupings are present (each one becomes a line in the radar plots).
avg_fis['grouping'].unique()

In [None]:
# Which scopes are present (the radar collage filters on this column).
avg_fis['scope'].unique()

In [None]:
# Show the long-format importance table again for reference.
avg_fis

In [None]:
def make_feat_imp_radar_plot(df, ax, legend=True):
    """Make feature importance radar plot.

    Draws one closed line per 'grouping' on a polar axes, with one spoke per
    unique 'feature' and the 'importance' values as radii, plus a reference
    circle at radius 0. Rows containing missing values are ignored.

    Args:
        df (pd.DataFrame): Long-format frame with 'feature', 'grouping' and
            'importance' columns. Within each grouping the rows are assumed to
            be in the same feature order as the spokes.
        ax (plt.Axes): Polar axes to draw on.
        legend (bool, optional): Whether to add a legend. Defaults to True.
    """

    df = df.dropna()
    variables = pd.unique(df['feature'])
    N = len(variables)

    categories = df['grouping'].unique()
    colors = ['#1f77b4','#aec7e8','#ff7f0e']

    # One evenly spaced angle per feature; the first angle is repeated at the
    # end so each plotted line closes back on its starting point.
    radians = 2 * np.pi
    angles = [n / float(N) * radians for n in range(N)]
    angles += angles[:1]

    # instantiate plot
    ax.set_xticks(angles[:-1], variables)
    ax.set_rlabel_position(10)

    # plot circle to show 0
    rads = np.arange(0, (2 * np.pi), 0.01)
    zeros = np.zeros(len(rads))
    ax.plot(rads, zeros, 'k', alpha=.5)

    # set grid
    ax.grid(True)
    ax.spines['polar'].set_visible(False)

    for idx, category in enumerate(categories):
        # Cycle through the palette so groupings beyond the third are still
        # drawn; zipping with the color list would silently drop them while
        # the legend below still lists every grouping.
        color = colors[idx % len(colors)]

        values = df.loc[df['grouping'] == category, 'importance'].values

        # Repeat the first value to close the polygon (matches the extra angle).
        values = np.append(values, values[0])

        ax.plot(angles, values, color=color)

    if legend:
        # Labels are matched to lines in drawing order: reference circle first.
        legend_labels = np.insert(categories, 0, 'Reference = 0')
        ax.legend(legend_labels, bbox_to_anchor=(0, 1.05))


def phenotype_feat_important_collage(avg_fis: pd.DataFrame, params: dict) -> None:
    """Make phenotype feature importance collage.

    Draws one radar plot per scope listed in `params['radard_plot_scopes']`,
    side by side, and saves the figure as 'phenotype_feat_imp_radar_plot.png'
    in `params['phenotype_plot_output_dir']`. Only the last panel gets a
    legend. Also sets the global seaborn theme.

    Args:
        avg_fis (pd.DataFrame): Average feature importance scores.
        params (dict): Parameters.
    """

    # `sns.set` is an alias of `sns.set_theme`; a single call with these
    # arguments leaves the same settings as the former default-then-override pair.
    sns.set_theme(style='whitegrid', font_scale=1)

    scopes = params['radard_plot_scopes']

    # squeeze=False always returns a 2-D array of axes, so a single scope
    # (which would otherwise yield a bare Axes) can still be indexed.
    fig, axes = plt.subplots(ncols=len(scopes), figsize=(25,25),
                             subplot_kw={'projection': 'polar'}, squeeze=False)

    last = len(scopes) - 1
    for i, scope in enumerate(scopes):
        panel = axes[0, i]
        make_feat_imp_radar_plot(avg_fis[avg_fis['scope'] == scope], panel, legend=(i == last))
        panel.set_title(scope)

    fig.savefig(params['phenotype_plot_output_dir'] + '/phenotype_feat_imp_radar_plot.png', bbox_inches='tight', dpi=300)

# Render the radar-plot collage and write it to the plot output directory.
phenotype_feat_important_collage(avg_fis, params)