# Step 5: Analyze final Barnacle model

Use this notebook to compile and analyze the final version of your Barnacle model. This should be the version of the model that is fit with the optimal parameters you identified in step 4. This compilation and analysis includes several parts:
1. Align the components between bootstraps of your final model.
    - The order of components is not fixed in this tensor decomposition model. Therefore, in order to compare between bootstraps, the components must first be aligned to one another.
    - The aligned bootstraps will be saved as an xarray.Dataset so that you can access them for further analysis.
1. Summarize the model weights for each component.
    - Each component can be understood to model a different pattern in the data. Depending on how you set up your data and your Barnacle model, each pattern might also be associated with a different cluster (e.g. gene clusters). This step separates out each component so you can more closely examine the pattern and/or cluster each is modeling.
1. Visualize your model.
    - Effective visualization depends on your data type, size, dimensions, and the questions you are asking. A few potential visualizations are suggested below to help get you started.

In [1]:
# imports

import itertools
import numpy as np
import pandas as pd
import tensorly as tl
import tlviz
import xarray as xr

from barnacle.tensors import SparseCPTensor
from barnacle.utils import subset_cp_tensor
from functools import reduce
from tlab.cp_tensor import load_cp_tensor
from tqdm.notebook import tqdm


### Part A: Align model bootstraps

In [2]:
# USER INPUTS -- edit these variables as needed

# path to directory where all of the models from your parameter search were saved
# (e.g. 'directory/fitting/'); the cells below read
# '{modelpath}/bootstrap{i}/replicate{r}/rank{rank}/lambda{lambda}/fitted-model.h5' from it
modelpath = 'data/fitting'

# output directory where produced files will be saved (e.g. 'data/')
outdir = 'data'

# optimal rank parameter (number of components) used to fit your final model
optimal_rank = 5

# optimal lambda parameter (sparsity coefficient) used to fit your final model
# NOTE: this value is formatted directly into the model path (e.g. 'lambda1.0'),
# so it must be written exactly as it appears in the fitted model directory names
optimal_lambda = 1.0

# number of bootstraps used for final model
n_bootstraps = 100


In [3]:
# align bootstraps of final model
#
# Outline of this cell:
#   1. collect the sample names present in every bootstrap/replicate model
#   2. load every fitted model and subset it to the samples common to all models
#   3. pick the model with the highest mean factor match score (FMS) against the
#      other models as the alignment reference
#   4. permute the components of every model to line up with that reference

# set up parameters and data structures
example_ds = xr.load_dataset(f"{modelpath}/bootstrap0/dataset-bootstrap0.nc")
# sort the unique replicate labels: a bare set has no guaranteed order, which would
# make the loop order (and the random comparison subset below) differ between runs
replicates = [str(label) for label in sorted(set(example_ds.replicate_id.data))]
bootstraps = np.arange(n_bootstraps)
samplenames = {rep: [] for rep in replicates}    # sample names
cps = {rep: [] for rep in replicates}   # cp tensors with all samples present
subset_cps = {rep: [] for rep in replicates}    # cp tensors subset to just common samples

# collect sample names of each bootstrap/replicate pair
for boot in tqdm(bootstraps, desc='Extracting sample names'):
    for rep in replicates:
        path_ds = f"{modelpath}/bootstrap{boot}/replicate{rep}/shuffled-replicate-{rep}.nc"
        # the context manager closes each file once its sample names are read, so
        # we do not accumulate one open file handle per bootstrap/replicate pair
        with xr.open_dataset(path_ds) as ds:
            samplenames[rep].append(ds.sample_id.values)
# compile set of samplenames common to all bootstrap / replicate splits
samplenames['common'] = reduce(
    np.intersect1d,
    itertools.chain.from_iterable(samplenames[r] for r in replicates)
)

# import all fitted models
for boot in tqdm(bootstraps, desc='Importing model bootstraps'):
    for rep in replicates:
        # put together data path
        path_cp = f"bootstrap{boot}/replicate{rep}/rank{optimal_rank}/lambda{optimal_lambda}/fitted-model.h5"
        # store normalized cp tensor to cps
        cp = tl.cp_normalize(load_cp_tensor(f"{modelpath}/{path_cp}"))
        cps[rep].append(cp)
        # pull out common samplenames and store in subset_cps
        # NOTE(review): this keeps each model's own sample order; it assumes the common
        # samples appear in the same relative order in every model -- confirm
        idx = np.where(np.isin(samplenames[rep][boot], samplenames['common']))[0]
        subset_cps[rep].append(subset_cp_tensor(cp, {2: idx}))
print(f"Successfully imported {len(bootstraps)} model bootstraps, each with {len(replicates)} replicates.")

# find best representative reference cp tensor
results = []
combos = list(itertools.product(replicates, bootstraps))
# limit comparisons to a random sample of 100 models; the sample is drawn once and
# shared by every candidate reference so that their mean FMS values are comparable
if len(combos) > 100:
    comparison_combos = [combos[i] for i in np.random.choice(len(combos), size=100, replace=False)]
else:
    comparison_combos = combos
for ref_rep, ref_boot in tqdm(combos, desc='Identifying best reference model from bootstraps'):
    reference_cp = subset_cps[ref_rep][ref_boot]
    for comp_rep, comp_boot in comparison_combos:
        # no point in comparing to self
        if ref_rep == comp_rep and ref_boot == comp_boot:
            continue
        comparison_cp = subset_cps[comp_rep][comp_boot]
        fms = tlviz.factor_tools.factor_match_score(reference_cp, comparison_cp, consider_weights=False)
        results.append({
            'reference_bootstrap': ref_boot,
            'reference_replicate': ref_rep,
            'comparison_bootstrap': comp_boot,
            'comparison_replicate': comp_rep,
            'fms': fms,
        })
if not results:
    # with fewer than two models there is nothing to compare or align
    raise ValueError('At least two fitted models are required to select an alignment reference.')
# summarize overall mean fms
fms_df = pd.DataFrame(results)
fms_summary_df = fms_df.groupby([
    'reference_bootstrap',
    'reference_replicate'
]).agg(
    mean_fms=('fms', 'mean'),
    median_fms=('fms', 'median'),
).reset_index()
# find the best representative bootstrap model based on maximum mean FMS
# (idxmax returns an index label, so select with .loc rather than .iloc)
best_ref = fms_summary_df.loc[fms_summary_df.mean_fms.idxmax(), :]
print('All bootstraps will be aligned to the following reference model:')
display(pd.DataFrame(best_ref).T.reset_index(drop=True))

# permute reference cp so that components are in descending order of explained variation
ref_cp = tlviz.factor_tools.permute_cp_tensor(
    subset_cps[best_ref['reference_replicate']][best_ref['reference_bootstrap']],
    consider_weights=False
)

# realign all the other cp tensors against the best representative cp tensor
for rep in replicates:
    for boot in bootstraps:
        # the permutation is computed on the common-sample subset (the only tensors with
        # matching shapes) and then applied to both the subset and the full cp tensor
        perm = tlviz.factor_tools.get_cp_permutation(subset_cps[rep][boot], reference_cp_tensor=ref_cp, consider_weights=False)
        cps[rep][boot] = tlviz.factor_tools.permute_cp_tensor(cps[rep][boot], permutation=perm)
        subset_cps[rep][boot] = tlviz.factor_tools.permute_cp_tensor(subset_cps[rep][boot], permutation=perm)
print('All model bootstraps successfully aligned.')


Extracting sample names:   0%|          | 0/100 [00:00<?, ?it/s]

Importing model bootstraps:   0%|          | 0/100 [00:00<?, ?it/s]

Successfully imported 100 model bootstraps, each with 3 replicates.


Identifying best reference model from bootstraps:   0%|          | 0/300 [00:00<?, ?it/s]

All bootstraps will be aligned to the following reference model:


Unnamed: 0,reference_bootstrap,reference_replicate,mean_fms,median_fms
0,56,C,0.645476,0.654084


All model bootstraps successfully aligned.


In [4]:
# compile aligned model weights into a single xarray.Dataset
#
# Reads the aligned cp tensors (`cps`), sample names (`samplenames`), `replicates`,
# `bootstraps`, `example_ds` and `best_ref` produced by the alignment cell, stacks the
# weights of every bootstrap/replicate model, and saves them to
# '{outdir}/aligned-models.nc'.

# set up data structures
component_labels = np.arange(optimal_rank) + 1 # 1-based indexing for ease of communication
weights = {'component': [], 'mode0': [], 'mode1': []}
sample_info_df = pd.merge(
    example_ds.sample_id.to_series().reset_index(),
    example_ds.replicate_id.to_series().reset_index(),
    on='sample_replicate_id', how='inner'
)[['sample_id', 'replicate_id']].rename(columns={'sample_id': 'sample', 'replicate_id': 'replicate'})
# replicate labels are handled as strings everywhere else, so match that here to
# keep the merge keys below the same type
sample_info_df['replicate'] = sample_info_df['replicate'].astype(str)
# per-bootstrap sample weight tables; concatenated once after the loop instead of
# growing a DataFrame with repeated pd.concat calls (quadratic copying)
sample_frames = []

# pull model weights from each bootstrap
for boot in bootstraps:
    boot_weights = {key: [] for key in weights}
    rep_frames = []
    for rep in replicates:
        # fetch aligned cp tensor
        cp = cps[rep][boot]
        # add component weights to list
        boot_weights['component'].append(cp.weights)
        # add mode 0 and mode 1 weights to list, transposed to (component, mode)
        boot_weights['mode0'].append(cp.factors[0].T)
        boot_weights['mode1'].append(cp.factors[1].T)
        # put mode 2 (sample) weights into a pd.DataFrame
        rep_sample_df = pd.DataFrame(
            cp.factors[2], index=samplenames[rep][boot], columns=component_labels
        ).reset_index().rename(columns={'index': 'sample'})
        rep_sample_df['replicate'] = rep
        rep_frames.append(rep_sample_df)
    # store this bootstrap's weights; appending avoids using `boot` as a list position
    for key, values in boot_weights.items():
        weights[key].append(values)
    # merge sample info into sample weights dataframe; the left join keeps every
    # sample/replicate pair of example_ds, with NaN weights where a pair is absent
    # from this bootstrap
    boot_sample_df = pd.merge(
        left=sample_info_df, right=pd.concat(rep_frames), on=['sample', 'replicate'], how='left'
    )
    boot_sample_df['bootstrap'] = boot
    sample_frames.append(boot_sample_df)
# concatenate sample weights of all bootstraps
sample_df = pd.concat(sample_frames)

# compile everything into an xarray.Dataset
# NOTE(review): assumes the first two coordinates of example_ds are the mode 0 and
# mode 1 dimensions of the fitted tensors -- confirm against how the dataset was built
modes = list(example_ds.coords)
ds = xr.Dataset({
    'component_weights': xr.DataArray(
        np.array(weights['component']),
        coords=[bootstraps, replicates, component_labels],
        dims=['bootstrap', 'replicate', 'component']
    ),
    f"{modes[0]}_weights": xr.DataArray(
        np.array(weights['mode0']),
        coords=[bootstraps, replicates, component_labels, example_ds[modes[0]].data],
        dims=['bootstrap', 'replicate', 'component', modes[0]]
    ),
    f"{modes[1]}_weights": xr.DataArray(
        np.array(weights['mode1']),
        coords=[bootstraps, replicates, component_labels, example_ds[modes[1]].data],
        dims=['bootstrap', 'replicate', 'component', modes[1]]
    ),
    'sample_weights': xr.DataArray.from_series(
        sample_df.melt(
            id_vars=['bootstrap', 'replicate', 'sample'],
            value_vars=component_labels,
            var_name='component',
            value_name='sample_weights'
        ).set_index(['bootstrap', 'replicate', 'component', 'sample'])['sample_weights']
    )

})

# add alignment reference, rank, and sparsity coefficient as attributes
# (cast the reference identifiers to native Python types so they are written as
# plain int/str attributes rather than pandas/numpy scalars)
ds.attrs['rank'] = optimal_rank
ds.attrs['lambda'] = optimal_lambda
ds.attrs['n_bootstraps'] = n_bootstraps
ds.attrs['align_ref_bootstrap'] = int(best_ref['reference_bootstrap'])
ds.attrs['align_ref_replicate'] = str(best_ref['reference_replicate'])

# save Dataset as netCDF4 file
ds.to_netcdf(f"{outdir}/aligned-models.nc")

# examine Dataset
ds

### Part B: Summarize model weights for each component

### Part C: Visualization