In [1]:
# imports

import itertools
import numpy as np
import pandas as pd
import tensorly as tl
import tlviz
import xarray as xr

from barnacle.tensors import SparseCPTensor
from barnacle.utils import subset_cp_tensor
from functools import reduce
from pathlib import Path
from tlab.cp_tensor import load_cp_tensor


# Import and organize all bootstrapped models

In [2]:
# configuration and empty containers used by the cells below

# input / output locations (relative to this notebook)
datapath = Path('../../data/4-fitting/')
outdir = Path('../../data/5-models/')

# bootstrap ids and replicate labels; the bootstrap ids are also used below as
# list positions, which works because they run 0..99
bootstraps = np.arange(100)
replicates = ['A', 'B', 'C']

# rank and lambda used to locate the fitted model files of each genus
ranks = dict(pro=20, syn=15)
lambdas = dict(pro=16., syn=16.)

# Every container below has the layout {genus: {replicate: [one entry per bootstrap]}}.
# Each one is built by its own comprehension so that no list is shared.

# sample names of each bootstrap / replicate split
samplenames = {cyano: {rep: [] for rep in replicates} for cyano in ranks}

# cp tensors as stored on computer
cps = {cyano: {rep: [] for rep in replicates} for cyano in ranks}

# cp tensors aligned to best representative, all samples present
aligned_cps = {cyano: {rep: [] for rep in replicates} for cyano in ranks}

# cp tensors aligned to best representative, subset to just common samples
subset_aligned_cps = {cyano: {rep: [] for rep in replicates} for cyano in ranks}


In [3]:
# fetch samplename labels for each shuffled replicate set

# collect sample names of each bootstrap/replicate pair
for cyano in ['pro', 'syn']:
    for rep in replicates:
        # build the list from scratch and assign it in one go, so re-running this
        # cell replaces the entries instead of appending duplicates
        rep_samplenames = []
        for boot in bootstraps:
            filepath = datapath / '{}/bootstrap{}/replicate{}/shuffled_replicate_{}.nc'.format(
                cyano, boot, rep, rep
            )
            # only the sample names are needed: read them into memory and close the
            # file immediately rather than leaving 600 netCDF handles open
            with xr.open_dataset(filepath) as shuffled_ds:
                rep_samplenames.append(shuffled_ds.samplename.values)
        samplenames[cyano][rep] = rep_samplenames

    # compile set of samplenames common to all bootstrap / replicate splits
    # (np.intersect1d folded over every per-model name array of this genus)
    samplenames[cyano]['common'] = reduce(
        np.intersect1d,
        itertools.chain.from_iterable(samplenames[cyano][r] for r in replicates)
    )

# import one shuffled tensor dataset of each cyano for reference
# (left open on purpose so the lazily loaded Dataset can be displayed)
pro_ds = xr.open_dataset(datapath / 'pro/bootstrap0/dataset_bootstrap_0.nc')
syn_ds = xr.open_dataset(datapath / 'syn/bootstrap0/dataset_bootstrap_0.nc')

syn_ds


In [4]:
# import all fitted models, and subset them to just the common samplenames

for cyano in ['pro', 'syn']:
    # rank, lambda and the common sample set are fixed per genus
    rank = ranks[cyano]
    lamb = lambdas[cyano]
    common_names = samplenames[cyano]['common']
    for rep in replicates:
        # build fresh lists and assign them afterwards, so re-running this cell
        # replaces the stored models instead of appending duplicates
        rep_cps = []
        rep_subset_cps = []
        for boot in bootstraps:
            # put together data path
            path_cp = '{}/bootstrap{}/replicate{}/rank{}/lambda{}/fitted_model.h5'.format(
                cyano, boot, rep, rank, lamb
            )
            # store normalized cp tensor to cps
            cp = tl.cp_normalize(load_cp_tensor(datapath / path_cp))
            rep_cps.append(cp)
            # positions (along mode 2) of this model's samples that are in the common set
            # NOTE(review): the positions keep each model's own sample order; comparing
            # models row-by-row assumes that order is the same in every split -- confirm
            idx = np.where(np.isin(samplenames[cyano][rep][boot], common_names))[0]
            # pull out common samplenames and store in subset_aligned_cps
            rep_subset_cps.append(subset_cp_tensor(cp, {2: idx}))
        cps[cyano][rep] = rep_cps
        subset_aligned_cps[cyano][rep] = rep_subset_cps

subset_aligned_cps['syn']['A'][99]


(weights, factors) : rank-15 CPTensor of shape (6161, 6, 55)

# Align models against best representative reference

Within each dataset, all models will be aligned against a single best representative reference model. The best representative model is the bootstrap/replicate model with the highest mean factor match score (FMS), evaluated against all other bootstrapped models in the dataset.

In [5]:
# find best representative reference cp tensor for each genus

# score every model of a genus against every other model of the same genus
results = []
for cyano, ref_rep, ref_boot in itertools.product(['pro', 'syn'], replicates, bootstraps):
    reference_cp = subset_aligned_cps[cyano][ref_rep][ref_boot]
    for comp_rep, comp_boot in itertools.product(replicates, bootstraps):
        # no point in comparing to self
        if ref_rep == comp_rep and ref_boot == comp_boot:
            continue
        comparison_cp = subset_aligned_cps[cyano][comp_rep][comp_boot]
        fms = tlviz.factor_tools.factor_match_score(
            reference_cp,
            comparison_cp,
            consider_weights=False
        )
        results.append({
            'genus': cyano,
            'reference_bootstrap': ref_boot,
            'reference_replicate': ref_rep,
            'comparison_bootstrap': comp_boot,
            'comparison_replicate': comp_rep,
            'fms': fms,
        })

fms_df = pd.DataFrame(results)

# summarize overall mean fms of each reference model
fms_summary_df = fms_df.groupby([
    'genus',
    'reference_bootstrap',
    'reference_replicate'
]).agg(
    mean_fms=('fms', 'mean'),
    median_fms=('fms', 'median'),
    boot_count=('fms', 'count')
).reset_index()

# find the reference model with the top mean fms within each genus;
# idxmax() returns index *labels*, so select with .loc rather than positional .iloc
best_rep_df = fms_summary_df.loc[
    fms_summary_df.groupby('genus')['mean_fms'].idxmax()
]

best_rep_df


Unnamed: 0,genus,reference_bootstrap,reference_replicate,mean_fms,median_fms,boot_count
93,pro,31,A,0.653508,0.658445,299
423,syn,41,A,0.640135,0.633337,299


In [6]:
# realign all models against best representative models

# permute reference cps so that components are in descending order of explained variation
# (no permutation or reference is passed, so the ordering is whatever tlviz applies
# by default -- presumably descending weight; confirm against the tlviz docs)
ref_cps = {}
for cyano in ['pro', 'syn']:
    stats = best_rep_df.loc[best_rep_df.genus == cyano, :].to_dict(orient='records')[0]
    ref_cp = subset_aligned_cps[cyano][stats['reference_replicate']][stats['reference_bootstrap']]
    ref_cps[cyano] = tlviz.factor_tools.permute_cp_tensor(
        ref_cp,
        consider_weights=False
    )

# realign all the other cp tensors against the best representative cp tensor
for cyano in ['pro', 'syn']:
    ref_cp = ref_cps[cyano]
    for rep in replicates:
        # collect into a fresh list and assign afterwards, so re-running this cell
        # replaces the aligned models instead of appending duplicates
        rep_aligned_cps = []
        for boot in bootstraps:
            # permute components to line up with best representative reference cp;
            # the permutation is found on the common-sample subset ...
            perm = tlviz.factor_tools.get_cp_permutation(
                subset_aligned_cps[cyano][rep][boot],
                reference_cp_tensor=ref_cp,
                consider_weights=False
            )
            subset_aligned_cps[cyano][rep][boot] = tlviz.factor_tools.permute_cp_tensor(
                subset_aligned_cps[cyano][rep][boot],
                permutation=perm
            )
            # ... and then applied unchanged to the full (all samples) cp tensor
            rep_aligned_cps.append(tlviz.factor_tools.permute_cp_tensor(
                cps[cyano][rep][boot],
                permutation=perm
            ))
        aligned_cps[cyano][rep] = rep_aligned_cps

print(cps['syn']['A'][99])
print(aligned_cps['syn']['A'][99])
print(subset_aligned_cps['syn']['A'][99])


(weights, factors) : rank-15 CPTensor of shape (6161, 6, 73)
(weights, factors) : rank-15 CPTensor of shape (6161, 6, 73)
(weights, factors) : rank-15 CPTensor of shape (6161, 6, 55)


# Compile aligned model weights into a single xarray dataset

In [7]:
# compile aligned model weights into xarray.Datasets

for cyano in ['pro', 'syn']:
    # set up data structures; the weight lists are nested [bootstrap][replicate]
    component_labels = [f'component{i+1}' for i in np.arange(ranks[cyano])]
    component_weights = []
    ortholog_weights = []
    taxon_weights = []
    boot_sample_dfs = []
    for boot in bootstraps:
        # fetch shuffled tensor xr.DataSet; its path does not depend on the replicate,
        # so open it once per bootstrap, copy out what is needed and close it again
        with xr.open_dataset(datapath / f'{cyano}/bootstrap{boot}/dataset_bootstrap_{boot}.nc') as boot_ds:
            sample_id_df = boot_ds[['samplename', 'replicate']].to_pandas().reset_index()
            # labels of the last bootstrap are used as coordinates for all bootstraps
            ortholog_labels = boot_ds.ortholog.values
            clade_labels = boot_ds.clade.values

        boot_component_weights = []
        boot_ortholog_weights = []
        boot_taxon_weights = []
        rep_sample_dfs = []
        for rep in replicates:
            # fetch aligned cp tensor
            cp = aligned_cps[cyano][rep][boot]
            # add component weights to list
            boot_component_weights.append(cp.weights)
            # add gene weights to list (transposed so components come first)
            boot_ortholog_weights.append(cp.factors[0].T)
            # add taxon weights to list (transposed so components come first)
            boot_taxon_weights.append(cp.factors[1].T)
            # put sample weights into a pd.DataFrame
            rep_sample_df = pd.DataFrame(
                cp.factors[2], index=samplenames[cyano][rep][boot], columns=component_labels
            ).reset_index().rename(columns={'index': 'samplename'})
            rep_sample_df['replicate'] = rep
            rep_sample_dfs.append(rep_sample_df)
        component_weights.append(boot_component_weights)
        ortholog_weights.append(boot_ortholog_weights)
        taxon_weights.append(boot_taxon_weights)

        # merge sample id from xr.DataSet into pd.DataFrame; the left join keeps every
        # (samplename, replicate) row of the dataset, with NaN weights where no model
        # row matches
        boot_sample_df = pd.merge(
            left=sample_id_df,
            right=pd.concat(rep_sample_dfs),  # sample weights of all replicates
            on=['samplename', 'replicate'], how='left'
        )
        boot_sample_df['bootstrap'] = boot
        boot_sample_dfs.append(boot_sample_df)

    # concatenate sample weights of all bootstraps in a single pass
    sample_df = pd.concat(boot_sample_dfs)

    # compile everything into an xarray.Dataset
    ds = xr.Dataset(
        dict(
            componentweights=xr.DataArray(
                np.array(component_weights),
                coords=[bootstraps, replicates, component_labels],
                dims=['bootstrap', 'replicate', 'component']
            ),
            geneweights=xr.DataArray(
                np.array(ortholog_weights),
                coords=[bootstraps, replicates, component_labels, ortholog_labels],
                dims=['bootstrap', 'replicate', 'component', 'ortholog']
            ),
            taxonweights=xr.DataArray(
                np.array(taxon_weights),
                coords=[bootstraps, replicates, component_labels, clade_labels],
                dims=['bootstrap', 'replicate', 'component', 'clade']
            ),
            sampleweights=xr.DataArray.from_series(
                sample_df.melt(
                    id_vars=['bootstrap', 'replicate', 'samplename'],
                    value_vars=component_labels,
                    var_name='component',
                    value_name='weight'
                ).set_index(['bootstrap', 'replicate', 'component', 'samplename'])['weight']
            ),
            sample=xr.DataArray.from_series(
                sample_df.set_index(['bootstrap', 'replicate', 'samplename'])['sample']
            )
        )
    )

    # save Dataset as netCDF4 file (create the output directory on a fresh checkout)
    outdir.mkdir(parents=True, exist_ok=True)
    ds.to_netcdf(outdir / f'{cyano}-aligned-models.nc')

    # assign each dataset to its own variable
    # (this replaces the reference datasets bound to these names earlier)
    if cyano == 'pro':
        pro_ds = ds
    elif cyano == 'syn':
        syn_ds = ds

# examine Pro Dataset
pro_ds


In [8]:
# examine Syn Dataset (the compiled aligned-model Dataset bound to syn_ds in the previous cell)

syn_ds
