In [2]:
using DrWatson
@quickactivate "Doran_etal_2022"

using SPI
using Muon
using CSV, DataFrames
using Distances

In [3]:
# Load the UP7047 UniProt AnnData file and reassemble an `SVD` object from the three
# factors stored in it under obsm["LSVs"], uns["SVs"], and varm["RSVs"].
uniprot = readh5ad(datadir("exp_pro", "UP7047", "2020_02_UP7047.h5ad"));
UPusv = let
    U = uniprot.obsm["LSVs"][:, :]
    S = uniprot.uns["SVs"][:]
    V = uniprot.varm["RSVs"][:, :]
    SVD(U, S, V')  # the third factor is handed over as the adjoint of the stored matrix
end;
# Read the raw BB673 table into a DataFrame; cells spelled "NA" are parsed as `missing`.
bb = CSV.read(
    datadir("exp_raw", "BB673", "commensal_metabolomics_ogg_matrix_20220307.csv"),
    DataFrame;
    missingstring="NA",
);

In [4]:
# Build the sample identifiers from `unique_id`: strip square brackets, then turn every
# space into an underscore.
BBIDS = replace.(replace.(bb.unique_id, r"\[|\]" => ""), " " => "_");

# The table is split by column position.
# NOTE(review): what each range holds is inferred from the variable names and the
# modality keys used later (taxonomy, fold-change metabolites, millimolar metabolites,
# OGG counts) — confirm against the CSV header.
BBtaxa = bb[!, 2:7]
# Donor label: the second "."-separated field of the identifier (or the only field when
# the identifier contains no ".").
BBtaxa[!, :donor] = [last(first(split(id, "."), 2)) for id in BBIDS];
BBMfc = bb[:, 8:45]
BB4mM = bb[:, 46:49]
BBMmM = bb[!, 46:55]
BBOGG = bb[!, 56:end]

# Fill missing entries (1.0 for the 8:45 slice, 0.0 for the 46:55 and 46:49 slices) and
# pass each slice through `SPI.zscore`.
ZBBMfc = SPI.zscore(Matrix(coalesce.(BBMfc, 1.0)))
ZBBMmM = SPI.zscore(Matrix(coalesce.(BBMmM, 0.0)))
ZBB4mM = SPI.zscore(Matrix(coalesce.(BB4mM, 0.0)))

# Column names of the 56:end slice; used later as variable names and for matching
# against the UniProt variable names.
bbgenenames = names(BBOGG);

In [5]:
# One AnnData per measurement block; all of them use the sample identifiers as obs names.
biobank_ogg = AnnData(; X=Matrix(BBOGG), obs_names=BBIDS, var_names=bbgenenames)

# Z-scored values go in X; the filled but un-normalized values are kept as the "raw" layer.
biobank_metabolite_rel = AnnData(; X=ZBBMfc, obs_names=BBIDS, var_names=names(BBMfc))
biobank_metabolite_rel.layers["raw"] = coalesce.(Matrix(BBMfc), 1.0)

biobank_metabolite_mM = AnnData(; X=ZBBMmM, obs_names=BBIDS, var_names=names(BBMmM))
biobank_metabolite_mM.layers["raw"] = coalesce.(Matrix(BBMmM), 0.0)
# Record the column names of the 46:49 slice so that subset can be recovered later.
biobank_metabolite_mM.uns["4keymetabolites"] = names(BB4mM)

# Bundle the three modalities into a single MuData container.
biobank = MuData(;
    mod=Dict(
        "oggs" => biobank_ogg,
        "metabolites_foldchange" => biobank_metabolite_rel,
        "metabolites_millimolar" => biobank_metabolite_mM,
    ),
)
# Sample-level annotations, with missing entries replaced by empty strings.
biobank.obs = coalesce.(BBtaxa, "")
biobank

MuData object 673 ✕ 11296
└ metabolites_foldchange
  AnnData object 673 ✕ 38
└ metabolites_millimolar
  AnnData object 673 ✕ 10
└ oggs
  AnnData object 673 ✕ 11248

In [6]:
# Display the column names held in `BB4mM` (the 46:49 slice of the raw table).
names(BB4mM)

4-element Vector{String}:
 "Butyrate_mM"
 "Propionate_mM"
 "Succinate_mM"
 "Acetate_mM"

In [7]:
# Match BioBank measurements to the column order of UniProt.
# Start from an all-zero matrix with one row per BioBank sample and one column per UniProt
# variable; UniProt columns without a BioBank counterpart stay zero.
bbmtx = zeros(size(BBOGG, 1), length(uniprot.var_names.vals));
# For each BioBank gene name: its position among the UniProt variable names, or `nothing`
# when the name does not occur there.
rawidxs = indexin(bbgenenames, uniprot.var_names.vals)
# Boolean mask over BioBank columns that have a match in UniProt.
bbmask = .!isnothing.(rawidxs)
# Destination column positions in UniProt order. Despite the name these are integer
# indices, not a Boolean mask. The `nothing` entries are dropped and the result is
# narrowed to a concretely typed `Vector{Int}` so it is a plain index vector.
uporderedmask = Vector{Int}(filter(!isnothing, rawidxs));
# Copy the matched BioBank columns into their UniProt positions. The DataFrame slice is
# converted to a Matrix explicitly instead of relying on DataFrame broadcasting.
bbmtx[:, uporderedmask] .= Matrix(BBOGG[:, bbmask]);

In [8]:
# Sanity check: print the dimensions of the UniProt-ordered BioBank matrix.
@show size(bbmtx);

size(bbmtx) = (673, 10177)


In [9]:
# Project the UniProt-ordered BioBank matrix using the UniProt SVD via `projectinLSV`.
# NOTE(review): presumably yields one row of left-singular-vector coordinates per BioBank
# sample (the result is later stored under obsm) — confirm against the SPI documentation.
BBUhat = projectinLSV(bbmtx, UPusv);

In [10]:
# Compute the sample-by-sample SPI matrix from the projected samples, the UniProt singular
# values, and the intervals that `getintervals` derives from those singular values.
# `@time` reports elapsed time and allocations; a first run also includes compilation.
@time BBDij = calc_spi_mtx(BBUhat, UPusv.S, getintervals(UPusv.S));

  4.133192 seconds (4.23 M allocations: 6.234 GiB, 28.41% gc time, 31.46% compilation time)


In [14]:
# Hierarchically cluster the samples on the SPI matrix, using average linkage and the
# optimal branch ordering option.
bbhc = hclust(BBDij, linkage=:average, branchorder=:optimal);

In [15]:
# Serialize the clustering as a tree string whose tips are labelled with the sample
# identifiers; internal nodes are left unlabelled.
# NOTE(review): presumably Newick format, judging by the variable name — confirm in SPI.
newicktreestring = nwstr(bbhc, BBIDS; labelinternalnodes=false);

In [13]:
# Pairwise Euclidean distances between samples (rows of the OGG matrix). `dims=1` is
# passed explicitly: calling `pairwise` on a matrix without `dims` relies on a deprecated
# implicit `dims=2`, which is why the matrix previously had to be transposed first.
BBoggeuclidean = Distances.pairwise(Euclidean(), biobank["oggs"].X[:, :]; dims=1);
# Average-linkage hierarchical clustering of the samples on those distances, with the
# optimal branch ordering option.
BBoggeuclidean_hc = hclust(BBoggeuclidean, linkage=:average, branchorder=:optimal);

In [16]:
# Attach the UP7047-based results to the "oggs" modality: projected coordinates (obsm),
# the sample-by-sample SPI matrix (obsp), and the tree string (uns).
let oggs = biobank["oggs"]
    oggs.obsm["inferredLSVs_UP7047"] = BBUhat
    oggs.obsp["inferredSPIdist_UP7047"] = BBDij
    oggs.uns["inferrednewicktree_UP7047"] = newicktreestring
end;

In [17]:
# Attach the Euclidean-distance results to the "oggs" modality: the distance matrix (obsp)
# and the tree string built from its clustering, tips labelled by sample identifier (uns).
let oggs = biobank["oggs"]
    oggs.obsp["ogg_euclidean_dij"] = BBoggeuclidean
    oggs.uns["ogg_euclidean_newicktreestring"] = nwstr(
        BBoggeuclidean_hc, BBIDS; labelinternalnodes=false
    )
end;

In [18]:
# Make sure the processed-data directory for BB673 exists, then write the MuData object
# there as an .h5mu file.
let outdir = datadir("exp_pro", "BB673")
    mkpath(outdir)
    writeh5mu(joinpath(outdir, "BB673.h5mu"), biobank)
end